In [1]:
import pandas as pd
import sqlite3
import requests
import time
import json
import tarfile
import os
import re
from tqdm import tqdm
from typing import List, Dict, Optional, Tuple
from urllib.parse import urlencode
import logging

In [2]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [None]:
def setup_database(conn):
    """Create all necessary tables in the SQLite database."""
    cursor = conn.cursor()
    
    # GO annotations table
    # geneProductId, goId, goName, goAspect, goEvidence, evidenceCode, qualifier, assignedBy, date
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS uniprot_quickgo_annotations (
            uniprot_id TEXT NOT NULL,
            go_id TEXT NOT NULL,
            go_name TEXT,
            go_aspect TEXT,
            go_evidence TEXT,
            evidence_code TEXT,
            qualifier TEXT,
            assigned_by TEXT,
            date_created TEXT,
            UNIQUE(uniprot_id, go_id)
        )
    ''')
    
    # InterPro to GO mapping table
    # InterPro:IPR000009 Protein phosphatase 2A regulatory subunit PR55 > GO:protein phosphatase type 2A complex ; GO:0000159
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS interpro_go_mapping (
            interpro_id TEXT NOT NULL,
            interpro_description TEXT,
            go_id TEXT NOT NULL,
            go_name TEXT,
            UNIQUE(interpro_id, go_id)
        )
    ''')
    
    # PROSITE entries table
    # Uniprot entry|uniprot id/sequence range: Prosite entry name| prosite entry id/score. aligned sequence.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS prosite_entries (
            prosite_id TEXT NOT NULL,
            uniprot_id TEXT NOT NULL,
            prosite_name TEXT,
            uniprot_name TEXT,
            sequence_start INTEGER,
            sequence_end INTEGER,
            score REAL,
            sequence TEXT,
            aligned_sequence TEXT,
            UNIQUE(prosite_id, uniprot_id, sequence_start, sequence_end)
        )
    ''')

    # InterPro to GO mapping table
    # PROSITE:PS00027 HOMEOBOX_1 > GO:regulation of DNA-templated transcription ; GO:0006355
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS prosite_go_mapping (
            prosite_id TEXT NOT NULL,
            prosite_description TEXT,
            go_id TEXT NOT NULL,
            go_name TEXT,
            UNIQUE(prosite_id, go_id)
        )
    ''')
    
    conn.commit()
    logger.info("Database tables created successfully")

In [None]:
def fetch_go_annotations(conn, uniprot_ids: List[str], batch_size: int = 100):
    """
    Fetch GO annotations for UniProt IDs using QuickGO API.
    
    Args:
        uniprot_ids: List of UniProt IDs
        batch_size: Number of IDs to process in each batch
    """
    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    
    # Process in batches to avoid overwhelming the API
    for i in range(0, len(uniprot_ids), batch_size):
        page = 1
        while True:
            batch = uniprot_ids[i:i + batch_size]
            logger.info(f"Processing batch {i//batch_size + 1}, IDs {i+1}-{min(i+batch_size, len(uniprot_ids))}")
            
            # Prepare API parameters
            params = {
                'geneProductId': ','.join(batch),
                'limit': 200,  # Maximum results per request
                'includeFields': 'goName',
                'page': page
            }
            
            try:
                response = requests.get(base_url, params=params, timeout=30)
                response.raise_for_status()
                
                data = response.json()
                annotations = data.get('results', [])
                
                # Insert annotations into database
                cursor = conn.cursor()
                for annotation in annotations:
                    geneProductId = annotation.get('geneProductId', '')
                    geneProductId = geneProductId.split(":")[1] if ':' in geneProductId else geneProductId
                    cursor.execute('''
                        INSERT OR IGNORE INTO uniprot_quickgo_annotations 
                        (uniprot_id, go_id, go_name, go_aspect, go_evidence, evidence_code, qualifier, assigned_by, date_created)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                        ''', (
                        geneProductId,
                        annotation.get('goId', ''),
                        annotation.get('goName', ''),
                        annotation.get('goAspect', ''),
                        annotation.get('goEvidence', ''),
                        annotation.get('evidenceCode', ''),
                        annotation.get('qualifier', ''),
                        annotation.get('assignedBy', ''),
                        annotation.get('date', '')
                    ))

                conn.commit()
                logger.info(f"Inserted {len(annotations)} annotations from batch")
                
                # Rate limiting to be respectful to the API
                time.sleep(0.1)

                page += 1
                
            except requests.RequestException as e:
                logger.error(f"Error fetching batch {i//batch_size + 1}: {e}")
                break
            except Exception as e:
                logger.error(f"Unexpected error in batch {i//batch_size + 1}: {e}")
                break

In [63]:
def parse_interpro_go_mapping(conn, file_url: str = "https://current.geneontology.org/ontology/external2go/interpro2go"):
    """
    Download and parse InterPro to GO mapping file.
    
    Args:
        file_url: URL of the interpro2go file
    """
    logger.info("Downloading InterPro to GO mapping file")
    
    try:
        response = requests.get(file_url, timeout=60)
        response.raise_for_status()
        
        lines = response.text.strip().split('\n')
        cursor = conn.cursor()
        
        # Pattern to parse lines: InterPro:IPR000009 Description > GO:description ; GO:0000159
        pattern = r'InterPro:(IPR\d+)\s+(.+?)\s+>\s+GO:(.+?)\s+;\s+(GO:\d+)'
        
        parsed_count = 0
        for line in tqdm(lines):
            line = line.strip()
            if line.startswith('!') or not line:  # Skip comments and empty lines
                continue
            
            match = re.match(pattern, line)
            if match:
                interpro_id = match.group(1)
                interpro_desc = match.group(2).strip()
                go_desc = match.group(3).strip()
                go_term = match.group(4)

                # InterPro to GO mapping table
                cursor.execute('''
                    INSERT OR IGNORE INTO interpro_go_mapping 
                    (interpro_id, interpro_description, go_id, go_name)
                    VALUES (?, ?, ?, ?)
                ''', (interpro_id, interpro_desc, go_term, go_desc))
                
                parsed_count += 1
            else:
                logger.warning(f"Could not parse line: {line}")
        
        conn.commit()
        logger.info(f"Parsed and inserted {parsed_count} InterPro-GO mappings")
        
    except requests.RequestException as e:
        logger.error(f"Error downloading InterPro mapping file: {e}")
    except Exception as e:
        logger.error(f"Error parsing InterPro mapping: {e}")

In [114]:
def download_and_extract_prosite(url: str = "https://ftp.expasy.org/databases/prosite/prosite_alignments.tar.gz", 
                                extract_dir: str = "/cta/share/users/prosite_data"):
    """
    Download and extract PROSITE alignments.
    
    Args:
        url: URL of the PROSITE alignments tar.gz file
        extract_dir: Directory to extract files to
    """
    logger.info("Downloading PROSITE alignments")
    
    try:
        # Download the tar.gz file
        response = requests.get(url, stream=True, timeout=300)
        response.raise_for_status()
        
        tar_filename = "prosite_alignments.tar.gz"
        with open(tar_filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        logger.info("Extracting PROSITE alignments")
        
        # Extract the tar.gz file
        with tarfile.open(tar_filename, 'r:gz') as tar:
            tar.extractall(path=extract_dir)
        
        # Clean up the downloaded file
        os.remove(tar_filename)
        
        return extract_dir
        
    except Exception as e:
        logger.error(f"Error downloading/extracting PROSITE data: {e}")
        return None

def parse_prosite_entry(filepath: str) -> List[Dict]:
    """
    Parse a single PROSITE MSA file.
    
    Args:
        filepath: Path to the .msa file
        
    Returns:
        List of parsed entries
    """
    entries = []
    
    try:
        with open(filepath, 'r') as f:
            content = f.read()
        
        # Split by entries (starting with >)
        entry_blocks = re.split(r'^>', content, flags=re.MULTILINE)[1:]  # Skip first empty element
        
        for block in entry_blocks:
            lines = block.strip().split('\n')
            if not lines:
                continue
            
            # Parse header line
            header = lines[0]
            # Pattern with optional score: 
            # With score: GIT2_RAT|Q66H91/1-124: ARFGAP|PS50115/23.341
            # Without score: MIC8_TOXGO|Q9BIM7/229-240: ASX_HYDROXYL|PS00010
            header_pattern = r'([^|]+)\|([^/]+)/(\d+)-(\d+):\s*([^|]+)\|([^/]+)(?:/([0-9.]+))?'
            
            match = re.match(header_pattern, header)
            if match:
                uniprot_name = match.group(1)
                uniprot_id = match.group(2)
                seq_start = int(match.group(3))
                seq_end = int(match.group(4))
                prosite_name = match.group(5)
                prosite_id = match.group(6)
                score = float(match.group(7)) if match.group(7) else None
                
                # Extract sequence (remove gaps and dots, join all sequence lines)
                sequence_lines = lines[1:]
                aligned_sequence = ''.join(sequence_lines).replace(' ', '')
                sequence = ''.join(sequence_lines).replace('-', '').replace('.', '').replace(' ', '').upper()
                
                entries.append({
                    'uniprot_name': uniprot_name,
                    'uniprot_id': uniprot_id,
                    'sequence_start': seq_start,
                    'sequence_end': seq_end,
                    'prosite_name': prosite_name,
                    'prosite_id': prosite_id,
                    'score': score,
                    'sequence': sequence,
                    'aligned_sequence': aligned_sequence
                })
            else:
                logger.warning(f"Could not parse header: {header}")

    except Exception as e:
        logger.error(f"Error parsing file {filepath}: {e}")
    
    return entries

def process_prosite_data(conn, extract_dir: str = "/cta/share/users/prosite_data/prosite_alignments"):
    """
    Process all PROSITE MSA files and insert into database.
    
    Args:
        extract_dir: Directory containing extracted PROSITE files
    """
    if not os.path.exists(extract_dir):
        logger.error(f"Extract directory {extract_dir} does not exist")
        return
    
    cursor = conn.cursor()
    total_entries = 0
    
    # Find all .msa files
    msa_files = [f for f in os.listdir(extract_dir) if f.endswith('.msa')]
    logger.info(f"Found {len(msa_files)} PROSITE MSA files")
    
    for i, filename in enumerate(msa_files):
        if i % 100 == 0:
            logger.info(f"Processing file {i+1}/{len(msa_files)}: {filename}")
        
        filepath = os.path.join(extract_dir, filename)
        entries = parse_prosite_entry(filepath) 

        # Insert entries into database
        for entry in entries:
            cursor.execute('''
                INSERT OR IGNORE INTO prosite_entries 
                (prosite_id, uniprot_id, prosite_name, uniprot_name, sequence_start, sequence_end, 
                    score, sequence, aligned_sequence)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                entry['prosite_id'],
                entry['uniprot_id'],
                entry['prosite_name'],
                entry['uniprot_name'],
                entry['sequence_start'],
                entry['sequence_end'],
                entry['score'],
                entry['sequence'],
                entry['aligned_sequence']
            ))
        
        total_entries += len(entries)
        
        # Commit every 100 files
        if i % 100 == 0:
            conn.commit()
    
    conn.commit()
    logger.info(f"Processed {total_entries} PROSITE entries from {len(msa_files)} files")
    

In [None]:
def parse_prosite_go_mapping(conn, file_url: str = "https://current.geneontology.org/ontology/external2go/prosite2go"):
    """
    Download and parse Prosite to GO mapping file.
    
    Args:
        file_url: URL of the prosite2go file
    """
    logger.info("Downloading Prosite to GO mapping file")
    
    try:
        response = requests.get(file_url, timeout=60)
        response.raise_for_status()
        
        lines = response.text.strip().split('\n')
        cursor = conn.cursor()
        
        # Pattern to parse lines: PROSITE:PS00027 Description > GO:description ; GO:0000159
        pattern = r'PROSITE:(PS\d+)\s+(.+?)\s+>\s+GO:(.+?)\s+;\s+(GO:\d+)'
        
        parsed_count = 0
        for line in tqdm(lines):
            line = line.strip()
            if line.startswith('!') or not line:  # Skip comments and empty lines
                continue
            
            match = re.match(pattern, line)
            if match:
                prosite_id = match.group(1)
                prosite_description = match.group(2).strip()
                go_desc = match.group(3).strip()
                go_term = match.group(4)

                # Prosite to GO mapping table
                cursor.execute('''
                    INSERT OR IGNORE INTO prosite_go_mapping 
                    (prosite_id, prosite_description, go_id, go_name)
                    VALUES (?, ?, ?, ?)
                ''', (prosite_id, prosite_description, go_term, go_desc))
                
                parsed_count += 1
            else:
                logger.warning(f"Could not parse line: {line}")
        
        conn.commit()
        logger.info(f"Parsed and inserted {parsed_count} Prosite-GO mappings")
        
    except requests.RequestException as e:
        logger.error(f"Error downloading Prosite mapping file: {e}")
    except Exception as e:
        logger.error(f"Error parsing Prosite mapping: {e}")

In [24]:
def download_file(url, filename):
    """
    Downloads a file from a given URL and saves it to the specified filename.
    Raises an exception if the download fails.
    """
    print(f"Attempting to download {url} to {filename}...")
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Successfully downloaded {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        raise # Re-raise the exception to stop further processing

def parse_prosite_dat(filepath):
    """
    Parses the Prosite .dat file, extracts specified fields for 'PATTERN' entries,
    and yields a dictionary for each valid entry.
    Handles multi-line 'PA' fields by concatenating them.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    print(f"Parsing {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as f:
        current_entry = {}
        
        for line in f:
            # Remove newline character but keep leading spaces for parsing logic
            line = line.rstrip('\n')

            if line.startswith('//'):
                # End of a record. Process the accumulated data if it's a PATTERN entry.
                if current_entry and 'ID' in current_entry and '; PATTERN.' in current_entry['ID']:
                    # Clean and format the extracted data as per requirements
                    prosite_name = current_entry.get('ID', '').replace('; PATTERN.', '').strip()
                    prosite_id = current_entry.get('AC', '').replace(';', '').strip()
                    # Take only the first date part from DT line (e.g., "01-APR-1990")
                    date = current_entry.get('DT', '').strip()
                    description = current_entry.get('DE', '').strip()[:-1]
                    # Join all collected 'PA' parts with a space to form the complete pattern
                    pattern = ''.join(current_entry.get('PA', [])).strip()[:-1]

                    yield {
                        'prosite_name': prosite_name,
                        'prosite_id': prosite_id,
                        'date': date,
                        'description': description,
                        'pattern': pattern
                    }
                # Reset for the next record
                current_entry = {}
            elif line.startswith('ID   '):
                current_entry['ID'] = line[5:].strip()
            elif line.startswith('AC   '):
                current_entry['AC'] = line[5:].strip()
            elif line.startswith('DT   '):
                current_entry['DT'] = line[5:].strip()
            elif line.startswith('DE   '):
                current_entry['DE'] = line[5:].strip()
            elif line.startswith('PA   '):
                # 'PA' lines can be multi-line for a single pattern.
                # Store them in a list to be joined later.
                if 'PA' not in current_entry:
                    current_entry['PA'] = []
                current_entry['PA'].append(line[5:].strip())
            # Other lines (like CC, PR, DO, MA, etc.) are ignored as per requirements.

        # After the loop, process any remaining entry if the file doesn't end with '//'
        if current_entry and 'ID' in current_entry and '; PATTERN.' in current_entry['ID']:
            prosite_name = current_entry.get('ID', '').replace('; PATTERN.', '').strip()
            prosite_id = current_entry.get('AC', '').replace(';', '').strip()
            date = current_entry.get('DT', '').strip()
            description = current_entry.get('DE', '').strip()[:-1]
            pattern = ''.join(current_entry.get('PA', [])).strip()[:-1]
            yield {
                'prosite_name': prosite_name,
                'prosite_id': prosite_id,
                'date': date,
                'description': description,
                'pattern': pattern
            }
    print("Finished parsing.")


def create_database(db_name):
    """
    Creates a SQLite database and the 'prosite_patterns' table if they don't exist.
    The 'prosite_id' is set as the PRIMARY KEY to prevent duplicate entries.
    Returns the connection and cursor objects.
    """
    print(f"Creating/connecting to database: {db_name}")
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS prosite_patterns (
            prosite_name TEXT,
            prosite_id TEXT PRIMARY KEY,
            date TEXT,
            description TEXT,
            pattern TEXT
        )
    ''')
    conn.commit()
    print("Database table 'prosite_patterns' ensured.")
    return conn, cursor

def insert_entry(cursor, entry):
    """
    Inserts a single pattern entry into the 'prosite_patterns' table.
    Handles potential IntegrityErrors (e.g., duplicate primary keys).
    """
    try:
        cursor.execute('''
            INSERT INTO prosite_patterns (prosite_name, prosite_id, date, description, pattern)
            VALUES (?, ?, ?, ?, ?)
        ''', (
            entry['prosite_name'],
            entry['prosite_id'],
            entry['date'],
            entry['description'],
            entry['pattern']
        ))
    except sqlite3.IntegrityError:
        print(f"Warning: Duplicate prosite_id '{entry.get('prosite_id', 'N/A')}' skipped.")
    except Exception as e:
        print(f"Error inserting entry {entry.get('prosite_id', 'N/A')}: {e}")

def create_prosite_pattern_db():
    """
    Main function to orchestrate the download, parsing, and database insertion.
    """
    url = "https://ftp.expasy.org/databases/prosite/prosite.dat"
    filename = "prosite.dat"
    db_name = "/cta/share/users/uniprot/human/human.db"

    # 1. Download the file
    try:
        download_file(url, filename)
    except Exception as e:
        print(f"Exiting due to download error: {e}")
        return

    # 2. Create database and table
    conn, cursor = create_database(db_name)

    # 3. Parse the file and insert entries into the database
    entry_count = 0
    try:
        for entry in parse_prosite_dat(filename):
            insert_entry(cursor, entry)
            entry_count += 1
            if entry_count % 100 == 0:
                conn.commit() # Commit periodically for large files to save progress
                print(f"Processed {entry_count} entries...")
        conn.commit() # Final commit for any remaining entries
        print(f"Successfully loaded {entry_count} PATTERN entries into {db_name}")
    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during parsing or insertion: {e}")
    finally:
        conn.close()
        print("Database connection closed.")

In [27]:
# Connect to DB
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)

In [4]:
uniprot_ids = list(pd.read_sql(f"SELECT Entry as uniprot_id FROM proteins", conn)['uniprot_id'])

In [None]:
# setup_database(conn)
# fetch_go_annotations(conn, uniprot_ids) --> Takes very long time. Run it in a screen with a script. 
# parse_interpro_go_mapping(conn)
# download_and_extract_prosite()
# process_prosite_data(conn)
# parse_prosite_go_mapping(conn)
# create_prosite_pattern_db()

In [None]:
pd.read_sql(f"SELECT * FROM uniprot_quickgo_annotations", conn)

Unnamed: 0,uniprot_id,go_id,go_name,go_aspect,go_evidence,evidence_code,qualifier,assigned_by,date_created
0,A0A024R1X5,GO:0007040,lysosome organization,biological_process,IEA,ECO:0000265,acts_upstream_of_or_within,Ensembl,20250313
1,A0A024R1X5,GO:0008285,negative regulation of cell population prolife...,biological_process,IEA,ECO:0000265,acts_upstream_of_or_within,Ensembl,20250313
2,A0A024R1X5,GO:0010507,negative regulation of autophagy,biological_process,IEA,ECO:0000265,acts_upstream_of_or_within,Ensembl,20250313
3,A0A024R1X5,GO:0010613,positive regulation of cardiac muscle hypertrophy,biological_process,IEA,ECO:0000265,acts_upstream_of_or_within,Ensembl,20250313
4,A0A024R1X5,GO:0048666,neuron development,biological_process,IEA,ECO:0000265,acts_upstream_of_or_within,Ensembl,20250313
...,...,...,...,...,...,...,...,...,...
1296713,X6RLR1,GO:0005869,dynactin complex,cellular_component,IEA,ECO:0000256,part_of,InterPro,20250429
1296714,X6RLT1,GO:0045892,negative regulation of DNA-templated transcrip...,biological_process,IEA,ECO:0000256,involved_in,InterPro,20250429
1296715,X6RLT1,GO:0005634,nucleus,cellular_component,IEA,ECO:0000501,located_in,UniProt,20250429
1296716,X6RLU5,GO:0006816,calcium ion transport,biological_process,IEA,ECO:0007322,involved_in,UniProt,20250428


In [127]:
pd.read_sql(f"SELECT * FROM interpro_go_mapping", conn)

Unnamed: 0,interpro_id,interpro_description,go_id,go_name
0,IPR000003,Retinoid X receptor/HNF4,GO:0003677,DNA binding
1,IPR000003,Retinoid X receptor/HNF4,GO:0003707,nuclear steroid receptor activity
2,IPR000003,Retinoid X receptor/HNF4,GO:0008270,zinc ion binding
3,IPR000003,Retinoid X receptor/HNF4,GO:0006355,regulation of DNA-templated transcription
4,IPR000003,Retinoid X receptor/HNF4,GO:0005634,nucleus
...,...,...,...,...
30199,IPR055345,"Large ribosomal subunit protein eL24-related, ...",GO:0003735,structural constituent of ribosome
30200,IPR055346,"SUF system FeS cluster assembly, SufBD",GO:0016226,iron-sulfur cluster assembly
30201,IPR055351,Urocanase,GO:0016153,urocanate hydratase activity
30202,IPR055438,Succinylglutamate desuccinylase/Aspartoacylase...,GO:0016788,"hydrolase activity, acting on ester bonds"


In [129]:
pd.read_sql(f"SELECT * FROM prosite_entries", conn)

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence
0,PS51299,P39679,HTH_APSES,MBP1_KLULA,6,112,20.833,IYSAKYSGVDVYEFIHPTGSIMKRKADNWVNATHILKAAKFPKAKR...,IYSAKYSGVDVYEFIHP--..-TGSIMKRKADNWVNATHILKAAKF...
1,PS51299,P33520,HTH_APSES,RES1_SCHPO,6,112,18.223,IHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRR...,IHKITYSGVEVFEYTI---..NGFPLMKRCHDNWLNATQILKIAEL...
2,PS51299,P36011,HTH_APSES,STUA_EMENI,129,235,18.080,RVTATLWEDEGSLCYQVEAKGVCVARREDNGMINGTKLLNVAGMTR...,RVTATLWEDEGSLCYQVEA..KGVCVARREDNGMINGTKLLNVAGM...
3,PS51299,P36093,HTH_APSES,PHD1_YEAST,186,292,16.870,RVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTR...,RVITTMWEDENTICYQVEA..NGISVVRRADNNMINGTKLLNVTKM...
4,PS51299,Q4X228,HTH_APSES,STUA_ASPFU,129,235,17.962,RVTATLWEDEGSLCYQVEAKGVCVARREDNHMINGTKLLNVAGMTR...,RVTATLWEDEGSLCYQVEA..KGVCVARREDNHMINGTKLLNVAGM...
...,...,...,...,...,...,...,...,...,...
620070,PS51486,Q8IVW6,REKLES,ARI3B_HUMAN,419,517,32.801,AALEQLRERLESGEPAEKKASRLSEEEQRLVQQAFQRNFFSMARQL...,AALEQLRERLESGEPAEKKASR........LSEEEQRLVQQAFQRN...
620071,PS51486,Q24573,REKLES,DRI_DROME,731,825,26.294,TTGGSVGHRHSSPVSTKKKGGAKPQSGGKDVPTEDKDASSSGKLNP...,TTGGSVGHRHSSPVSTKKKGGA........KPQSGGKDVPTEDK-D...
620072,PS51486,Q6GQD7,REKLES,ARI3A_XENLA,404,499,33.184,AALEQLREKLESGEPPEKKMALGSEEQQRIIQRTIQHNLLAMTAQL...,AALEQLREKLESGEPPEKKMAL........GSEEQQRIIQRTIQHN...
620073,PS51486,A6PWV5,REKLES,ARI3C_MOUSE,301,386,21.411,LASEATREKLAPEEPPEKRAVLMGPVDSPRLGAPPSFLPRGKAPLR...,LASEATREKLAPEEPPEKRAVL........MGPVDSPRLGAPPS--...


In [None]:
pd.read_sql(f"SELECT * FROM prosite_go_mapping", conn)

Unnamed: 0,prosite_id,prosite_description,go_id,go_name
0,PS00011,GLA_1,GO:0005509,calcium ion binding
1,PS00011,GLA_1,GO:0005576,extracellular region
2,PS00026,CHIT_BIND_I_1,GO:0008061,chitin binding
3,PS00027,HOMEOBOX_1,GO:0000981,"DNA-binding transcription factor activity, RNA..."
4,PS00027,HOMEOBOX_1,GO:0006355,regulation of DNA-templated transcription
...,...,...,...,...
2808,PS60025,CONANTOKIN,GO:0035792,host cell postsynaptic membrane
2809,PS60026,ERGTX,GO:0019870,potassium channel inhibitor activity
2810,PS60026,ERGTX,GO:0005576,extracellular region
2811,PS60028,SCORPION_CALCINE,GO:0019855,calcium channel inhibitor activity


In [22]:
pd.read_sql(f"SELECT * FROM prosite_patterns", conn)

Unnamed: 0,prosite_name,prosite_id,date,description,pattern
0,ASN_GLYCOSYLATION,PS00001,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,N-glycosylation site,N-{P}-[ST]-{P}
1,CAMP_PHOSPHO_SITE,PS00004,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,cAMP- and cGMP-dependent protein kinase phosph...,[RK](2)-x-[ST]
2,PKC_PHOSPHO_SITE,PS00005,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,Protein kinase C phosphorylation site,[ST]-x-[RK]
3,CK2_PHOSPHO_SITE,PS00006,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,Casein kinase II phosphorylation site,[ST]-x(2)-[DE]
4,TYR_PHOSPHO_SITE_1,PS00007,01-APR-1990 CREATED; 08-MAY-2019 DATA UPDATE; ...,Tyrosine kinase phosphorylation site 1,[RK]-x(2)-[DE]-x(3)-Y
...,...,...,...,...,...
1306,SCORPION_CALCINE,PS60028,01-FEB-2006 CREATED; 01-FEB-2006 DATA UPDATE; ...,Scorpion calcine family signature,C-x(6)-C-x(5)-C-C-x(3)-C-x(9)-R-C
1307,SPIDER_CSTX,PS60029,01-FEB-2006 CREATED; 01-FEB-2006 DATA UPDATE; ...,Spider toxin CSTX family signature,C-{C}(6)-C-{C}(6)-C-C-{C}(8)-C-{C}-C
1308,BACTERIOCIN_IIA,PS60030,01-MAR-2006 CREATED; 01-MAR-2006 DATA UPDATE; ...,Bacteriocin class IIa family signature,Y-G-N-G-[VL]-x-C-x(4)-C
1309,GARP2,PS60031,01-MAR-2015 CREATED; 01-MAR-2015 DATA UPDATE; ...,Glutamic acid-rich protein 2 (GARP2) signature,C-D-V-Q-T-R-[VAE]-[MVT]-[GA]-A-G-[GS]-L


In [25]:
conn.close()

In [28]:
df_protein = pd.read_sql(f"SELECT Entry as uniprot_id, Sequence as sequence FROM proteins WHERE Entry IN (SELECT uniprot_accession FROM uniref50_distilled)", conn)
df_protein = df_protein[df_protein['sequence'].str.len() < 3000].reset_index(drop=True)
df_protein

Unnamed: 0,uniprot_id,sequence
0,A0A087WZT3,MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS
1,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
2,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...
3,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...
4,A0A0C5B5G6,MRWQEMGYIFYPRKLR
...,...,...
70687,X6RL83,MLQEWLAAVGDDYAAVVWRPEGEPRFYPDEEGPKHWTKERHQFLME...
70688,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...
70689,X6RLR1,MAGLTDLQRLQARVEELERWVYGPGGARGSRKVADGLVKVQVALGN...
70690,X6RLV5,MSGYSSDRDRGRDRGFGAPRFGGSRAGPLSGKKFGNPGEKLVKKKW...


In [31]:
df_prosite_entries = pd.read_sql(f"SELECT * FROM prosite_entries", conn)
df_prosite_entries

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence
0,PS51299,P39679,HTH_APSES,MBP1_KLULA,6,112,20.833,IYSAKYSGVDVYEFIHPTGSIMKRKADNWVNATHILKAAKFPKAKR...,IYSAKYSGVDVYEFIHP--..-TGSIMKRKADNWVNATHILKAAKF...
1,PS51299,P33520,HTH_APSES,RES1_SCHPO,6,112,18.223,IHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRR...,IHKITYSGVEVFEYTI---..NGFPLMKRCHDNWLNATQILKIAEL...
2,PS51299,P36011,HTH_APSES,STUA_EMENI,129,235,18.080,RVTATLWEDEGSLCYQVEAKGVCVARREDNGMINGTKLLNVAGMTR...,RVTATLWEDEGSLCYQVEA..KGVCVARREDNGMINGTKLLNVAGM...
3,PS51299,P36093,HTH_APSES,PHD1_YEAST,186,292,16.870,RVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTR...,RVITTMWEDENTICYQVEA..NGISVVRRADNNMINGTKLLNVTKM...
4,PS51299,Q4X228,HTH_APSES,STUA_ASPFU,129,235,17.962,RVTATLWEDEGSLCYQVEAKGVCVARREDNHMINGTKLLNVAGMTR...,RVTATLWEDEGSLCYQVEA..KGVCVARREDNHMINGTKLLNVAGM...
...,...,...,...,...,...,...,...,...,...
620070,PS51486,Q8IVW6,REKLES,ARI3B_HUMAN,419,517,32.801,AALEQLRERLESGEPAEKKASRLSEEEQRLVQQAFQRNFFSMARQL...,AALEQLRERLESGEPAEKKASR........LSEEEQRLVQQAFQRN...
620071,PS51486,Q24573,REKLES,DRI_DROME,731,825,26.294,TTGGSVGHRHSSPVSTKKKGGAKPQSGGKDVPTEDKDASSSGKLNP...,TTGGSVGHRHSSPVSTKKKGGA........KPQSGGKDVPTEDK-D...
620072,PS51486,Q6GQD7,REKLES,ARI3A_XENLA,404,499,33.184,AALEQLREKLESGEPPEKKMALGSEEQQRIIQRTIQHNLLAMTAQL...,AALEQLREKLESGEPPEKKMAL........GSEEQQRIIQRTIQHN...
620073,PS51486,A6PWV5,REKLES,ARI3C_MOUSE,301,386,21.411,LASEATREKLAPEEPPEKRAVLMGPVDSPRLGAPPSFLPRGKAPLR...,LASEATREKLAPEEPPEKRAVL........MGPVDSPRLGAPPS--...


In [32]:
df_prosite_patterns = pd.read_sql(f"SELECT * FROM prosite_patterns", conn)
df_prosite_patterns

Unnamed: 0,prosite_name,prosite_id,date,description,pattern
0,ASN_GLYCOSYLATION,PS00001,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,N-glycosylation site,N-{P}-[ST]-{P}
1,CAMP_PHOSPHO_SITE,PS00004,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,cAMP- and cGMP-dependent protein kinase phosph...,[RK](2)-x-[ST]
2,PKC_PHOSPHO_SITE,PS00005,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,Protein kinase C phosphorylation site,[ST]-x-[RK]
3,CK2_PHOSPHO_SITE,PS00006,01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; ...,Casein kinase II phosphorylation site,[ST]-x(2)-[DE]
4,TYR_PHOSPHO_SITE_1,PS00007,01-APR-1990 CREATED; 08-MAY-2019 DATA UPDATE; ...,Tyrosine kinase phosphorylation site 1,[RK]-x(2)-[DE]-x(3)-Y
...,...,...,...,...,...
1306,SCORPION_CALCINE,PS60028,01-FEB-2006 CREATED; 01-FEB-2006 DATA UPDATE; ...,Scorpion calcine family signature,C-x(6)-C-x(5)-C-C-x(3)-C-x(9)-R-C
1307,SPIDER_CSTX,PS60029,01-FEB-2006 CREATED; 01-FEB-2006 DATA UPDATE; ...,Spider toxin CSTX family signature,C-{C}(6)-C-{C}(6)-C-C-{C}(8)-C-{C}-C
1308,BACTERIOCIN_IIA,PS60030,01-MAR-2006 CREATED; 01-MAR-2006 DATA UPDATE; ...,Bacteriocin class IIa family signature,Y-G-N-G-[VL]-x-C-x(4)-C
1309,GARP2,PS60031,01-MAR-2015 CREATED; 01-MAR-2015 DATA UPDATE; ...,Glutamic acid-rich protein 2 (GARP2) signature,C-D-V-Q-T-R-[VAE]-[MVT]-[GA]-A-G-[GS]-L


In [None]:
df_prosite_entries[df_prosite_entries['uniprot_id'].isin(df_protein['uniprot_id'].unique())]

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence
295,PS01199,P62906,RIBOSOMAL_L1,RL10A_HUMAN,117,136,,IKQIPRILGPGLNKAGKFPS,IkqIpriLGpgLNKAGkFPS
999,PS50069,Q9UJX6,CULLIN_2,ANC2_HUMAN,502,700,27.386,ISLLVSIYGSKDLFINEYRSLLADRLLHQFSFSPEREIRNVELLKL...,-----------------...............--------------...
1004,PS50069,Q13617,CULLIN_2,CUL2_HUMAN,386,618,61.360,KAPELLAKYCDNLLKKSAKGMTENEVEDRLTSFITVFKYIDDKDVF...,KAPELLAKYCDNLLKKS...............AKGMTENEVEDRLT...
1008,PS50069,Q13620,CULLIN_2,CUL4B_HUMAN,558,786,63.828,KPAELIAKYVDSKLRAGNKEATDEELEKMLDKIMIIFRFIYGKDVF...,KPAELIAKYVDSKLRAG...............NKEATDEELEKMLD...
1009,PS50069,Q14999,CULLIN_2,CUL7_HUMAN,1215,1506,30.866,HVSEQFARHIDQQIQGSRIGGAQEMERLAQLQQCLQAVLIFSGLEI...,HVSEQFARHIDQQIQGSri...........ggAQEMERLAQLQQCL...
...,...,...,...,...,...,...,...,...,...
620055,PS51321,P23193,TFIIS_CENTRAL,TCEA1_HUMAN,140,256,41.067,VRLKCREMLAAALRTGDDYIAIGADEEELGSQIEEAIYQEIRNTDM...,VRLKCREMLAAALR...................................
620056,PS51321,Q6ZMY3,TFIIS_CENTRAL,SPOC1_HUMAN,608,728,33.705,VRGTVVRSMQEVLWTRLRELPDPVLSEEVVEGIAAGIEAALWDLTQ...,VRGTVVRSMQEVLWtr.................................
620063,PS51486,Q99856,REKLES,ARI3A_HUMAN,444,541,34.376,AALEQLREKLESAEPPEKKMALVADEQQRLMQRALQQNFLAMAAQL...,AALEQLREKLESAEPPEKKMAL........VADEQQRLMQRALQQN...
620068,PS51486,A6NKF2,REKLES,ARI3C_HUMAN,304,389,21.297,LALGPTREKLAPEEPPEKRAVLMGPMDPPRPCMPPSFLPRGKVPLR...,LALGPTREKLAPEEPPEKRAVL........MGPMDPPRPCMPPS--...


In [None]:
df_prosite_entries[df_prosite_entries['prosite_id'].isin(df_prosite_patterns['prosite_id'].unique())]

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence
33,PS01199,Q6KIF1,RIBOSOMAL_L1,RL1_MYCM1,126,144,,MPTLGKYGKVLGPKGLMPN,MptLgk.YGkvLGPKGlMPN
34,PS01199,Q8F0R9,RIBOSOMAL_L1,RL1_LEPIN,117,135,,MKDVGKLGPILGRKGLMPK,MkdVgk.LGpiLGRKGlMPK
35,PS01199,Q72UB1,RIBOSOMAL_L1,RL1_LEPIC,117,135,,MKDVGKLGPILGRKGLMPK,MkdVgk.LGpiLGRKGlMPK
36,PS01199,P0CX43,RIBOSOMAL_L1,RL1A_YEAST,117,136,,IKQVPRLLGPQLSKAGKFPT,IkqVprlLGpqLSKAGkFPT
37,PS01199,P0CX44,RIBOSOMAL_L1,RL1B_YEAST,117,136,,IKQVPRLLGPQLSKAGKFPT,IkqVprlLGpqLSKAGkFPT
...,...,...,...,...,...,...,...,...,...
619984,PS00661,Q920B0,FERM_2,FRM4B_MOUSE,230,259,,YLKIKGLTRGQAVVQYMKIVEALPTYGVHY,YlkikgltrgQAvvqYMki.VeaLptYGvHY
619985,PS00661,Q71LX4,FERM_2,TLN2_MOUSE,286,315,,HKNCGEMSEIEAKVKYVKLARSLRTYGVSF,HkncgemseiEAkvkYVkl.ArsLrtYGvSF
619986,PS00661,O70318,FERM_2,E41L2_MOUSE,372,401,,HKTHRGLSPAQADSQFLENAKRLSMYGVDL,HkthrglspaQAdsqFLen.AkrLsmYGvDL
619987,PS00661,Q9WU22,FERM_2,PTN4_MOUSE,192,221,,HQQHVGLSPAEAEFNYLNAARTLELYGVEF,HqqhvglspaEAefnYLna.ArtLelYGvEF


In [66]:
df = df_prosite_entries[(df_prosite_entries['uniprot_id'].isin(df_protein['uniprot_id'].unique())) & 
                        (df_prosite_entries['prosite_id'].isin(df_prosite_patterns['prosite_id'].unique()))]
df

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence
295,PS01199,P62906,RIBOSOMAL_L1,RL10A_HUMAN,117,136,,IKQIPRILGPGLNKAGKFPS,IkqIpriLGpgLNKAGkFPS
1193,PS01148,O00468,UPF0033,AGRIN_HUMAN,1121,1145,,LDAEGSNCPATKVFQGVLELEGVEG,LDaeGsnCPaTkVfqgvlelegveG
1195,PS00708,P48147,PRO_ENDOPEP_SER,PPCE_HUMAN,529,559,,DFQCAAEYLIKEGYTSPKRLTINGGSNGGLL,DfqcAaeyLikegytspkrltinGgSnGGLL
1210,PS00708,P13798,PRO_ENDOPEP_SER,ACPH_HUMAN,562,592,,DVQFAVEQVLQEEHFDASHVALMGGSHGGFI,DvqfAveqVlqeehfdashvalmGgShGGFI
1216,PS00708,P27487,PRO_ENDOPEP_SER,DPP4_HUMAN,605,635,,DQIEAARQFSKMGFVDNKRIAIWGWSYGGYV,DqieAarqFskmgfvdnkriaiwGwSyGGYV
...,...,...,...,...,...,...,...,...,...
619972,PS00661,Q86UX7,FERM_2,URP2_HUMAN,528,557,,HQNVAQLSLAEAQLRFIQAWQSLPDFGISY,HqnvaqlslaEAqlrFIqa.WqsLpdFGiSY
619973,PS00661,Q9P2Q2,FERM_2,FRM4A_HUMAN,191,220,,YKKLNGQTRGQAIVNYMSIVESLPTYGVHY,YkklngqtrgQAivnYMsi.VesLptYGvHY
619977,PS00661,Q9HCS5,FERM_2,E41LA_HUMAN,181,210,,HKTLMGQIPSEAELNYLRTAKSLEMYGVDL,HktlmgqipsEAelnYLrt.AksLemYGvDL
619978,PS00661,Q9Y2L6,FERM_2,FRM4B_HUMAN,230,259,,YLKIKGLTRGQAVVQYMKIVEALPTYGVHY,YlkikgltrgQAvvqYMki.VeaLptYGvHY


In [91]:
df['sequence_len'] = df['sequence'].str.len()
df['aligned_sequence_len'] = df['aligned_sequence'].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sequence_len'] = df['sequence'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['aligned_sequence_len'] = df['aligned_sequence'].str.len()


In [68]:
df['uniprot_id'].value_counts()

uniprot_id
P35555    127
P35556    125
Q04721    107
P46531    105
Q9UM47     92
         ... 
Q02846      1
O75343      1
P51828      1
Q02153      1
P25092      1
Name: count, Length: 6723, dtype: int64

In [180]:
df['prosite_id'].value_counts()[df['prosite_id'].value_counts() > 1]

prosite_id
PS00028    5970
PS01186     819
PS00022     688
PS00010     426
PS00237     422
           ... 
PS00613       2
PS00745       2
PS00935       2
PS00033       2
PS00739       2
Name: count, Length: 637, dtype: int64

In [188]:
df = df[df['prosite_id'].isin(list(df['prosite_id'].value_counts()[df['prosite_id'].value_counts() > 1].index))].reset_index(drop=True)

In [189]:
df

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence,sequence_len,aligned_sequence_len
0,PS00708,P48147,PRO_ENDOPEP_SER,PPCE_HUMAN,529,559,,DFQCAAEYLIKEGYTSPKRLTINGGSNGGLL,DfqcAaeyLikegytspkrltinGgSnGGLL,31,31
1,PS00708,P13798,PRO_ENDOPEP_SER,ACPH_HUMAN,562,592,,DVQFAVEQVLQEEHFDASHVALMGGSHGGFI,DvqfAveqVlqeehfdashvalmGgShGGFI,31,31
2,PS00708,P27487,PRO_ENDOPEP_SER,DPP4_HUMAN,605,635,,DQIEAARQFSKMGFVDNKRIAIWGWSYGGYV,DqieAarqFskmgfvdnkriaiwGwSyGGYV,31,31
3,PS00708,Q12884,PRO_ENDOPEP_SER,SEPR_HUMAN,599,629,,DQITAVRKFIEMGFIDEKRIAIWGWSYGGYV,DqitAvrkFiemgfidekriaiwGwSyGGYV,31,31
4,PS01010,P54107,CRISP_2,CRIS1_HUMAN,170,181,,LYVCHYCHEGND,LYvCHYcHeGND,12,12
...,...,...,...,...,...,...,...,...,...,...,...
17127,PS00661,Q86UX7,FERM_2,URP2_HUMAN,528,557,,HQNVAQLSLAEAQLRFIQAWQSLPDFGISY,HqnvaqlslaEAqlrFIqa.WqsLpdFGiSY,30,31
17128,PS00661,Q9P2Q2,FERM_2,FRM4A_HUMAN,191,220,,YKKLNGQTRGQAIVNYMSIVESLPTYGVHY,YkklngqtrgQAivnYMsi.VesLptYGvHY,30,31
17129,PS00661,Q9HCS5,FERM_2,E41LA_HUMAN,181,210,,HKTLMGQIPSEAELNYLRTAKSLEMYGVDL,HktlmgqipsEAelnYLrt.AksLemYGvDL,30,31
17130,PS00661,Q9Y2L6,FERM_2,FRM4B_HUMAN,230,259,,YLKIKGLTRGQAVVQYMKIVEALPTYGVHY,YlkikgltrgQAvvqYMki.VeaLptYGvHY,30,31


In [190]:
aa = df.groupby('prosite_id').agg({'sequence_len': list}).reset_index()
aa

Unnamed: 0,prosite_id,sequence_len
0,PS00010,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
1,PS00011,"[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2..."
2,PS00012,"[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
3,PS00014,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
4,PS00018,"[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1..."
...,...,...
632,PS01359,"[42, 77, 68, 83, 43, 50, 43, 45, 56, 45, 39, 8..."
633,PS01360,"[40, 38, 49, 37, 38, 41, 35, 37, 36, 45, 37, 3..."
634,PS40000,"[30, 30, 30, 30, 30, 30]"
635,PS60001,"[8, 8, 8]"


In [191]:
bb = df.groupby('prosite_id').agg({'aligned_sequence_len': list}).reset_index()
bb

Unnamed: 0,prosite_id,aligned_sequence_len
0,PS00010,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
1,PS00011,"[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2..."
2,PS00012,"[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
3,PS00014,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
4,PS00018,"[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1..."
...,...,...
632,PS01359,"[117, 117, 117, 117, 117, 117, 117, 117, 117, ..."
633,PS01360,"[52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 5..."
634,PS40000,"[32, 32, 32, 32, 32, 32]"
635,PS60001,"[8, 8, 8]"


In [192]:
bb

Unnamed: 0,prosite_id,aligned_sequence_len
0,PS00010,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
1,PS00011,"[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2..."
2,PS00012,"[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
3,PS00014,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
4,PS00018,"[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1..."
...,...,...
632,PS01359,"[117, 117, 117, 117, 117, 117, 117, 117, 117, ..."
633,PS01360,"[52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 5..."
634,PS40000,"[32, 32, 32, 32, 32, 32]"
635,PS60001,"[8, 8, 8]"


In [193]:
aa

Unnamed: 0,prosite_id,sequence_len
0,PS00010,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
1,PS00011,"[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2..."
2,PS00012,"[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
3,PS00014,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
4,PS00018,"[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1..."
...,...,...
632,PS01359,"[42, 77, 68, 83, 43, 50, 43, 45, 56, 45, 39, 8..."
633,PS01360,"[40, 38, 49, 37, 38, 41, 35, 37, 36, 45, 37, 3..."
634,PS40000,"[30, 30, 30, 30, 30, 30]"
635,PS60001,"[8, 8, 8]"


In [194]:
cc = aa[~aa['sequence_len'].apply(lambda lst: all(x==lst[0] for x in lst[1:]))]
cc

Unnamed: 0,prosite_id,sequence_len
7,PS00021,"[13, 13, 13, 14, 14, 14, 14, 13, 13, 13, 13, 1..."
9,PS00023,"[41, 42, 42, 42, 40, 42, 42, 42, 42, 42, 42, 4..."
10,PS00024,"[15, 16, 16, 16, 16, 16, 15, 16, 16, 16, 16, 1..."
11,PS00025,"[21, 22, 21, 21, 21, 21, 22, 22, 21]"
13,PS00028,"[22, 21, 21, 21, 22, 21, 21, 21, 21, 24, 24, 2..."
...,...,...
630,PS01357,"[28, 31, 28, 28, 27, 26, 27, 27, 26, 28, 28, 2..."
631,PS01358,"[20, 20, 20, 20, 20, 20, 20, 20, 20, 22, 22, 2..."
632,PS01359,"[42, 77, 68, 83, 43, 50, 43, 45, 56, 45, 39, 8..."
633,PS01360,"[40, 38, 49, 37, 38, 41, 35, 37, 36, 45, 37, 3..."


In [195]:
dd = df[~df['prosite_id'].isin(cc['prosite_id'].unique())]
dd

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence,sequence_len,aligned_sequence_len
0,PS00708,P48147,PRO_ENDOPEP_SER,PPCE_HUMAN,529,559,,DFQCAAEYLIKEGYTSPKRLTINGGSNGGLL,DfqcAaeyLikegytspkrltinGgSnGGLL,31,31
1,PS00708,P13798,PRO_ENDOPEP_SER,ACPH_HUMAN,562,592,,DVQFAVEQVLQEEHFDASHVALMGGSHGGFI,DvqfAveqVlqeehfdashvalmGgShGGFI,31,31
2,PS00708,P27487,PRO_ENDOPEP_SER,DPP4_HUMAN,605,635,,DQIEAARQFSKMGFVDNKRIAIWGWSYGGYV,DqieAarqFskmgfvdnkriaiwGwSyGGYV,31,31
3,PS00708,Q12884,PRO_ENDOPEP_SER,SEPR_HUMAN,599,629,,DQITAVRKFIEMGFIDEKRIAIWGWSYGGYV,DqitAvrkFiemgfidekriaiwGwSyGGYV,31,31
4,PS01010,P54107,CRISP_2,CRIS1_HUMAN,170,181,,LYVCHYCHEGND,LYvCHYcHeGND,12,12
...,...,...,...,...,...,...,...,...,...,...,...
17103,PS00452,Q08462,GUANYLATE_CYCLASE_1,ADCY2_HUMAN,1009,1032,,GVIGAQKPQYDIWGNTVNVASRMD,GVI.GaqkpqYdIWGNTVNvasrmD,24,25
17104,PS00452,O95622,GUANYLATE_CYCLASE_1,ADCY5_HUMAN,573,596,,GVLGLRKWQFDVWSNDVTLANHME,GVL.GlrkwqFdVWSNDVTlanhmE,24,25
17105,PS00452,O95622,GUANYLATE_CYCLASE_1,ADCY5_HUMAN,1187,1210,,GVIGARKPQYDIWGNTVNVASRMD,GVI.GarkpqYdIWGNTVNvasrmD,24,25
17106,PS00452,P25092,GUANYLATE_CYCLASE_1,GUC2C_HUMAN,931,954,,GVVGIKMPRYCLFGDTVNTASRME,GVV.GikmprYcLFGDTVNtasrmE,24,25


In [200]:
ff = dd[dd['aligned_sequence'].str.contains('\\.')].groupby('prosite_id').agg({'aligned_sequence': list}).reset_index()
ff

Unnamed: 0,prosite_id,aligned_sequence
0,PS00011,"[EcyEEiCvyeearEvfenevvtde.FW, EclEErCsweearEyf..."
1,PS00031,"[CgvCg.Dratgf.HFnamtCegCkgFFrR, CaiCg.Drssgk.H..."
2,PS00054,"[MhGLimG.GLeVisitDnTPiPHN, LrALarS.GMkIgrieDvT..."
3,PS00067,"[EvaGFVlNRlqyAIIseawr.LVeeG, NcfGFVgNRmlnPYYnq..."
4,PS00112,"[CP.SNLGT, CP.SNLGT, CP.SNLGT, CP.SNLGT]"
5,PS00180,"[LDGSSgvlfpfy.DADtsM, FDGSStlqsegs.NSDmyL]"
6,PS00207,"[EYeLLCpDntrkp...VdkfkdChlArvpsHaVV, DFaLLClDg..."
7,PS00344,"[CvNCgals..TplWRRdgt.Ghyl...CNAC, CtNChttn..Tt..."
8,PS00375,[WlpQndLLghpmtrAFITHAGshGvyeSIcngv.PMvmmPlfgDQ...
9,PS00396,"[EkLYTqgy.........ISYpRTE, ErLYTqgy.........IS..."


In [206]:
ff.iloc[15], ff.iloc[15]['aligned_sequence']

(prosite_id                                                    PS00495
 aligned_sequence    [CnkdIyvdldMkgiNYnssvaksaqeCqerCTddvhChFFtYatr...
 Name: 15, dtype: object,
 ['CnkdIyvdldMkgiNYnssvaksaqeCqerCTddvhChFFtYatrqfpslehrni.ClLKhTqtgtptritkldkvvSGfSLksC',
  'ChssFyhdtdFlgeELdivaaksheaCqklCTnavrCqFFtYtpaqascnegkgk.CyLKlSsngsptkilhgrggiSGyTLrlC',
  'CvtqLlkdtcFeggDIttvftpsakyCqvvCTyhprClLFtFtaespsedptrwftCvLKdSvtetlprvnrtaai.SGySFkqC',
  'CirdIfpntvFadsNIdsvmapdafvCgriCThhpgClFFtFfsqewpkesqrnl.ClLKtSesglpstrikkskalSGfSLqsC',
  'CltqLyenafFrggDVasmytpnaqyCqmrCTfhprClLFsFlpassindmekrfgCfLKdSvtgtlpkvhrtgav.SGhSLkqC',
  'ChrdIykgvdMrgvNFnvskvssveeCqkrCTsnirCqFFsYatqtfhkaeyrnn.ClLKySpggtptaikvlsnveSGfSLkpC',
  'ChmnIfqhlaFsdvDVarvltpdafvCrtiCTyhpnClFFtFytnvwkiesqrnv.ClLKtSesgtpssstpqentiSGySLltC',
  'ChskIypgvdFggeELnvtfvkgvnvCqetCTkmirCqFFtYsllpedckeekck.CfLRlSmdgsptriaygtqgsSGySLrlC'])

In [203]:
ff['aligned_sequence'].apply(lambda lst: all([lst[0].index('.') == x.index('.') for x in lst[1:]]))


0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15    False
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
Name: aligned_sequence, dtype: bool

In [88]:
df

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence,sequence_len
295,PS01199,P62906,RIBOSOMAL_L1,RL10A_HUMAN,117,136,,IKQIPRILGPGLNKAGKFPS,IkqIpriLGpgLNKAGkFPS,20
1193,PS01148,O00468,UPF0033,AGRIN_HUMAN,1121,1145,,LDAEGSNCPATKVFQGVLELEGVEG,LDaeGsnCPaTkVfqgvlelegveG,25
1195,PS00708,P48147,PRO_ENDOPEP_SER,PPCE_HUMAN,529,559,,DFQCAAEYLIKEGYTSPKRLTINGGSNGGLL,DfqcAaeyLikegytspkrltinGgSnGGLL,31
1210,PS00708,P13798,PRO_ENDOPEP_SER,ACPH_HUMAN,562,592,,DVQFAVEQVLQEEHFDASHVALMGGSHGGFI,DvqfAveqVlqeehfdashvalmGgShGGFI,31
1216,PS00708,P27487,PRO_ENDOPEP_SER,DPP4_HUMAN,605,635,,DQIEAARQFSKMGFVDNKRIAIWGWSYGGYV,DqieAarqFskmgfvdnkriaiwGwSyGGYV,31
...,...,...,...,...,...,...,...,...,...,...
619972,PS00661,Q86UX7,FERM_2,URP2_HUMAN,528,557,,HQNVAQLSLAEAQLRFIQAWQSLPDFGISY,HqnvaqlslaEAqlrFIqa.WqsLpdFGiSY,30
619973,PS00661,Q9P2Q2,FERM_2,FRM4A_HUMAN,191,220,,YKKLNGQTRGQAIVNYMSIVESLPTYGVHY,YkklngqtrgQAivnYMsi.VesLptYGvHY,30
619977,PS00661,Q9HCS5,FERM_2,E41LA_HUMAN,181,210,,HKTLMGQIPSEAELNYLRTAKSLEMYGVDL,HktlmgqipsEAelnYLrt.AksLemYGvDL,30
619978,PS00661,Q9Y2L6,FERM_2,FRM4B_HUMAN,230,259,,YLKIKGLTRGQAVVQYMKIVEALPTYGVHY,YlkikgltrgQAvvqYMki.VeaLptYGvHY,30


In [109]:
df_prosite_patterns[df_prosite_patterns['prosite_id'] == 'PS60024']['pattern'].iloc[0]

'C-x(6)-C-x(6)-C-C-x(2)-C-x(2)-C-x-C-x(6)-C-x-C-x(6,9)-C'

In [209]:
df[df['prosite_id'] == 'PS60024']

Unnamed: 0,prosite_id,uniprot_id,prosite_name,uniprot_name,sequence_start,sequence_end,score,sequence,aligned_sequence,sequence_len,aligned_sequence_len
9250,PS60024,P42127,AGOUTI_1,ASIP_HUMAN,93,132,,CVATRNSCKPPAPACCDPCASCQCRFFRSACSCRVLSLNC,CvatrnsCkppapaCCdpCasCqCrffrsaCsCrvlsln...C,40,43
9251,PS60024,O00253,AGOUTI_1,AGRP_HUMAN,87,129,,CVRLHESCLGQQVPCCDPCATCYCRFFNAFCYCRKLGTAMNPC,CvrlhesClgqqvpCCdpCatCyCrffnafCyCrklgtamnpC,43,43


In [87]:
bb

Unnamed: 0,prosite_id,sequence_len
7,PS00021,"[13, 13, 13, 14, 14, 14, 14, 13, 13, 13, 13, 1..."
9,PS00023,"[41, 42, 42, 42, 40, 42, 42, 42, 42, 42, 42, 4..."
10,PS00024,"[15, 16, 16, 16, 16, 16, 15, 16, 16, 16, 16, 1..."
11,PS00025,"[21, 22, 21, 21, 21, 21, 22, 22, 21]"
13,PS00028,"[22, 21, 21, 21, 22, 21, 21, 21, 21, 24, 24, 2..."
...,...,...
915,PS01357,"[28, 31, 28, 28, 27, 26, 27, 27, 26, 28, 28, 2..."
916,PS01358,"[20, 20, 20, 20, 20, 20, 20, 20, 20, 22, 22, 2..."
917,PS01359,"[42, 77, 68, 83, 43, 50, 43, 45, 56, 45, 39, 8..."
918,PS01360,"[40, 38, 49, 37, 38, 41, 35, 37, 36, 45, 37, 3..."
