In [9]:
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

from src.pssm_scoring import *


In [2]:
try:
    import logomaker
    print("logomaker is installed.")
except ImportError:
    print("logomaker is not installed, installing now...")
    %pip install logomaker
    import logomaker



logomaker is installed.


In [3]:
standard_amino_acids = 'ACDEFGHIKLMNPQRSTVWY'  # 20 standard amino acids
list_standard_amino_acids = list(standard_amino_acids)

# generate the logo for a protein sequence
def generate_logo(sequence, sequence_scores, segment_length=50):
    sequence_length = len(sequence)

    # Loop through each segment of the sequence
    for start in range(0, sequence_length, segment_length):
        end = min(start + segment_length, sequence_length)  # Ensure we don't go out of bounds
        effective_segment_length = min(end-start, segment_length)


        # Extract the segment of the sequence and corresponding scores
        segment_sequence = sequence[start:end]
        segment_scores = sequence_scores[start:end]

        score_df = pd.DataFrame(index=range(effective_segment_length), columns=list_standard_amino_acids).fillna(0)

        # Map the pssm scores into the interval [0,1] using either logistic function, softmax, or minmax
        normalized_scores = logistic(segment_scores)
        # normalized_scores = softmax(sequence_scores)
        # normalized_scores = MinMax(sequence_scores)

        for i, (aa,score) in enumerate(zip(segment_sequence,normalized_scores)):
            score_df.at[i,aa] = score


        nn_logo = logomaker.Logo(score_df)

        # style using Logo methods
        nn_logo.style_spines(visible=False)
        nn_logo.style_spines(spines=['left','bottom'], visible=True)
        xs = np.arange(0, effective_segment_length,10)
        xs_labels = np.arange(start, end,10)

        # style using Axes methods
        nn_logo.ax.set_xlim([0, effective_segment_length-1])
        nn_logo.ax.set_xticks(xs)
        nn_logo.ax.set_xticklabels(xs_labels.astype(str).tolist())
        nn_logo.ax.set_ylim([0, 1])
        nn_logo.ax.set_yticks([0, 0.25,.5,0.75 , 1])
        nn_logo.ax.set_yticklabels(['0', '0.25','0.5', '0.75', '1'])
        nn_logo.ax.set_ylabel('Sorting signal importance', labelpad=-1)

        plt.show()



def logistic(x):
    return 1/(1+np.exp(-x))


def softmax(x):
    e_x = np.exp(x-np.max(x))
    return e_x / e_x.sum(axis=0)

def MinMax(x):
    x_min = np.min(x)
    x_max = np.max(x)
    return (x - x_min) / (x_max - x_min)

In [4]:
# compute the pssm scores at each location of the protein sequence and return the locations and scores above a threshold

def compute_scores(pssm, sequence, threshold=2):
    
    sequence_scores = pssm.calculate(sequence)
    results = pssm.search(sequence,threshold)
    return sequence_scores, results



In [10]:
# Load the nuclear pssm from the file
with open('data/nls_pssm.pkl', 'rb') as f:
    nls_pssm = pickle.load(f)

ModuleNotFoundError: No module named 'pssm_scoring'

In [6]:
# Example 1
sequence = """MSNVNLSVSDFWRVMMRVCWLVRQDSRHQRIRLPHLEAVVIGRGPETKITDKKCSRQQVQ
LKAECNKGYVKVKQVGVNPTSIDSVVIGKDQEVKLQPGQVLHMVNELYPYIVEFEEEAKN
PGLETHRKRKRSGNSDSIERDAAQEAEAGTGLEPGSNSGQCSVPLKKGKDAPIKKESLGH
WSQGLKISMQDPKMQVYKDEQVVVIKDKYPKARYHWLVLPWTSISSLKAVAREHLELLKH
MHTVGEKVIVDFAGSSKLRFRLGYHAIPSMSHVHLHVISQDFDSPCLKNKKHWNSFNTEY
FLESQAVIEMVQEAGRVTVRDGMPELLKLPLRCHECQQLLPSIPQLKEHLRKHWTQ"""

sequence = sequence.replace('\n','')

"""
compute the pssm scores for each location of the protein sequence and search for locations whose scores are higher than
the threshold
"""
sequence_scores, results = compute_scores(nls_pssm, sequence, threshold = 2)

print(f"The scores are {sequence_scores}")
print()
for index, score in results:
    print(f"Match begins at {index+1} with score {score} and amino acid {sequence[index]}")

# Generate the logo
generate_logo(sequence, sequence_scores)

NameError: name 'nls_pssm' is not defined

In [7]:
# Example 2
sequence = """MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQESNALRNKKNSRRVS
FADTIKVFQTESHMKIVRKSEMEGCSAMVPSQLQLLPPGFKRFSCLSLPETETGENLLLI
QNKKLEDNYCEITGMNTLLSAPIHTQMQQKEFSIIEHTRERKHANDQTVIFSDENQMDLT
SSHTVMITKGLLDNPISEKSTKIDTTSFLANLKLHTEDSRMKKEVNFSVDQNTSSENKID
FNDFIKRLKTGKCSAFPDVPDKENFEIPIYSKEPNSASSTHQMHVSLKEDENNSNITRLF
REKDDGMNFTQCHTANIQTLIPTSSETNSRESKGNDITIYGNDFMDLTFNHTLQILPATG
NFSEIENQTQNAMDVTTGYGTKASGNKTVFKSKQNTAFQDLSINSADKIHITRSHIMGAE
THIVSQTCNQDARILAMTPESIYSNPSIQGCKTVFYSSCNDAMEMTKCLSNMREEKNLLK
HDSNYAKMYCNPDAMSSLTEKTIYSGEENMDITKSHTVAIDNQIFKQDQSNVQIAAAPTP
EKEMMLQNLMTTSEDGKMNVNCNSVPHVSKERIQQSLSNPLSISLTDRKTELLSGENMDL
TESHTSNLGSQVPLAAYNLAPESTSESHSQSKSSSDECEEITKSRNEPFQRSDIIAKNSL
TDTWNKDKDWVLKILPYLDKDSPQSADCNQEIATSHNIVYCGGVLDKQITNRNTVSWEQS
LFSTTKPLFSSGQFSMKNHDTAISSHTVKSVLGQNSKLAEPLRKSLSNPTPDYCHDKMII
CSEEEQNMDLTKSHTVVIGFGPSELQELGKTNLEHTTGQLTTMNRQIAVKVEKCGKSPIE
KSGVLKSNCIMDVLEDESVQKPKFPKEKQNVKIWGRKSVGGPKIDKTIVFSEDDKNDMDI
TKSYTIEINHRPLLEKRDCHLVPLAGTSETILYTCRQDDMEITRSHTTALECKTVSPDEI
TTRPMDKTVVFVDNHVELEMTESHTVFIDYQEKERTDRPNFELSQRKSLGTPTVICTPTE
ESVFFPGNGESDRLVANDSQLTPLEEWSNNRGPVEVADNMELSKSATCKNIKDVQSPGFL
NEPLSSKSQRRKSLKLKNDKTIVFSENHKNDMDITQSCMVEIDNESALEDKEDFHLAGAS
KTILYSCGQDDMEITRSHTTALECKTLLPNEIAIRPMDKTVLFTDNYSDLEVTDSHTVFI
DCQATEKILEENPKFGIGKGKNLGVSFPKDNSCVQEIAEKQALAVGNKIVLHTEQKQQLF
AATNRTTNEIIKFHSAAMDEKVIGKVVDQACTLEKAQVESCQLNNRDRRNVDFTSSHATA
VCGSSDNYSCLPNVISCTDNLEGSAMLLCDKDEEKANYCPVQNDLAYANDFASEYYLESE
GQPLSAPCPLLEKEEVIQTSTKGQLDCVITLHKDQDLIKDPRNLLANQTLVYSQDLGEMT
KLNSKRVSFKLPKDQMKVYVDDIYVIPQPHFSTDQPPLPKKGQSSINKEEVILSKAGNKS
LNIIENSSAPICENKPKILNSEEWFAAACKKELKENIQTTNYNTALDFHSNSDVTKQVIQ
THVNAGEAPDPVITSNVPCFHSIKPNLNNLNGKTGEFLAFQTVHLPPLPEQLLELGNKAH
NDMHIVQATEIHNINIISSNAKDSRDEENKKSHNGAETTSLPPKTVFKDKVRRCSLGIFL
PRLPNKRNCSVTGIDDLEQIPADTTDINHLETQPVSSKDSGIGSVAGKLNLSPSQYINEE
NLPVYPDEINSSDSINIETEEKALIETYQKEISPYENKMGKTCNSQKRTWVQEEEDIHKE
KKIRKNEIKFSDTTQDREIFDHHTEEDIDKSANSVLIKNLSRTPSSCSSSLDSIKADGTS
LDFSTYRSSQMESQFLRDTICEESLREKLQDGRITIREFFILLQVHILIQKPRQSNLPGN
FTVNTPPTPEDLMLSQYVYRPKIQIYREDCEARRQKIEELKLSASNQDKLLVDINKNLWE
KMRHCSDKELKAFGIYLNKIKSCFTKMTKVFTHQGKVALYGKLVQSAQNEREKLQIKIDE
MDKILKKIDNCLTEMETETKNLEDEEKNNPVEEWDSEMRAAEKELEQLKTEEEELQRNLL
ELEVQKEQTLAQIDFMQKQRNRTEELLDQLSLSEWDVVEWSDDQAVFTFVYDTIQLTITF
EESVVGFPFLDKRYRKIVDVNFQSLLDEDQAPPSSLLVHKLIFQYVEEKESWKKTCTTQH
QLPKMLEEFSLVVHHCRLLGEEIEYLKRWGPNYNLMNIDINNNELRLLFSSSAAFAKFEI
TLFLSAYYPSVPLPSTIQNHVGNTSQDDIATILSKVPLENNYLKNVVKQIYQDLFQDCHF
YH"""
sequence = sequence.replace('\n','')

sequence_scores, results = compute_scores(nls_pssm, sequence, threshold=2)

print(f"The scores are {sequence_scores}")
print()
for index, score in results:
    print(f"Match begins at {index+1} with score {score} and amino acid {sequence[index]}")

generate_logo(sequence, sequence_scores)

NameError: name 'nls_pssm' is not defined