In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the label-free table from Excel; the first row supplies the column
# names and the "Accession" column becomes the row index.
lab = pd.read_excel(
    r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free.xlsx",
    header=0,
    index_col="Accession",
)
lab.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
P01024,375,361,368,389,370,355,336,330,387,280,301,265,275,277,270,219,253
P41222,238,285,214,281,247,301,379,264,369,361,347,253,334,310,311,266,285
P01034,191,202,157,161,151,187,181,168,187,205,198,180,183,180,172,135,166
P0C0L4,162,174,175,192,188,189,175,166,169,165,160,167,168,166,166,150,175
P01023,178,145,169,147,144,162,153,133,153,149,128,138,141,133,134,131,134


In [3]:
from Bio import Entrez, SeqIO

In [4]:
def parse_protein_lengths(protein_raw: str):
    """Map every protein accession in a multi-record FASTA string to its sequence length.

    Records are delimited by their ``>`` header lines rather than by blank
    lines, so the text may separate records with blank lines, a single
    newline, or Windows (``\\r\\n``) line endings without corrupting the counts.

    The accession is the second ``|``-delimited field of the header
    (``>sp|P01024.2|CO3_HUMAN ...`` -> ``P01024``); a trailing ``.version``
    suffix is dropped. The full accession is kept, so 10-character
    accessions are not truncated. Headers without ``|`` fall back to the
    first whitespace-delimited token after ``>``.

    Parameters
    ----------
    protein_raw : str
        FASTA text containing zero or more records.

    Returns
    -------
    dict
        ``{accession: number of sequence characters}``. If the same
        accession appears more than once, the last record wins. Records
        whose header yields no accession are skipped.
    """
    prot_lens = {}
    current_id = None  # accession of the record being read; None before the first usable header

    for raw_line in protein_raw.splitlines():
        line = raw_line.strip()  # drops '\r' and stray padding so they are never counted as residues
        if not line:
            continue

        if line.startswith('>'):
            fields = line[1:].split('|')
            if len(fields) > 1:
                accession = fields[1]
            else:
                # No '|' in the header: use the first token after '>'
                tokens = fields[0].split()
                accession = tokens[0] if tokens else ''

            # Remove the ".N" sequence-version suffix, keep the whole accession
            accession = accession.strip().split('.')[0]
            current_id = accession or None
            if current_id is not None:
                prot_lens[current_id] = 0
        elif current_id is not None:
            # Sequence data can span several lines; accumulate their lengths
            prot_lens[current_id] += len(line)

    return prot_lens

# Function to get protein length from NCBI
def get_protein_lengths(protein_accessions):
    """Fetch FASTA records from the NCBI ``protein`` database and return their lengths.

    Parameters
    ----------
    protein_accessions : str or iterable of str
        Accession(s) to look up. A string is passed to Entrez unchanged;
        any other iterable (list, pandas Index, generator, ...) is
        materialised into a list first.

    Returns
    -------
    dict
        ``{accession: sequence length}`` as produced by
        ``parse_protein_lengths``. An empty input returns ``{}`` without
        contacting NCBI.
    """
    if isinstance(protein_accessions, str):
        ids = protein_accessions
    else:
        ids = list(protein_accessions)

    # Nothing to look up: skip the network round-trip entirely
    if not ids:
        return {}

    Entrez.email = "yoana.bobeva@qmul.ac.uk"  # Always include your email
    handle = Entrez.efetch(db="protein", id=ids, rettype="fasta", retmode="text")
    try:
        fasta_record = handle.read()
    finally:
        # Release the connection even if reading the response fails
        handle.close()

    return parse_protein_lengths(fasta_record)
    

In [5]:
full_prot_lens = get_protein_lengths(lab.index.tolist())  # look up the protein length for every accession in the df index

In [6]:
# Turn the {accession: length} mapping into a one-column frame indexed by accession
prot_lens_df = (
    pd.DataFrame(list(full_prot_lens.items()), columns=['Accession', 'Protein Length'])
    .set_index('Accession')
)
prot_lens_df

Unnamed: 0_level_0,Protein Length
Accession,Unnamed: 1_level_1
P01024,1663
P41222,190
P01034,146
P0C0L4,1744
P01023,1474
...,...
Q6TFL3,1326
Q6ZRH9,516
Q702N8,1843
Q9H2F9,335


In [8]:
# Attach each protein's length to its row of counts, matching rows on the Accession index.
# Inner join: only accessions present in both frames are kept.
lab_with_lens = pd.merge(lab, prot_lens_df, how="inner", left_index=True, right_index=True)
lab_with_lens

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8,Protein Length
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
P01024,375,361,368,389,370,355,336,330,387,280,301,265,275,277,270,219,253,1663
P41222,238,285,214,281,247,301,379,264,369,361,347,253,334,310,311,266,285,190
P01034,191,202,157,161,151,187,181,168,187,205,198,180,183,180,172,135,166,146
P0C0L4,162,174,175,192,188,189,175,166,169,165,160,167,168,166,166,150,175,1744
P01023,178,145,169,147,144,162,153,133,153,149,128,138,141,133,134,131,134,1474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q6TFL3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1326
Q6ZRH9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,516
Q702N8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1843
Q9H2F9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,335


Reference: https://biopython.org/docs/1.75/api/Bio.Entrez.html (Biopython `Bio.Entrez` API documentation)
    

In [13]:
# Normalised Spectral Count (NSC) = spectral count / protein length.
# Rows are matched on the Accession index; an accession without a fetched length comes out as NaN.
lab_nsc = lab.divide(prot_lens_df['Protein Length'], axis='index')
lab_nsc

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A0A583,0.000000,0.008696,0.017391,0.017391,0.008696,0.008696,0.008696,0.000000,0.026087,0.017391,0.008696,0.008696,0.008696,0.000000,0.008696,0.000000,0.008696
A0AV02,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002801,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1A4F0,0.000000,0.014815,0.007407,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1X4Q0,,,,,,,,,,,,,,,,,
A2A2E1,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6I9,0.006390,0.006390,0.000000,0.006390,0.003195,0.003195,0.000000,0.000000,0.000000,0.003195,0.000000,0.000000,0.003195,0.000000,0.000000,0.000000,0.003195
Q9Y6N7,0.001211,0.001211,0.000606,0.000606,0.000606,0.000606,0.001211,0.000000,0.001211,0.000606,0.000606,0.000606,0.000000,0.001211,0.000000,0.001211,0.000000
Q9Y6Q9,0.001404,0.000702,0.000000,0.000702,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q9Y6R7,0.002220,0.002590,0.002220,0.002405,0.001110,0.002960,0.003515,0.002220,0.002960,0.001850,0.001665,0.002775,0.001665,0.001850,0.003700,0.001665,0.001295


In [20]:
# Normalised Spectral Abundance Factor (NSAF) = NSC / sum of NSC over all proteins in the same sample (column)
lab_nsaf = lab_nsc.divide(lab_nsc.sum(axis='index'), axis='columns')
lab_nsaf

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A0A583,0.000000,0.000582,0.001263,0.001254,0.000674,0.000626,0.000600,0.000000,0.001795,0.001225,0.000608,0.000744,0.000651,0.000000,0.000688,0.000000,0.000765
A0AV02,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000193,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1A4F0,0.000000,0.000992,0.000538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1X4Q0,,,,,,,,,,,,,,,,,
A2A2E1,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6I9,0.000446,0.000428,0.000000,0.000461,0.000248,0.000230,0.000000,0.000000,0.000000,0.000225,0.000000,0.000000,0.000239,0.000000,0.000000,0.000000,0.000281
Q9Y6N7,0.000085,0.000081,0.000044,0.000044,0.000047,0.000044,0.000084,0.000000,0.000083,0.000043,0.000042,0.000052,0.000000,0.000091,0.000000,0.000113,0.000000
Q9Y6Q9,0.000098,0.000047,0.000000,0.000051,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q9Y6R7,0.000155,0.000173,0.000161,0.000173,0.000086,0.000213,0.000243,0.000174,0.000204,0.000130,0.000116,0.000238,0.000125,0.000139,0.000293,0.000155,0.000114


In [21]:
# Save the NSAF table to an Excel workbook
lab_nsaf.to_excel(
    r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf.xlsx"
)