In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

In [5]:
lab=pd.read_excel (r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\Label-free analyses.xlsx", header= 0, index_col = "Accession")
lab.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
P01024,375,361,368,389,370,355,336,330,387,280,301,265,275,277,270,219,253
P41222,238,285,214,281,247,301,379,264,369,361,347,253,334,310,311,266,285
P01034,191,202,157,161,151,187,181,168,187,205,198,180,183,180,172,135,166
P0C0L4,162,174,175,192,188,189,175,166,169,165,160,167,168,166,166,150,175
P01023,178,145,169,147,144,162,153,133,153,149,128,138,141,133,134,131,134


In [14]:
from Bio import Entrez, SeqIO

In [15]:
def parse_protein_lengths(protein_raw: str):
    prot_lens = {}
    
    # Separate each protein, seperated by \n\n.
    for prot in protein_raw.split('\n\n'):
        # Ignore any empty lines
        if not prot:
            continue
            
        # Split protein data by header first then multiple lines of data
        split_prot = prot.split('\n')
        header = split_prot[0]   # Take header
        prot_data = split_prot[1:]  # Take the rest
        
        prot_id = header.split('|')[1][0:6]  # Header is between the first two | characters
        prot_len = sum([len(prot_line) for prot_line in prot_data])  # Add up the lengths of the lines of data
        
        prot_lens[prot_id] = prot_len
        
    return prot_lens

# Function to get protein length from NCBI
def get_protein_lengths(protein_accessions):
    Entrez.email = "yoana.bobeva@qmul.ac.uk"  # Always include your email
    handle = Entrez.efetch(db="protein", id=protein_accessions, rettype="fasta", retmode="text")
    fasta_record = handle.read()
    handle.close()
    
    protein_lengths = parse_protein_lengths(fasta_record)
    return protein_lengths
    

In [16]:
full_prot_lens = get_protein_lengths(list(lab.index)) # for each protein name in the df get the protein lenght

In [17]:
prot_lens_df = pd.DataFrame(full_prot_lens.items(), columns=['Accession', 'Protein Length']).set_index('Accession')
prot_lens_df

Unnamed: 0_level_0,Protein Length
Accession,Unnamed: 1_level_1
P01024,1663
P41222,190
P01034,146
P0C0L4,1744
P01023,1474
...,...
Q6TFL3,1326
Q6ZRH9,516
Q702N8,1843
Q9H2F9,335


In [18]:
#Merge the protein lenght results with the df by Accession column
lab_with_lens = lab.merge(prot_lens_df, left_index=True, right_index=True)
lab_with_lens

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8,Protein Length
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
P01024,375,361,368,389,370,355,336,330,387,280,301,265,275,277,270,219,253,1663
P41222,238,285,214,281,247,301,379,264,369,361,347,253,334,310,311,266,285,190
P01034,191,202,157,161,151,187,181,168,187,205,198,180,183,180,172,135,166,146
P0C0L4,162,174,175,192,188,189,175,166,169,165,160,167,168,166,166,150,175,1744
P01023,178,145,169,147,144,162,153,133,153,149,128,138,141,133,134,131,134,1474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q6TFL3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1326
Q6ZRH9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,516
Q702N8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1843
Q9H2F9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,335


Reference https://biopython.org/docs/1.75/api/Bio.Entrez.html
    

In [19]:
#calculating the Normalised Spectral Count (NSC)= spectral count/protein lenght
lab_nsc = lab.div(prot_lens_df['Protein Length'], axis=0)
lab_nsc

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A0A583,0.000000,0.008696,0.017391,0.017391,0.008696,0.008696,0.008696,0.000000,0.026087,0.017391,0.008696,0.008696,0.008696,0.000000,0.008696,0.000000,0.008696
A0AV02,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002801,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1A4F0,0.000000,0.014815,0.007407,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1X4Q0,,,,,,,,,,,,,,,,,
A2A2E1,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6I9,0.006390,0.006390,0.000000,0.006390,0.003195,0.003195,0.000000,0.000000,0.000000,0.003195,0.000000,0.000000,0.003195,0.000000,0.000000,0.000000,0.003195
Q9Y6N7,0.001211,0.001211,0.000606,0.000606,0.000606,0.000606,0.001211,0.000000,0.001211,0.000606,0.000606,0.000606,0.000000,0.001211,0.000000,0.001211,0.000000
Q9Y6Q9,0.001404,0.000702,0.000000,0.000702,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q9Y6R7,0.002220,0.002590,0.002220,0.002405,0.001110,0.002960,0.003515,0.002220,0.002960,0.001850,0.001665,0.002775,0.001665,0.001850,0.003700,0.001665,0.001295


In [20]:
#calculating the Normalised spectral abundance factor = NSC/NSC for all proteins
lab_nsaf = lab_nsc.div(lab_nsc.sum(axis=0))
lab_nsaf

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A0A583,0.000000,0.000582,0.001263,0.001254,0.000674,0.000626,0.000600,0.000000,0.001795,0.001225,0.000608,0.000744,0.000651,0.000000,0.000688,0.000000,0.000765
A0AV02,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000193,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1A4F0,0.000000,0.000992,0.000538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1X4Q0,,,,,,,,,,,,,,,,,
A2A2E1,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6I9,0.000446,0.000428,0.000000,0.000461,0.000248,0.000230,0.000000,0.000000,0.000000,0.000225,0.000000,0.000000,0.000239,0.000000,0.000000,0.000000,0.000281
Q9Y6N7,0.000085,0.000081,0.000044,0.000044,0.000047,0.000044,0.000084,0.000000,0.000083,0.000043,0.000042,0.000052,0.000000,0.000091,0.000000,0.000113,0.000000
Q9Y6Q9,0.000098,0.000047,0.000000,0.000051,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Q9Y6R7,0.000155,0.000173,0.000161,0.000173,0.000086,0.000213,0.000243,0.000174,0.000204,0.000130,0.000116,0.000238,0.000125,0.000139,0.000293,0.000155,0.000114


In [23]:
lab_nsaf.shape

(1697, 17)

In [30]:
# Replace 0 with NaN
lab_nona=lab_nsaf.replace(0, np.nan)

In [32]:
# Columns belonging to Group 1 and 2
ALS_columns = [col for col in lab_nona.columns if "ALS" in col]
Control_columns = [col for col in lab_nona.columns if "Control" in col]


# Filter rows where at least 50% of Group 1 values are present, do the same for Group 2
filtered_lab = lab_nona[lab_nona[ALS_columns].notna().mean(axis=1) >= 0.5]
cont_filt = filtered_lab[filtered_lab[Control_columns].notna().mean(axis=1) >= 0.5]

In [33]:
cont_filt.shape

(463, 17)

In [37]:
cont_filt.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8,Mean Control,Mean ALS
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
A0A583,,0.000582,0.001263,0.001254,0.000674,0.000626,0.0006,,0.001795,0.001225,0.000608,0.000744,0.000651,,0.000688,,0.000765,0.00078,0.000971
A6NLU5,0.000735,0.000705,0.000765,0.000506,0.000544,0.000505,0.000726,0.000552,0.000241,0.000247,0.000736,0.000901,0.000788,0.001053,0.000833,0.000327,0.000617,0.000688,0.000587
A6NMZ7,,5.9e-05,,6.4e-05,,3.2e-05,3e-05,6.9e-05,3e-05,3.1e-05,3.1e-05,,3.3e-05,,,4.1e-05,,3.4e-05,4.8e-05
A8MU93,0.000592,0.001135,0.003078,0.001222,0.001314,0.001829,0.000585,0.001332,0.000583,0.001194,,0.001451,,,0.002683,,0.001491,0.001705,0.001297
A8MVJ9,0.000201,0.000193,0.000209,,0.000447,0.000207,0.000199,,0.000397,0.000406,0.000403,0.000247,,0.000432,0.000228,,,0.000343,0.000265


In [35]:
cont_filt.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nona als-hc.xlsx")

In [4]:
#df['average'] = df.mean(numeric_only=True, axis=1)
cont_filt['Mean Control'] = cont_filt.loc[:, cont_filt.columns.str.startswith('Control ')].mean(axis=1)
cont_filt['Mean ALS'] = cont_filt.loc[:, cont_filt.columns.str.startswith('ALS ')].mean(axis=1)

In [44]:
cont_filt.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,Control 2,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8,Mean Control,Mean ALS
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
A0A583,,0.000582,0.001263,0.001254,0.000674,0.000626,0.0006,,0.001795,0.001225,0.000608,0.000744,0.000651,,0.000688,,0.000765,0.00078,0.000971
A6NLU5,0.000735,0.000705,0.000765,0.000506,0.000544,0.000505,0.000726,0.000552,0.000241,0.000247,0.000736,0.000901,0.000788,0.001053,0.000833,0.000327,0.000617,0.000688,0.000587
A6NMZ7,,5.9e-05,,6.4e-05,,3.2e-05,3e-05,6.9e-05,3e-05,3.1e-05,3.1e-05,,3.3e-05,,,4.1e-05,,3.4e-05,4.8e-05
A8MU93,0.000592,0.001135,0.003078,0.001222,0.001314,0.001829,0.000585,0.001332,0.000583,0.001194,,0.001451,,,0.002683,,0.001491,0.001705,0.001297
A8MVJ9,0.000201,0.000193,0.000209,,0.000447,0.000207,0.000199,,0.000397,0.000406,0.000403,0.000247,,0.000432,0.000228,,,0.000343,0.000265


In [45]:
# Automatically identify group columns by their prefixes
als = [col for col in cont_filt.columns if col.startswith('ALS ')]
control = [col for col in cont_filt.columns if col.startswith('Control ')]


In [51]:
# Perform t-tests row-wise
results = []
for i, row in cont_filt.iterrows():
    t_stat, p_value = ttest_ind(row[als].dropna(), row[control].dropna())
    results.append({'Gene names': i, 't_stat': t_stat, 'p_value': p_value})

In [52]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results)

In [53]:
results_df

Unnamed: 0,Gene names,t_stat,p_value
0,A0A583,0.899549,0.387629
1,A6NLU5,-0.922798,0.370718
2,A6NMZ7,1.394730,0.200605
3,A8MU93,-0.894659,0.390125
4,A8MVJ9,-1.285757,0.227503
...,...,...,...
458,Q9Y646,-3.481436,0.004056
459,Q9Y6A4,2.349734,0.040659
460,Q9Y6N7,-0.295745,0.772933
461,Q9Y6R7,0.454871,0.655716


In [54]:
# Add the results DataFrame to the original data DataFrame
lab_stat = pd.concat([cont_filt, results_df.set_index('Gene names')], axis=1)
lab_stat.head()

Unnamed: 0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,...,Control 3,Control 4,Control 5,Control 6,Control 7,Control 8,Mean Control,Mean ALS,t_stat,p_value
A0A583,,0.000582,0.001263,0.001254,0.000674,0.000626,0.0006,,0.001795,0.001225,...,0.000744,0.000651,,0.000688,,0.000765,0.00078,0.000971,0.899549,0.387629
A6NLU5,0.000735,0.000705,0.000765,0.000506,0.000544,0.000505,0.000726,0.000552,0.000241,0.000247,...,0.000901,0.000788,0.001053,0.000833,0.000327,0.000617,0.000688,0.000587,-0.922798,0.370718
A6NMZ7,,5.9e-05,,6.4e-05,,3.2e-05,3e-05,6.9e-05,3e-05,3.1e-05,...,,3.3e-05,,,4.1e-05,,3.4e-05,4.8e-05,1.39473,0.200605
A8MU93,0.000592,0.001135,0.003078,0.001222,0.001314,0.001829,0.000585,0.001332,0.000583,0.001194,...,0.001451,,,0.002683,,0.001491,0.001705,0.001297,-0.894659,0.390125
A8MVJ9,0.000201,0.000193,0.000209,,0.000447,0.000207,0.000199,,0.000397,0.000406,...,0.000247,,0.000432,0.000228,,,0.000343,0.000265,-1.285757,0.227503


In [57]:
# Apply Benjamini-Hochberg correction (optional)
_, p_adjusted, _, _ = multipletests(lab_stat['p_value'], method='bonferroni')
lab_stat['p_adjusted'] = p_adjusted
lab_stat.head()


Unnamed: 0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,Control 1,...,Control 4,Control 5,Control 6,Control 7,Control 8,Mean Control,Mean ALS,t_stat,p_value,p_adjusted
A0A583,,0.000582,0.001263,0.001254,0.000674,0.000626,0.0006,,0.001795,0.001225,...,0.000651,,0.000688,,0.000765,0.00078,0.000971,0.899549,0.387629,1.0
A6NLU5,0.000735,0.000705,0.000765,0.000506,0.000544,0.000505,0.000726,0.000552,0.000241,0.000247,...,0.000788,0.001053,0.000833,0.000327,0.000617,0.000688,0.000587,-0.922798,0.370718,1.0
A6NMZ7,,5.9e-05,,6.4e-05,,3.2e-05,3e-05,6.9e-05,3e-05,3.1e-05,...,3.3e-05,,,4.1e-05,,3.4e-05,4.8e-05,1.39473,0.200605,1.0
A8MU93,0.000592,0.001135,0.003078,0.001222,0.001314,0.001829,0.000585,0.001332,0.000583,0.001194,...,,,0.002683,,0.001491,0.001705,0.001297,-0.894659,0.390125,1.0
A8MVJ9,0.000201,0.000193,0.000209,,0.000447,0.000207,0.000199,,0.000397,0.000406,...,,0.000432,0.000228,,,0.000343,0.000265,-1.285757,0.227503,1.0


In [7]:
lab_stat.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf stats.xlsx")


Unnamed: 0.1,Unnamed: 0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,ALS 9,...,Control 4,Control 5,Control 6,Control 7,Control 8,Mean Control,Mean ALS,t_stat,p_value,p_adjusted
0,A0A583,,0.000582,0.001263,0.001254,0.000674,0.000626,0.0006,,0.001795,...,0.000651,,0.000688,,0.000765,0.00078,0.000971,0.899549,0.387629,1.0
1,A6NLU5,0.000735,0.000705,0.000765,0.000506,0.000544,0.000505,0.000726,0.000552,0.000241,...,0.000788,0.001053,0.000833,0.000327,0.000617,0.000688,0.000587,-0.922798,0.370718,1.0
2,A6NMZ7,,5.9e-05,,6.4e-05,,3.2e-05,3e-05,6.9e-05,3e-05,...,3.3e-05,,,4.1e-05,,3.4e-05,4.8e-05,1.39473,0.200605,1.0
3,A8MU93,0.000592,0.001135,0.003078,0.001222,0.001314,0.001829,0.000585,0.001332,0.000583,...,,,0.002683,,0.001491,0.001705,0.001297,-0.894659,0.390125,1.0
4,A8MVJ9,0.000201,0.000193,0.000209,,0.000447,0.000207,0.000199,,0.000397,...,,0.000432,0.000228,,,0.000343,0.000265,-1.285757,0.227503,1.0


In [8]:
lab_stat= pd.read_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf stats.xlsx", header=0)
lab_stat.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,...,Control 4,Control 5,Control 6,Control 7,Control 8,Mean Control,Mean ALS,t_stat,p_value,p_adjusted
0,0,A0A583,,0.000582,0.001263,0.001254,0.000674,0.000626,0.0006,,...,0.000651,,0.000688,,0.000765,0.00078,0.000971,0.899549,0.387629,1.0
1,1,A6NLU5,0.000735,0.000705,0.000765,0.000506,0.000544,0.000505,0.000726,0.000552,...,0.000788,0.001053,0.000833,0.000327,0.000617,0.000688,0.000587,-0.922798,0.370718,1.0
2,2,A6NMZ7,,5.9e-05,,6.4e-05,,3.2e-05,3e-05,6.9e-05,...,3.3e-05,,,4.1e-05,,3.4e-05,4.8e-05,1.39473,0.200605,1.0
3,3,A8MU93,0.000592,0.001135,0.003078,0.001222,0.001314,0.001829,0.000585,0.001332,...,,,0.002683,,0.001491,0.001705,0.001297,-0.894659,0.390125,1.0
4,4,A8MVJ9,0.000201,0.000193,0.000209,,0.000447,0.000207,0.000199,,...,,0.000432,0.000228,,,0.000343,0.000265,-1.285757,0.227503,1.0


In [9]:
lab_stat["FC ALS-C"]= lab_stat["Mean ALS"]/lab_stat["Mean Control"]

In [10]:
#Log 2 FC
lab_stat["Log2FC ALS-C"]=np.log2(lab_stat["FC ALS-C"])

In [11]:
lab_stat.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ALS 1,ALS 2,ALS 3,ALS 4,ALS 5,ALS 6,ALS 7,ALS 8,...,Control 6,Control 7,Control 8,Mean Control,Mean ALS,t_stat,p_value,p_adjusted,FC ALS-C,Log2FC ALS-C
0,0,A0A583,,0.000582,0.001263,0.001254,0.000674,0.000626,0.0006,,...,0.000688,,0.000765,0.00078,0.000971,0.899549,0.387629,1.0,1.244247,0.315273
1,1,A6NLU5,0.000735,0.000705,0.000765,0.000506,0.000544,0.000505,0.000726,0.000552,...,0.000833,0.000327,0.000617,0.000688,0.000587,-0.922798,0.370718,1.0,0.85288,-0.229586
2,2,A6NMZ7,,5.9e-05,,6.4e-05,,3.2e-05,3e-05,6.9e-05,...,,4.1e-05,,3.4e-05,4.8e-05,1.39473,0.200605,1.0,1.394296,0.479536
3,3,A8MU93,0.000592,0.001135,0.003078,0.001222,0.001314,0.001829,0.000585,0.001332,...,0.002683,,0.001491,0.001705,0.001297,-0.894659,0.390125,1.0,0.760733,-0.394539
4,4,A8MVJ9,0.000201,0.000193,0.000209,,0.000447,0.000207,0.000199,,...,0.000228,,,0.000343,0.000265,-1.285757,0.227503,1.0,0.771549,-0.374169


In [12]:
lab_stat.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf stats.xlsx")


In [None]:
proc=pd.read_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf stats.xlsx", header=0)
proc.head()

In [None]:
orig=pd.read_excel (r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\Label-free analyses original.xlsx", header=0)
orig.head()

In [None]:
# Merge the original and the processed df by 'Accession' so you have the gene name in the proccessed version as well
combined = pd.merge(proc, orig[['Accession', 'Gene']], on='Accession', how='left')
combined.head()

In [37]:
combined.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf stats dictionary.xlsx")


In [33]:
# Sort the DataFrame by the absolute values of the column
lab_stat_sorted= combined.loc[combined["Log2FC ALS-C"].abs().sort_values(ascending=False).index]

In [34]:
lab_stat_sorted.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf stats sorted.xlsx")

In [35]:
filtered_lab = lab_stat_sorted[lab_stat_sorted['p_value'] <=0.05]


In [36]:
filtered_lab.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Label free\label free nsaf stats filtered.xlsx")