In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from numpy import sqrt
from statsmodels.stats.multitest import multipletests
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
exo=pd.read_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Proteomic of exosome\Exosome.xlsx", header=0, index_col='Gene names')
exo.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,Control 1,Control 2,Control 3
Gene names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KRT82,32.208834,4.143358,66.506596,,739.471067,14121.09838
TERT,2355.994601,14.327965,48.80564,199615.0078,190.781979,1521.097753
GNB4,,,4.004981,283.621231,,1.090234
F13B,33.801424,,3.237875,2128.76644,0.293458,40.117075
S100A8,1979.604181,2384.608743,120.818042,89521.90037,143763.9554,545.448831


In [3]:
exo.shape

(327, 6)

In [4]:
# Columns belonging to Group 1 and 2
ALS_columns = [col for col in exo.columns if "ALS" in col]
Control_columns = [col for col in exo.columns if "Control" in col]


# Filter rows where at least 50% of Group 1 values are present, do the same for Group 2
filtered_als = exo[exo[ALS_columns].notna().mean(axis=1) >= 0.5]
cont_filt = filtered_als[filtered_als[Control_columns].notna().mean(axis=1) >= 0.5]

In [5]:
cont_filt.shape

(322, 6)

In [7]:
#cont_filt.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Proteomic of exosome\Exosome drop NA rows.xlsx")

In [8]:
#Normalizing in 0-5 range 
trans = MinMaxScaler(feature_range=(0,5))
scal = trans.fit_transform(cont_filt)

In [9]:
# convert the array back to a dataframe
down = pd.DataFrame(scal)
down

Unnamed: 0,0,1,2,3,4,5
0,0.000030,0.000004,2.824785e-05,,0.000774,0.014642
1,0.002223,0.000014,2.064877e-05,0.185187,0.000199,0.001562
2,0.000031,,1.086392e-06,0.001965,0.000000,0.000024
3,0.001868,0.002328,5.156392e-05,0.083046,0.150452,0.000549
4,0.000015,0.000020,4.466397e-07,0.000008,0.002031,0.000006
...,...,...,...,...,...,...
317,0.020526,0.027453,3.898862e-03,0.014210,0.017802,0.028218
318,0.012231,0.012740,5.651330e-03,0.012331,0.015114,0.012110
319,0.000782,0.002853,2.558945e-04,0.002799,0.000828,0.000560
320,0.004326,0.007806,7.530481e-03,0.006654,0.011743,0.012063


In [10]:
# Assign the index from df_with_index to df_no_index
down.index = cont_filt.index

# Assign the header from df_with_header to df_no_header
down.columns = cont_filt.columns[:down.shape[1]]
down.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,Control 1,Control 2,Control 3
Gene names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KRT82,3e-05,4e-06,2.824785e-05,,0.000774,0.014642
TERT,0.002223,1.4e-05,2.064877e-05,0.185187,0.000199,0.001562
F13B,3.1e-05,,1.086392e-06,0.001965,0.0,2.4e-05
S100A8,0.001868,0.002328,5.156392e-05,0.083046,0.150452,0.000549
PSAPL1,1.5e-05,2e-05,4.466397e-07,8e-06,0.002031,6e-06


In [11]:
# Automatically identify group columns by their prefixes
als = [col for col in down.columns if col.startswith('ALS ')]
control = [col for col in down.columns if col.startswith('Control ')]

# Perform t-tests row-wise
results = []
for i, row in cont_filt.iterrows():
    t_stat, p_value = ttest_ind(row[als].dropna(), row[control].dropna())
    results.append({'Gene names': i, 't_stat': t_stat, 'p_value': p_value})
    
# Convert results to a DataFrame
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Gene names,t_stat,p_value
0,KRT82,-1.483028,0.234697
1,TERT,-1.000663,0.373616
2,F13B,-0.776289,0.494162
3,S100A8,-1.830986,0.141066
4,PSAPL1,-1.011767,0.368879
...,...,...,...
317,CLSTN1,-0.029543,0.977847
318,AGRN,-0.106443,0.920355
319,CHL1,-0.008195,0.993854
320,EDIL3,0.006402,0.995198


In [12]:
# Add the results DataFrame to the original data DataFrame
prot_stat = pd.concat([down, results_df.set_index('Gene names')], axis=1)
prot_stat.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,Control 1,Control 2,Control 3,t_stat,p_value
Gene names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KRT82,3e-05,4e-06,2.824785e-05,,0.000774,0.014642,-1.483028,0.234697
TERT,0.002223,1.4e-05,2.064877e-05,0.185187,0.000199,0.001562,-1.000663,0.373616
F13B,3.1e-05,,1.086392e-06,0.001965,0.0,2.4e-05,-0.776289,0.494162
S100A8,0.001868,0.002328,5.156392e-05,0.083046,0.150452,0.000549,-1.830986,0.141066
PSAPL1,1.5e-05,2e-05,4.466397e-07,8e-06,0.002031,6e-06,-1.011767,0.368879


In [13]:
# Apply Benjamini-Hochberg correction (optional)
_, p_adjusted, _, _ = multipletests(prot_stat['p_value'], method='fdr_bh')
prot_stat['p_adjusted'] = p_adjusted
prot_stat.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,Control 1,Control 2,Control 3,t_stat,p_value,p_adjusted
Gene names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
KRT82,3e-05,4e-06,2.824785e-05,,0.000774,0.014642,-1.483028,0.234697,0.762722
TERT,0.002223,1.4e-05,2.064877e-05,0.185187,0.000199,0.001562,-1.000663,0.373616,0.762722
F13B,3.1e-05,,1.086392e-06,0.001965,0.0,2.4e-05,-0.776289,0.494162,0.792532
S100A8,0.001868,0.002328,5.156392e-05,0.083046,0.150452,0.000549,-1.830986,0.141066,0.762722
PSAPL1,1.5e-05,2e-05,4.466397e-07,8e-06,0.002031,6e-06,-1.011767,0.368879,0.762722


In [14]:
_, p_adjusted, _, _ = multipletests(prot_stat['p_value'], method='bonferroni')
prot_stat['p_adjusted_bonf'] = p_adjusted
prot_stat.head()

Unnamed: 0_level_0,ALS 1,ALS 2,ALS 3,Control 1,Control 2,Control 3,t_stat,p_value,p_adjusted,p_adjusted_bonf
Gene names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
KRT82,3e-05,4e-06,2.824785e-05,,0.000774,0.014642,-1.483028,0.234697,0.762722,1.0
TERT,0.002223,1.4e-05,2.064877e-05,0.185187,0.000199,0.001562,-1.000663,0.373616,0.762722,1.0
F13B,3.1e-05,,1.086392e-06,0.001965,0.0,2.4e-05,-0.776289,0.494162,0.792532,1.0
S100A8,0.001868,0.002328,5.156392e-05,0.083046,0.150452,0.000549,-1.830986,0.141066,0.762722,1.0
PSAPL1,1.5e-05,2e-05,4.466397e-07,8e-06,0.002031,6e-06,-1.011767,0.368879,0.762722,1.0


In [15]:
#df['average'] = df.mean(numeric_only=True, axis=1)
prot_stat['Mean Control'] = prot_stat.loc[:, prot_stat.columns.str.startswith('Control ')].mean(axis=1)
prot_stat['Mean ALS'] = prot_stat.loc[:, prot_stat.columns.str.startswith('ALS ')].mean(axis=1)

In [16]:
#Fold Change
prot_stat["FC ALS-C"]= prot_stat["Mean ALS"]/prot_stat["Mean Control"]

In [17]:
#Log 2 FC
prot_stat["Log2FC ALS-C"]=np.log2(prot_stat["FC ALS-C"])

In [19]:
prot_stat.to_excel(r"C:\Users\Client\OneDrive - Queen Mary, University of London\PhD\PhD back up\Programing full\R\Datasets\Proteomic of exosome\Exosome T-test.xlsx")