In [1]:
#import necessary library
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection
import operator

In [2]:
def GFS (series, theta1 = 0.05, theta2 = 0.15):
    rank = series.rank(method = 'average', ascending = False) # largest -> smallest
    high_rank = np.quantile(rank, theta1) # q1
    low_rank = np.quantile(rank, theta2) # q2
    
    output = []
    for i in rank:
        if i < high_rank:
            output.append(1)
        elif ((i >= high_rank) and (i <= low_rank)):
            temp = (i - low_rank) / (high_rank - low_rank)
            output.append(temp)
        else:
            output.append(0)
    return pd.Series(output)

In [3]:
def perform_GFS(dataframe, theta1 = 0.05, theta2 = 0.15):
    index = dataframe.index
    column = dataframe.columns
    
    df_GFS = pd.DataFrame()
    for columns in dataframe:
        sample = columns
        scored_list = GFS(dataframe[sample])
        df_GFS = pd.concat([df_GFS, scored_list], axis=1)
    
    df_GFS.index = index
    df_GFS.columns = column
    return (df_GFS)

In [5]:
#read data and metadata
GSE43358_withoutQN = pd.read_csv("dataframe_files/GSE43358_withoutQN.csv", index_col=0)
metadata = pd.read_csv("dataframe_files/metadata_subset_GSE43358.csv", index_col= 0)

In [6]:
GSE43358_withoutQN

Unnamed: 0_level_0,GSM1060909,GSM1060910,GSM1060911,GSM1060912,GSM1060913,GSM1060914,GSM1060915,GSM1060916,GSM1060917,GSM1060918,...,GSM1060957,GSM1060958,GSM1060959,GSM1060960,GSM1060961,GSM1060962,GSM1060963,GSM1060964,GSM1060965,GSM1060966
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1007_s_at,10.188780,9.962777,10.032398,9.214594,9.829366,9.082002,9.497702,9.599882,10.056246,9.497774,...,8.117539,9.446948,8.885882,8.933331,9.525502,8.490911,9.618735,9.190977,8.275251,9.309001
1053_at,7.403849,6.564261,6.976259,6.781573,6.759277,6.841969,7.535065,7.435392,7.364623,6.435484,...,7.092708,7.803411,7.197789,6.628817,6.861754,6.703667,7.526706,6.941258,7.251745,7.255652
117_at,5.948930,6.536095,5.473324,6.218211,5.490909,5.400712,7.796307,6.808738,5.571715,4.719477,...,5.469290,5.495843,5.353782,5.931500,5.242922,5.129269,6.129595,5.729089,6.429800,6.016900
121_at,6.998864,7.035181,6.263843,6.792121,6.469531,6.733990,7.021524,6.741721,6.684385,6.656057,...,6.610905,6.973430,6.362282,6.641244,6.692130,6.062184,6.846753,6.893562,6.731056,6.695316
1255_g_at,2.829625,2.693465,2.073785,2.178817,2.214327,2.276154,2.961425,2.221415,2.189473,2.168044,...,2.218601,2.449555,2.568569,2.153643,2.597603,2.369518,2.477939,2.362445,2.485852,2.620372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AFFX-ThrX-5_at,8.010415,6.744503,2.964290,5.328729,5.183479,5.272722,7.933039,5.986203,5.059674,4.876238,...,6.171054,6.332296,5.756505,6.868064,6.713837,6.828522,8.456851,8.242763,6.296457,6.245338
AFFX-ThrX-M_at,8.661574,7.441604,2.892905,5.721304,5.626630,5.016970,8.608850,6.434170,5.561896,5.370713,...,6.697781,6.714201,6.306748,7.326152,7.142821,7.283352,9.025523,8.853293,6.735507,6.766038
AFFX-TrpnX-3_at,2.063522,1.967356,1.860255,2.011838,1.829054,1.703161,2.148197,1.995315,1.873334,1.759921,...,1.639394,1.953202,2.192831,1.997497,1.785865,1.936763,1.779354,2.067547,1.789915,1.818087
AFFX-TrpnX-5_at,2.987895,3.102293,2.490438,2.675669,2.576297,2.383140,2.901501,2.727772,2.470153,2.481942,...,2.368835,2.680062,2.485163,2.631088,2.772938,2.350664,2.557110,2.598819,2.767382,2.491720


In [7]:
GSE43358_GFS = perform_GFS(GSE43358_withoutQN)

In [8]:
GSE43358_GFS = GSE43358_GFS.T

In [50]:
GSE43358_GFS.T.to_csv("dataframe_files/GSE43358_GFS_python.csv")

Arrange the sample IDs of metadata so that the order is consistent with GFS data frame.

In [23]:
metadata = metadata.loc[GSE43358_GFS.index, ]

In [26]:
metadata.head()

Unnamed: 0,title,triple_negative_status,subtype,er,her2,pgr,submission_date,last_update_date
GSM1060909,HER2-13,not TN,HER2,1,1,1,Jan 08 2013,Nov 22 2014
GSM1060910,HER2-14,not TN,HER2,1,1,1,Jan 08 2013,Nov 22 2014
GSM1060911,HER2-15,not TN,HER2,1,1,1,Jan 08 2013,Nov 22 2014
GSM1060912,HER2-16,not TN,HER2,1,1,0,Jan 08 2013,Nov 22 2014
GSM1060913,HER2-18,not TN,HER2,1,1,0,Jan 08 2013,Nov 22 2014


In [27]:
# Define two groups of breast cancer cells
TNBC = GSE43358_GFS.loc[metadata['triple_negative_status'] == 'TN']
not_TNBC = GSE43358_GFS.loc[metadata['triple_negative_status'] == 'not TN']

In [28]:
TNBC

ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
GSM1060950,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210722,0.0,...,1.0,1.0,1.0,1.0,0.198284,0.0,0.0,0.0,0.0,0.0
GSM1060951,0.947086,0.822347,0.0,0.0,0.0,0.0,0.0,0.0,0.442459,0.0,...,1.0,1.0,1.0,1.0,0.431119,0.0,0.0,0.0,0.0,0.0
GSM1060952,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060953,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.511047,0.0,...,1.0,1.0,1.0,1.0,0.182006,0.0,0.0,0.0,0.0,0.0
GSM1060954,0.795277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.526594,0.0,...,1.0,1.0,1.0,1.0,0.466968,0.0,0.01209,0.0,0.0,0.0
GSM1060955,0.940136,0.053426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060956,1.0,0.026539,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060957,0.80113,0.227366,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.408988,0.0,0.0,0.0,0.0,0.0
GSM1060958,1.0,0.237974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060959,0.977265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
not_TNBC

ID,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
GSM1060909,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.970315,0.115795,0.611644,0.0,0.0,0.0
GSM1060910,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053426,0.0,...,1.0,1.0,1.0,1.0,0.663039,0.0,0.0,0.0,0.0,0.0
GSM1060911,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.974339,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060912,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060913,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.480137,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060914,1.0,0.087351,0.0,0.0,0.0,0.0,0.0,0.0,0.258029,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060915,1.0,0.0,0.187493,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.288272,0.703095,0.0,0.0,0.0
GSM1060916,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060917,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1060918,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189688,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Mann Whitney U-test & DEG Analysis (GSE43358)

In [33]:
import statsmodels.api as sm

In [34]:
GSE43358_result= pd.DataFrame({'Probe': [], 'p-value': [], 'q_value': [], 'DEG': []})
alpha = 0.05

In [35]:
#perform Mann Whiteney U-test (GSE76275)

U1, p = mannwhitneyu(TNBC, not_TNBC)
cols = TNBC.columns
rejected, q_value = fdrcorrection(p) # Benjaminin-HochBerg (FDR)

In [41]:
# Filter at p_adjusted lesser than 0.05
for i, col in enumerate(cols):
    #print(f'{col}: t = {t[i]:.5f}, with p-value = {p[i]:.5f}')
    if q_value[i] <= alpha:
    #    print ('We reject the null hypothesis H0. So, this gene is significantly differentially expressed.')
        a = 1
    else:
    #    print ('We do not reject the null hypothesis H0. So, this gene is  not significantly differentially expressed.')
        a = 0
    GSE43358_result= GSE43358_result.append({'Probe':col, 'q_value': q_value[i], 'p-value':p[i], 'DEG':a},ignore_index=True)
    if i % 1000 == 0:
        print(f"Finished processing {i+1} records")

Finished processing 1 records
Finished processing 1001 records
Finished processing 2001 records
Finished processing 3001 records
Finished processing 4001 records
Finished processing 5001 records
Finished processing 6001 records
Finished processing 7001 records
Finished processing 8001 records
Finished processing 9001 records
Finished processing 10001 records
Finished processing 11001 records
Finished processing 12001 records
Finished processing 13001 records
Finished processing 14001 records
Finished processing 15001 records
Finished processing 16001 records
Finished processing 17001 records
Finished processing 18001 records
Finished processing 19001 records
Finished processing 20001 records
Finished processing 21001 records
Finished processing 22001 records
Finished processing 23001 records
Finished processing 24001 records
Finished processing 25001 records
Finished processing 26001 records
Finished processing 27001 records
Finished processing 28001 records
Finished processing 29001 r

In [42]:
GSE43358_result

Unnamed: 0,Probe,p-value,q_value,DEG
0,1007_s_at,0.344273,1.00000,0.0
1,1053_at,0.003074,0.09727,0.0
2,117_at,0.539498,1.00000,0.0
3,121_at,1.000000,1.00000,0.0
4,1255_g_at,1.000000,1.00000,0.0
...,...,...,...,...
54670,AFFX-ThrX-5_at,0.761706,1.00000,0.0
54671,AFFX-ThrX-M_at,0.675973,1.00000,0.0
54672,AFFX-TrpnX-3_at,1.000000,1.00000,0.0
54673,AFFX-TrpnX-5_at,1.000000,1.00000,0.0


# Mann Whitney U-test DEGs (GSE43358)

In [43]:
# Rank and sort statistical test result with p-value
GSE43358_result_DEG = GSE43358_result[GSE43358_result['DEG'] == 1]

In [44]:
GSE43358_result_DEG = GSE43358_result_DEG.set_index('Probe')

In [45]:
GSE43358_result_DEG

Unnamed: 0_level_0,p-value,q_value,DEG
Probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1552611_a_at,2.412969e-05,0.004472,1.0
1552619_a_at,3.178753e-09,0.000058,1.0
1552643_at,4.048794e-04,0.025562,1.0
1553589_a_at,3.257750e-04,0.023162,1.0
1553613_s_at,3.281262e-06,0.001646,1.0
...,...,...,...
41660_at,2.654521e-05,0.004755,1.0
43977_at,2.269013e-04,0.018137,1.0
48808_at,8.325236e-04,0.041875,1.0
55081_at,3.536708e-04,0.024425,1.0


In [47]:
import os  
os.makedirs('dataframe_files/GFS_results/', exist_ok=True)  
GSE43358_result_DEG.to_csv('dataframe_files/GFS_results/GSE43358_DEG.csv')