# Step 1: Gene filtering (by majority rule and KS test)

Among the replicates of each cell type sample, there are 4 possibilities when it comes to the expression of the replicates. These possibilities are illustrated in the table below. 

|gene| replicate1  |   replicate2 |   replicate3 |
|---|---|---|---|
| A | 0 | 0 | X |
| B | 0 | X | Y |
| C | 0 | 0 | 0 | 
| D | X | Y | Z | 


First, we will filter the genes based on the **majority rule**, i.e. if the gene has 2 replicates with expression 0 (case A), we consider that this gene is not expressed in this cell type condition. Next, in the case where there is one zero replicate (case B), we will compute the mean of the two non-zero values and consider that mean the expression of the gene in the given cell-type condition. In the case where all three replicates are 0 (case C), the gene will be considered as not expressed, while if all the replicates are different from zero (case D) we will compute the mean of all three values.


After this filtering is done, we proceed by filtering by the **KS statistical test**.

In [1]:
import pandas as pd
import itertools
import numpy as np
from pandas import DataFrame
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

## 0 preprocessing the dataframe

In [2]:
# Load the expression table; its unnamed first column is renamed to geneID.
df_orig = pd.read_csv("rpkm.tsv", sep="\t")
df1 = df_orig.rename(columns={"Unnamed: 0": "geneID"})
# Keep only the genes with at least one non-zero value in any replicate column.
# `axis` is passed by keyword: newer pandas releases no longer accept it positionally.
df = df1.loc[(df1.drop(columns=['geneID']) != 0).any(axis=1)]
df = df.reset_index(drop=True)
df

Unnamed: 0,geneID,G7F_1,G7F_2,G7F_3,G7FNSen_1,G7FNSen_2,G7FNSen_3,G7FSen_1,G7FSen_2,G7FSen_3,...,Y3MSen_2,Y3MSen_3,Y3S_1,Y3S_3,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3,Y3SSen_1,Y3SSen_2,Y3SSen_3
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.089915,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.000000,0.015788,0.000000,0.00000,0.000000,0.149587,0.010725,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ENSMUSG00000102851,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.337747,0.00000,0.000000,0.291033,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.379319
4,ENSMUSG00000104017,0.113753,0.000000,0.000000,0.043085,0.038762,0.00000,0.043893,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38356,ENSMUSG00000095134,0.356184,0.431285,0.425738,0.308364,0.138711,0.55575,0.628293,1.606954,2.094807,...,2.260534,2.603673,0.000000,0.05487,0.327509,0.000000,0.055762,0.093670,0.290747,0.631334
38357,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.295010,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
38358,ENSMUSG00000096768,12.090350,10.192970,13.033180,9.672305,5.621485,10.28435,8.894480,9.813282,3.256269,...,6.072379,2.825208,8.710045,13.58579,4.581859,4.040346,1.485923,7.644267,8.577849,6.058498
38359,ENSMUSG00000099871,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [3]:
def compute_mean(df, *args):
    """
    Compute the majority-rule mean of 2 or 3 replicate columns for every row.

    A row is considered expressed when at most one of its replicates is zero;
    its value is then the mean of the non-zero replicates. Rows with two or
    more zero replicates get the value 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per gene and one column per replicate.
    *args : str
        Names of the 2 or 3 replicate columns to combine.

    Returns
    -------
    list of float
        One value per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If the number of column names is not 2 or 3.
    """
    if len(args) not in (2, 3):
        raise ValueError(
            "compute_mean expects 2 or 3 column names, got %d" % len(args)
        )

    # Work on the raw values so the result does not depend on the index labels
    # (a label-based lookup per row breaks when labels are duplicated).
    values = df[list(args)].to_numpy(dtype=float)
    nonzero_count = (values != 0).sum(axis=1)
    zero_count = len(args) - nonzero_count

    # Zero entries contribute nothing to the sum, so dividing the row total by
    # the number of non-zero entries gives the mean of the non-zero replicates.
    totals = values.sum(axis=1)
    expressed = zero_count <= 1

    means = np.zeros(len(values), dtype=float)
    # Expressed rows always have at least one non-zero value, so no division by zero.
    means[expressed] = totals[expressed] / nonzero_count[expressed]
    return means.tolist()

In [4]:
def get_mean(df):
    """
    Insert a mean column after every triplet of replicate columns.

    The first column of ``df`` is skipped (it holds the gene name). The
    remaining columns are taken three at a time, in order, as the replicates
    of one sample (e.g. G7S_1, G7S_2, G7S_3). For each triplet the result of
    ``compute_mean`` is inserted directly after the third replicate, in a
    column named after the first replicate with its last character replaced
    by 'mean' (G7S_1 -> G7S_mean).

    The dataframe is modified in place and is also returned.

    Raises
    ------
    ValueError
        If the number of columns after the first one is not a multiple of 3.
    """
    #because first column is the gene name that we don't need for parsing right now
    replicate_cols = list(df)[1:]
    if len(replicate_cols) % 3 != 0:
        raise ValueError(
            "expected replicate columns in groups of 3, got %d columns"
            % len(replicate_cols)
        )

    #store elements in a list of lists
    triplets = [replicate_cols[i:i + 3] for i in range(0, len(replicate_cols), 3)]

    for position, triplet in enumerate(triplets, start=1):
        mean = compute_mean(df, triplet[0], triplet[1], triplet[2])
        # Counting the gene column and the means inserted so far, the mean of
        # the k-th triplet belongs at column position 4*k.
        df.insert(loc=4 * position, column=str(triplet[0][:-1]) + 'mean', value=mean)
    return df
    

In [5]:
def get_mean2(df):
    """
    Add the mean of sample Y3S, which has 2 replicates instead of 3.

    The result of ``compute_mean`` for columns 'Y3S_1' and 'Y3S_3' is stored
    in column 'Y3S_mean' of ``df``; the same dataframe is returned.
    """
    replicate_columns = ('Y3S_1', 'Y3S_3')
    y3s_means = compute_mean(df, *replicate_columns)
    df["Y3S_mean"] = y3s_means

    return df
    

In [9]:
# Set aside the two Y3S replicate columns; they are handled separately in df_2rep.
df_3rep = df.drop(['Y3S_1', 'Y3S_3'], axis="columns")
df_3rep

Unnamed: 0,geneID,G7F_1,G7F_2,G7F_3,G7FNSen_1,G7FNSen_2,G7FNSen_3,G7FSen_1,G7FSen_2,G7FSen_3,...,Y3MNSen_3,Y3MSen_1,Y3MSen_2,Y3MSen_3,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3,Y3SSen_1,Y3SSen_2,Y3SSen_3
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.089915,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.000000,0.015788,0.000000,0.00000,0.000000,0.149587,0.010725,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ENSMUSG00000102851,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.337747,0.00000,0.000000,0.291033,0.000000,...,0.042689,0.130653,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.379319
4,ENSMUSG00000104017,0.113753,0.000000,0.000000,0.043085,0.038762,0.00000,0.043893,0.000000,0.000000,...,0.053892,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38356,ENSMUSG00000095134,0.356184,0.431285,0.425738,0.308364,0.138711,0.55575,0.628293,1.606954,2.094807,...,1.735696,2.065848,2.260534,2.603673,0.327509,0.000000,0.055762,0.093670,0.290747,0.631334
38357,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.295010,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
38358,ENSMUSG00000096768,12.090350,10.192970,13.033180,9.672305,5.621485,10.28435,8.894480,9.813282,3.256269,...,69.699630,6.444352,6.072379,2.825208,4.581859,4.040346,1.485923,7.644267,8.577849,6.058498
38359,ENSMUSG00000099871,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [10]:
# Take an independent copy of the two Y3S replicate columns (rather than a
# selection of the original dataframe) so that a column can be added to it.
df_2rep = df.loc[:, ['Y3S_1', 'Y3S_3']].copy()
df_2rep

Unnamed: 0,Y3S_1,Y3S_3
0,0.000000,0.00000
1,0.000000,0.00000
2,0.000000,0.00000
3,0.000000,0.00000
4,0.000000,0.00000
...,...,...
38356,0.000000,0.05487
38357,0.000000,0.00000
38358,8.710045,13.58579
38359,0.000000,0.00000


In [11]:
# Add the Y3S_mean column computed from the two Y3S replicates.
df_2rep = get_mean2(df_2rep)
df_2rep

Unnamed: 0,Y3S_1,Y3S_3,Y3S_mean
0,0.000000,0.00000,0.000000
1,0.000000,0.00000,0.000000
2,0.000000,0.00000,0.000000
3,0.000000,0.00000,0.000000
4,0.000000,0.00000,0.000000
...,...,...,...
38356,0.000000,0.05487,0.054870
38357,0.000000,0.00000,0.000000
38358,8.710045,13.58579,11.147917
38359,0.000000,0.00000,0.000000


In [12]:
# Insert a mean column after each triplet of replicate columns.
# NOTE: get_mean inserts the columns into df_3rep in place, so this cell should
# only be run once per freshly created df_3rep.
df_3rep = get_mean(df_3rep)
df_3rep

Unnamed: 0,geneID,G7F_1,G7F_2,G7F_3,G7F_mean,G7FNSen_1,G7FNSen_2,G7FNSen_3,G7FNSen_mean,G7FSen_1,...,Y3MSen_3,Y3MSen_mean,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3,Y3SNSen_mean,Y3SSen_1,Y3SSen_2,Y3SSen_3,Y3SSen_mean
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.000000,0.089915,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.000000,0.000000,0.015788,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ENSMUSG00000102851,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.000000,0.337747,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.379319,0.000000
4,ENSMUSG00000104017,0.113753,0.000000,0.000000,0.000000,0.043085,0.038762,0.00000,0.040924,0.043893,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38356,ENSMUSG00000095134,0.356184,0.431285,0.425738,0.404402,0.308364,0.138711,0.55575,0.334275,0.628293,...,2.603673,2.310018,0.327509,0.000000,0.055762,0.191635,0.093670,0.290747,0.631334,0.338584
38357,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
38358,ENSMUSG00000096768,12.090350,10.192970,13.033180,11.772167,9.672305,5.621485,10.28435,8.526047,8.894480,...,2.825208,5.113980,4.581859,4.040346,1.485923,3.369376,7.644267,8.577849,6.058498,7.426871
38359,ENSMUSG00000099871,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [13]:
# Append the Y3S columns (two replicates and their mean) to the three-replicate
# table; join matches the rows of the two dataframes on their index.
df_together = df_3rep.join(df_2rep)
df_together

Unnamed: 0,geneID,G7F_1,G7F_2,G7F_3,G7F_mean,G7FNSen_1,G7FNSen_2,G7FNSen_3,G7FNSen_mean,G7FSen_1,...,Y3SNSen_2,Y3SNSen_3,Y3SNSen_mean,Y3SSen_1,Y3SSen_2,Y3SSen_3,Y3SSen_mean,Y3S_1,Y3S_3,Y3S_mean
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.000000,0.089915,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.000000,0.000000,0.015788,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
2,ENSMUSG00000102851,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
3,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.000000,0.337747,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.379319,0.000000,0.000000,0.00000,0.000000
4,ENSMUSG00000104017,0.113753,0.000000,0.000000,0.000000,0.043085,0.038762,0.00000,0.040924,0.043893,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38356,ENSMUSG00000095134,0.356184,0.431285,0.425738,0.404402,0.308364,0.138711,0.55575,0.334275,0.628293,...,0.000000,0.055762,0.191635,0.093670,0.290747,0.631334,0.338584,0.000000,0.05487,0.054870
38357,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
38358,ENSMUSG00000096768,12.090350,10.192970,13.033180,11.772167,9.672305,5.621485,10.28435,8.526047,8.894480,...,4.040346,1.485923,3.369376,7.644267,8.577849,6.058498,7.426871,8.710045,13.58579,11.147917
38359,ENSMUSG00000099871,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000


## 1 Filtering by Majority rule

In [14]:
# Keep only the columns whose name contains 'mean' or 'geneID'.
df_mean = df_together.filter(regex='mean|geneID')
df_mean

Unnamed: 0,geneID,G7F_mean,G7FNSen_mean,G7FSen_mean,G7M_mean,G7MNSen_mean,G7MSen_mean,G7S_mean,G7SNSen_mean,G7SSen_mean,...,G3SSen_mean,Y3F_mean,Y3FNSen_mean,Y3FSen_mean,Y3M_mean,Y3MNSen_mean,Y3MSen_mean,Y3SNSen_mean,Y3SSen_mean,Y3S_mean
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.186836,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.080156,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.261546,0.000000,0.000000,0.092896,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ENSMUSG00000102851,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.220484,0.228021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ENSMUSG00000104017,0.000000,0.040924,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.171770,...,0.777498,0.000000,0.174468,0.233775,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38356,ENSMUSG00000095134,0.404402,0.334275,1.443351,1.716083,0.921423,2.148395,0.178802,0.119666,0.230058,...,1.453814,0.536784,1.080802,1.353583,8.453523,1.828105,2.310018,0.191635,0.338584,0.054870
38357,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.377885,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
38358,ENSMUSG00000096768,11.772167,8.526047,7.321344,9.974112,10.448339,18.351263,7.153871,5.771331,8.497301,...,10.014858,6.991303,3.271075,5.317420,22.312437,43.878690,5.113980,3.369376,7.426871,11.147917
38359,ENSMUSG00000099871,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# Keep the genes whose mean is non-zero in at least one cell-type condition;
# genes whose means are zero everywhere are dropped.
# `axis` is passed by keyword: newer pandas releases no longer accept it positionally.
df_nonzero_means = df_mean.loc[(df_mean.drop(columns=['geneID']) != 0).any(axis=1)]
df_nonzero_means

Unnamed: 0,geneID,G7F_mean,G7FNSen_mean,G7FSen_mean,G7M_mean,G7MNSen_mean,G7MSen_mean,G7S_mean,G7SNSen_mean,G7SSen_mean,...,G3SSen_mean,Y3F_mean,Y3FNSen_mean,Y3FSen_mean,Y3M_mean,Y3MNSen_mean,Y3MSen_mean,Y3SNSen_mean,Y3SSen_mean,Y3S_mean
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.186836,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.080156,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.261546,0.000000,0.000000,0.092896,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.220484,0.228021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ENSMUSG00000104017,0.000000,0.040924,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.171770,...,0.777498,0.000000,0.174468,0.233775,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,ENSMUSG00000103025,0.000000,0.000000,0.546004,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38348,ENSMUSG00000100637,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.100260,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
38351,ENSMUSG00000102045,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.114366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
38356,ENSMUSG00000095134,0.404402,0.334275,1.443351,1.716083,0.921423,2.148395,0.178802,0.119666,0.230058,...,1.453814,0.536784,1.080802,1.353583,8.453523,1.828105,2.310018,0.191635,0.338584,0.054870
38357,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.377885,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [16]:
# Keep the replicate rows of the genes that are present in df_nonzero_means.
df_nonzero_reps = df.loc[df['geneID'].isin(df_nonzero_means['geneID'])]
df_nonzero_reps

Unnamed: 0,geneID,G7F_1,G7F_2,G7F_3,G7FNSen_1,G7FNSen_2,G7FNSen_3,G7FSen_1,G7FSen_2,G7FSen_3,...,Y3MSen_2,Y3MSen_3,Y3S_1,Y3S_3,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3,Y3SSen_1,Y3SSen_2,Y3SSen_3
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.089915,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.000000,0.015788,0.000000,0.00000,0.000000,0.149587,0.010725,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.337747,0.00000,0.000000,0.291033,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.379319
4,ENSMUSG00000104017,0.113753,0.000000,0.000000,0.043085,0.038762,0.00000,0.043893,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,ENSMUSG00000103025,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.157918,0.934090,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.073121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38348,ENSMUSG00000100637,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
38351,ENSMUSG00000102045,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.062580,0.000000,0.000000
38356,ENSMUSG00000095134,0.356184,0.431285,0.425738,0.308364,0.138711,0.55575,0.628293,1.606954,2.094807,...,2.260534,2.603673,0.000000,0.05487,0.327509,0.000000,0.055762,0.093670,0.290747,0.631334
38357,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.295010,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [17]:
# Save both tables without their index.
# NOTE: the files are tab-separated despite the .csv extension, so they have to
# be read back with sep="\t".
df_nonzero_means.to_csv("df_nonzero_means.csv", sep = "\t", index = False)
df_nonzero_reps.to_csv("df_nonzero_reps.csv", sep = "\t", index = False)

We have now obtained a dataframe with 28603 genes. Now, the goal is to filter out the genes that have only a few non-zero conditions, as well as genes that have almost constant expression among the conditions.

The dataframe df_nonzero_means contains only the means of the replicates for each sample, while df_nonzero_reps contains only the replicates of each sample.

## 2 Filtering by Kolmogorov-Smirnov test

We can import directly the dataframes df_nonzero_means and df_nonzero_reps without executing the sections 0 and 1 again. 

In [6]:
# Load the means and the replicates tables from disk (tab-separated files
# despite the .csv extension).
df_nonzero_means = pd.read_csv("df_nonzero_means.csv", sep="\t")
df_nonzero_reps = pd.read_csv("df_nonzero_reps.csv", sep="\t")

In [7]:
# put all sen first, all non-sen and basal last
# Senescent replicates: every column whose name contains FSen, SSen or MSen.
dsen = df_nonzero_reps.filter(regex='FSen|SSen|MSen')

In [8]:
# Remaining replicates: columns whose name contains FNSen, SNSen or MNSen
# (non-senescent) or F_, S_ or M_ (presumably the basal samples).
dnsen = df_nonzero_reps.filter(regex='F_|S_|M_|FNSen|SNSen|MNSen')

In [9]:
# Senescent columns first, then all the others: kstest_function splits each
# row by position and relies on this column order.
dtest = dsen.join(dnsen)
dtest

Unnamed: 0,G7FSen_1,G7FSen_2,G7FSen_3,G7MSen_1,G7MSen_2,G7MSen_3,G7SSen_1,G7SSen_2,G7SSen_3,Y7FSen_1,...,Y3M_2,Y3M_3,Y3MNSen_1,Y3MNSen_2,Y3MNSen_3,Y3S_1,Y3S_3,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3
0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.040192,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
1,0.000000,0.149587,0.010725,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
2,0.000000,0.291033,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.042689,0.000000,0.00000,0.000000,0.000000,0.000000
3,0.043893,0.000000,0.000000,0.000000,0.00000,0.000000,0.320607,0.022932,0.000000,0.056587,...,0.000000,0.00000,0.000000,0.000000,0.053892,0.000000,0.00000,0.000000,0.000000,0.000000
4,0.000000,0.157918,0.934090,0.000000,0.19850,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28598,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
28599,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
28600,0.628293,1.606954,2.094807,1.978322,2.05664,2.410222,0.573650,0.082064,0.034460,0.202498,...,6.781567,10.79994,1.159785,2.588835,1.735696,0.000000,0.05487,0.327509,0.000000,0.055762
28601,0.000000,0.295010,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000


In [20]:
def kstest_function(series, n_sen=36):
    """
    Two-sample Kolmogorov-Smirnov test between the senescent replicates of one
    gene and all its other replicates.

    Parameters
    ----------
    series : pandas.Series
        One row of a dataframe that has the sen columns first and the nsen and
        basal columns after them (like the test df).
    n_sen : int, default 36
        Number of leading values of ``series`` that are senescent replicates;
        everything after them forms the second sample.

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.ks_2samp``. The function does
        not modify the dataframe; the caller stores the value.
    """
    sen = series.iloc[:n_sen].to_numpy()
    nsen = series.iloc[n_sen:].to_numpy()
    # ks_2samp returns (statistic, pvalue); only the p-value is needed.
    return stats.ks_2samp(sen, nsen)[1]

In [11]:
# Run the KS test on every row. A "pval" column left over from an earlier run
# of this cell is dropped first so it is never fed into the test as a replicate.
dtest["pval"] = dtest.drop(columns="pval", errors="ignore").apply(kstest_function, axis=1)

In [12]:
# Replicate table with the KS p-value stored in the "pval" column.
dtest

Unnamed: 0,G7FSen_1,G7FSen_2,G7FSen_3,G7MSen_1,G7MSen_2,G7MSen_3,G7SSen_1,G7SSen_2,G7SSen_3,Y7FSen_1,...,Y3M_3,Y3MNSen_1,Y3MNSen_2,Y3MNSen_3,Y3S_1,Y3S_3,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3,pval
0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.040192,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.787789
1,0.000000,0.149587,0.010725,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.207718
2,0.000000,0.291033,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.042689,0.000000,0.00000,0.000000,0.000000,0.000000,0.957190
3,0.043893,0.000000,0.000000,0.000000,0.00000,0.000000,0.320607,0.022932,0.000000,0.056587,...,0.00000,0.000000,0.000000,0.053892,0.000000,0.00000,0.000000,0.000000,0.000000,0.961734
4,0.000000,0.157918,0.934090,0.000000,0.19850,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.448116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28598,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.987965
28599,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.952338
28600,0.628293,1.606954,2.094807,1.978322,2.05664,2.410222,0.573650,0.082064,0.034460,0.202498,...,10.79994,1.159785,2.588835,1.735696,0.000000,0.05487,0.327509,0.000000,0.055762,0.000121
28601,0.000000,0.295010,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.988595


In [104]:
# KS p-value of every row of dtest.
dtest["pval"]

0        0.787789
1        0.207718
2        0.957190
3        0.961734
4        0.448116
           ...   
28598    0.987965
28599    0.952338
28600    0.000121
28601    0.988595
28602    0.854600
Name: pval, Length: 28603, dtype: float64

In [105]:
# Number of rows with a p-value at or below 0.05 (True) versus above it (False).
(dtest['pval'] <= 0.05).value_counts()

False    21944
True      6659
Name: pval, dtype: int64

In [106]:
# Bonferroni-corrected threshold: 0.05 divided by the number of genes tested,
# computed from the table instead of being pasted in as a literal.
(dtest['pval'] <= 0.05 / len(dtest)).value_counts()

False    28461
True       142
Name: pval, dtype: int64

In [59]:
# Select the genes at or below the Bonferroni-corrected threshold (0.05 divided
# by the number of genes tested). Computing it here avoids a truncated literal
# that could select fewer genes than the full-precision threshold counts.
d_new = dtest.loc[dtest['pval'] <= 0.05 / len(dtest)]

In [60]:
# Rows of dtest that passed the p-value cut-off.
d_new

Unnamed: 0,G7FSen_1,G7FSen_2,G7FSen_3,G7MSen_1,G7MSen_2,G7MSen_3,G7SSen_1,G7SSen_2,G7SSen_3,Y7FSen_1,...,Y3M_3,Y3MNSen_1,Y3MNSen_2,Y3MNSen_3,Y3S_1,Y3S_3,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3,pval
198,41.605348,22.424473,19.442875,19.538764,21.710699,8.776077,24.846800,17.401286,7.725258,53.926559,...,18.707950,37.261751,40.063626,17.624399,8.961181,0.078349,25.346930,23.942590,29.221734,4.746531e-07
223,0.209319,0.486696,0.418738,0.109848,0.342590,0.642383,0.222967,0.410100,0.413294,0.337316,...,0.412892,0.064398,0.046621,0.000000,0.763940,1.316167,0.523734,0.499190,0.594475,1.736938e-07
332,53.868026,37.921313,39.384965,31.888645,24.465477,21.655241,46.122475,26.873790,17.263553,59.301656,...,12.397101,10.356818,36.541529,22.680624,16.737791,7.132162,32.662999,37.677578,44.675259,1.446713e-07
404,8.529271,13.151802,14.650095,11.116139,18.018765,14.438374,10.750335,20.903849,19.416092,12.124321,...,14.777177,11.489677,9.400971,16.872661,34.493303,23.217578,17.301510,24.912220,22.435032,1.339418e-06
512,42.222603,56.378512,68.214869,41.185298,57.749654,55.159112,42.097552,73.276281,63.585467,59.617546,...,64.401794,76.486666,71.692078,75.126180,153.875500,140.670453,56.481799,72.509491,54.583119,7.092105e-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27813,0.000000,0.786209,0.010066,0.000000,0.112943,0.023163,0.284837,0.110413,3.993895,0.184877,...,0.000000,0.000000,0.000000,0.018534,0.000000,0.000000,0.000000,0.000000,0.000000,3.435972e-07
27816,129.323600,74.757880,80.325290,119.386900,70.379010,60.941250,42.316710,27.787190,19.321950,110.706800,...,48.744510,54.698710,59.849670,51.697130,22.475760,30.412210,31.948610,28.071370,33.132020,3.967961e-08
27867,44.554870,31.553460,34.380410,24.970190,20.374190,19.657030,50.919540,33.703390,20.753860,36.886770,...,23.166670,10.914060,26.964390,14.302720,6.300486,8.630346,34.723780,30.023070,37.631460,3.709697e-07
28166,9.840731,5.805248,5.718354,8.764547,6.705446,9.556261,19.749630,15.218390,18.332270,3.298522,...,3.739647,2.941050,9.693953,0.724949,4.061382,3.378682,15.922450,18.378840,21.000990,7.414996e-08


The d_new dataframe contains the 142 genes selected by the KS test. However, this dataframe contains the individual replicates; now we want to extract the rows with the same indices from the df_nonzero_means dataframe.

In [64]:
# Table of per-sample means from which the selected rows are taken next.
df_nonzero_means

Unnamed: 0,geneID,G7F_mean,G7FNSen_mean,G7FSen_mean,G7M_mean,G7MNSen_mean,G7MSen_mean,G7S_mean,G7SNSen_mean,G7SSen_mean,...,G3SSen_mean,Y3F_mean,Y3FNSen_mean,Y3FSen_mean,Y3M_mean,Y3MNSen_mean,Y3MSen_mean,Y3SNSen_mean,Y3SSen_mean,Y3S_mean
0,ENSMUSG00000102693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.186836,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,ENSMUSG00000051951,0.000000,0.000000,0.080156,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.261546,0.000000,0.000000,0.092896,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ENSMUSG00000103377,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.220484,0.228021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ENSMUSG00000104017,0.000000,0.040924,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.171770,...,0.777498,0.000000,0.174468,0.233775,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ENSMUSG00000103025,0.000000,0.000000,0.546004,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28598,ENSMUSG00000100637,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.100260,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
28599,ENSMUSG00000102045,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.114366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
28600,ENSMUSG00000095134,0.404402,0.334275,1.443351,1.716083,0.921423,2.148395,0.178802,0.119666,0.230058,...,1.453814,0.536784,1.080802,1.353583,8.453523,1.828105,2.310018,0.191635,0.338584,0.054870
28601,ENSMUSG00000095366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.377885,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [67]:
# Pick the rows of df_nonzero_means that carry the same index labels as d_new.
# NOTE(review): this assumes df_nonzero_means and the replicate table behind
# d_new use the same index label for the same gene - confirm before reordering
# or re-indexing either table.
df_filteredKS = df_nonzero_means.loc[d_new.index, :]

In [68]:
# Rows of df_nonzero_means selected through the index of d_new.
df_filteredKS

Unnamed: 0,geneID,G7F_mean,G7FNSen_mean,G7FSen_mean,G7M_mean,G7MNSen_mean,G7MSen_mean,G7S_mean,G7SNSen_mean,G7SSen_mean,...,G3SSen_mean,Y3F_mean,Y3FNSen_mean,Y3FSen_mean,Y3M_mean,Y3MNSen_mean,Y3MSen_mean,Y3SNSen_mean,Y3SSen_mean,Y3S_mean
198,ENSMUSG00000026154,9.374707,12.772968,27.824232,5.671751,11.260086,16.675180,0.000000,6.299948,16.657781,...,45.174186,21.242280,26.628339,33.862152,21.526737,31.649925,47.899333,26.170418,48.272665,4.519765
223,ENSMUSG00000101372,1.444587,1.111497,0.371584,0.699926,0.330705,0.364940,1.624852,0.486327,0.348787,...,0.114060,0.582884,0.373457,0.367516,0.367600,0.055509,0.210572,0.539133,0.318045,1.040053
332,ENSMUSG00000026087,22.069027,25.173005,43.724768,16.839861,20.386269,26.003121,20.099141,31.424203,30.086606,...,32.878457,19.962933,27.788901,33.128654,15.660406,23.192990,32.742092,38.338612,40.461611,11.934976
404,ENSMUSG00000041763,18.230442,11.950132,12.110390,28.941241,17.719509,14.524426,27.617458,16.755259,17.023425,...,11.825596,12.635614,12.746935,11.025465,15.401941,12.587769,12.309141,21.549588,12.849492,28.855441
512,ENSMUSG00000025982,112.095088,84.476640,55.605328,83.438708,65.120788,51.364688,120.530328,74.634990,59.653100,...,40.601267,87.341269,63.008648,46.958564,61.851565,74.434975,44.904032,61.191470,50.753543,147.272977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27813,ENSMUSG00000031385,0.000000,0.367214,0.398137,0.000000,0.000000,0.068053,0.000000,0.000000,1.463048,...,0.326122,0.000000,0.263454,0.184928,0.000000,0.000000,0.273452,0.000000,0.247827,0.000000
27816,ENSMUSG00000002014,36.531070,61.966370,94.802257,64.016520,57.620877,83.569053,31.541450,29.319517,29.808617,...,85.980737,61.723820,126.512300,155.044933,50.799810,55.415170,88.717490,31.050667,58.336110,26.443985
27867,ENSMUSG00000031198,22.257823,26.709863,36.829580,13.182433,13.808850,21.667137,8.334765,25.366677,35.125597,...,38.596550,38.528550,35.038627,39.704690,20.947253,17.393723,43.456097,34.126103,43.358643,7.465416
28166,ENSMUSG00000025525,1.787333,2.887403,7.121444,3.833947,3.837358,8.342085,3.408360,8.749270,17.766763,...,18.968587,2.519863,12.714293,8.579726,4.221943,4.453317,18.298710,18.434093,41.276830,3.720032


In [69]:
# Persist the filtered table as tab-separated text; the row index is not written.
df_filteredKS.to_csv(
    "df_filteredKS.csv",
    sep="\t",
    index=False,
)

## 3 Randomly resampling the samples from the KS test

In [6]:
# Read the tab-separated replicate table back from disk and drop the geneID
# column so that only the replicate columns take part in the resampling.
df_nonzero_reps = pd.read_csv("df_nonzero_reps.csv", sep="\t")
df_resampling = df_nonzero_reps.drop(columns="geneID")
df_resampling

Unnamed: 0,G7F_1,G7F_2,G7F_3,G7FNSen_1,G7FNSen_2,G7FNSen_3,G7FSen_1,G7FSen_2,G7FSen_3,G7M_1,...,Y3MSen_2,Y3MSen_3,Y3S_1,Y3S_3,Y3SNSen_1,Y3SNSen_2,Y3SNSen_3,Y3SSen_1,Y3SSen_2,Y3SSen_3
0,0.000000,0.000000,0.000000,0.089915,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.015788,0.000000,0.00000,0.000000,0.149587,0.010725,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.337747,0.00000,0.000000,0.291033,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.379319
3,0.113753,0.000000,0.000000,0.043085,0.038762,0.00000,0.043893,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.157918,0.934090,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.073121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28598,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
28599,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.062580,0.000000,0.000000
28600,0.356184,0.431285,0.425738,0.308364,0.138711,0.55575,0.628293,1.606954,2.094807,2.126982,...,2.260534,2.603673,0.000000,0.05487,0.327509,0.000000,0.055762,0.093670,0.290747,0.631334
28601,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.295010,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [99]:
# Fixed seed so the random column split (and every result derived from it) is
# identical on each Restart & Run All; change the value to draw another split.
RESAMPLING_SEED = 42

# Randomly pick 36 columns for the first group; all remaining columns form the
# second group. Note that Index.difference returns the remaining labels sorted.
# NOTE(review): the name df_random71 implies 71 columns remain — confirm.
df_random36 = df_resampling.sample(n=36, axis="columns", random_state=RESAMPLING_SEED)
df_random71 = df_resampling[df_resampling.columns.difference(df_random36.columns)]

In [100]:
# Put the two random column groups side by side (the 36 sampled columns come
# first), then apply kstest_function to every row and store the result as "pval".
dtest_random = df_random36.join(df_random71)
dtest_random = dtest_random.assign(
    pval=dtest_random.apply(kstest_function, axis=1)
)

In [101]:
# Count how many rows fall at or below the significance cutoff.
# NOTE(review): the cutoff numerically equals 0.05 / 28603 (28603 rows are
# counted in this cell's output) — presumably a Bonferroni-corrected alpha;
# confirm and consider naming it as a constant.
dtest_random["pval"].le(1.7480683844351993e-06).value_counts()

False    28603
Name: pval, dtype: int64