In [86]:
### We want to find downstream substrates of TORC, GTR, Pib2 proteins.
### T tests across the different conditions will show significant connections.
### We approximate the "null distribution" by randomly scrambling the data.
### The FDR obtained from the scrambled data is used to set the corrected threshold.

import pandas as pd
import os
from scipy.stats import ttest_ind
import statsmodels.stats.multitest as multi
import numpy as np

# Welch's T test
def perform_welchs_ttest(rowdata: pd.Series, threshold: float = 0.05,
                         n_replicates: int = 4, n_conditions: int = 4):
    '''
    Welch's t-test of the first (reference) condition against every other condition.

    Args: 
    rowdata: measurements laid out as consecutive blocks of replicates, one
        block per condition, reference block first. With the defaults this is
        4 treatments in quadruplicate (Treatments: WT, Torc off, Pib2 off, GTR off).
        Entries beyond n_replicates * n_conditions are ignored.
    threshold: for determining T-test significance
    n_replicates: number of replicate measurements per condition
    n_conditions: number of conditions in the row, including the reference
    
    Returns a tuple: (list of p-values, 'True' if any p-value < threshold, 'False' otherwise)
    NOTE: the significance flag is the *string* 'True' / 'False', not a bool.
    e.g. ([0.01, 0.82, 0.765], 'True')

    Raises ValueError if n_replicates < 1 or n_conditions < 2.
    '''
    if n_replicates < 1 or n_conditions < 2:
        raise ValueError("need n_replicates >= 1 and n_conditions >= 2")

    # Cut the row into one float Series per condition.
    block_starts = range(0, n_replicates * n_conditions, n_replicates)
    conditions = [rowdata.iloc[start:start + n_replicates].astype(float)
                  for start in block_starts]
    reference, treatments = conditions[0], conditions[1:]

    # equal_var=False selects Welch's test (no equal-variance assumption).
    pvalues = [float(ttest_ind(reference, treatment, equal_var=False).pvalue)
               for treatment in treatments]

    # A NaN p-value compares False against the threshold, so it never counts as significant.
    significant = any(p < threshold for p in pvalues)

    return pvalues, str(significant)

# Scrambling dataframe
def scramble(df: pd.DataFrame, seed: int):
    '''
    Returns a scrambled dataframe.

    Every cell value of df is pooled and randomly permuted across all rows and
    columns; the result has the same shape and column labels as df, with a
    fresh default row index. df itself is not modified.

    Args:
    df: dataframe whose values are to be shuffled
    seed: seed for the random permutation (same seed -> same result)
    '''
    # Use a private generator rather than np.random.seed(), so calling this
    # function does not reseed the global NumPy RNG used elsewhere.
    # RandomState(seed) yields the same permutation as seed() + shuffle().
    rng = np.random.RandomState(seed)

    # flatten() returns a copy, so the in-place shuffle leaves df untouched
    shuffled_values = df.to_numpy().flatten()
    rng.shuffle(shuffled_values)

    # rebuild with the original (rows, columns) shape and column labels
    scrambled_df = pd.DataFrame(shuffled_values.reshape(df.shape),
                                columns=df.columns)

    return scrambled_df

In [87]:
# Load the Excel workbook into a DataFrame (read_excel reads the first sheet by default).
# The path is relative: always run from the pka_stuff directory.
filepath = './ms_data/230403gtroub2expt_working.xlsx'
full_data = pd.read_excel(filepath)

In [130]:
# Keep all rows and the contiguous run of columns from "WT_SD_1" through
# "GTRKO_SD_4" (label slices with .loc include both endpoints).
# NOTE(review): presumably these are the 16 replicate columns expected by
# perform_welchs_ttest -- confirm against the spreadsheet layout.
ms_subset = full_data.loc[:,"WT_SD_1":"GTRKO_SD_4"]

def count_significant(tup):
    '''
    Return True if a (p-values, flag) result tuple is flagged significant.

    tup[1] is the significance flag. It may be the string 'True' / 'False'
    or an actual boolean; both are handled by comparing the string form.
    '''
    # Equality instead of a substring test: no TypeError on a real bool, and
    # no accidental match on strings that merely contain "True".
    return str(tup[1]) == "True"
n = len(ms_subset)  # total number of rows in the selected data

# One entry per scrambled replicate: the fraction of tested rows that come out
# "significant" even though the values were shuffled.
fdr_x1000 = []
for i in range(0, 1):
    # Only rows 127-128 are scrambled and tested here; the loop index doubles
    # as the seed so each replicate is reproducible.
    scrambled_df = scramble(ms_subset[127:129], seed=i)

    # T-test applied to all rows of data frame
    results = scrambled_df.apply(perform_welchs_ttest, threshold=.05, axis=1)

    # Count false discovery rate
    significant_count = results.apply(count_significant).sum()
    # append, not fdr_x1000[i] = ..., which raises IndexError on an empty list.
    # Divide by the number of rows actually tested, not the size of ms_subset.
    fdr_x1000.append(significant_count / len(scrambled_df))

# np.mean(fdr_x1000)

([0.31550455562083773, 0.5051192739773831, 0.34902668294780137], 'True')
([0.6806487181585053, 0.5952260267658264, 0.5509561669849219], 'True')
2
