In [5]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
deleterious_df = pd.read_csv('deleterious/deleterious.tsv', sep='\t')
v617f_df = pd.read_csv('V617F/v617f.tsv', sep='\t')
mean_expression_df = pd.read_csv('baseline/gene_mean_expression.tsv', sep='\t')

# Display the first few rows to confirm data loading
print(deleterious_df.head())
print(v617f_df.head())
print(mean_expression_df.head())


                                                                                                                                                                                                                                                                                 gene_id  \
ENSG00000000003.15 0.0224  0.1596  0.0436  0.0105  0.0114  0.0259  22.7063  27.1982  0.0392  0.0466  0.3382  42.8696 0.2622  0.0600  0.0140  0.5801  0.0965  0.0066  5.0034  47.9463  28.2967  0.0584  0.0114  0.0402  0.0227  0.1652  52.0545 0.0458  19.2818 2.0081  0.1974    42.3898   
ENSG00000000005.6  0.0000  0.0000  0.0670  0.0000  0.0000  0.0000  1.1606   0.7978   0.0000  0.0000  0.0000  1.0352  0.0000  0.0000  0.0431  0.0232  0.0000  0.0000  4.2876  0.0753   0.5936   0.0000  0.0000  0.0000  0.0000  0.0000  1.0370  0.0000  0.6548  7.2733  0.0000     0.2202   
ENSG00000000419.13 41.8830 57.9216 60.0189 53.9261 69.5277 59.3909 111.2332 112.6444 48.2843 46.6481 23.7400 95.7022 71.0551 71.6603 37.8289 80.1236

In [6]:
# Extract gene IDs and corresponding expression values
deleterious_genes = deleterious_df.set_index('gene_id')
v617f_genes = v617f_df.set_index('gene_id')
mean_expression = mean_expression_df.set_index('gene_id')

# Extract the average expression columns for comparisons
deleterious_avg = deleterious_genes['average']
v617f_avg = v617f_genes['average']


In [7]:
# Calculate Fold Change: Deleterious vs Mean, V617F vs Mean
deleterious_fold_change = deleterious_avg / mean_expression['trimmed_mean_expression']
v617f_fold_change = v617f_avg / mean_expression['trimmed_mean_expression']

# Perform t-tests between deleterious and mean expression, and v617f and mean expression
p_values_deleterious = []
p_values_v617f = []

for gene in mean_expression.index:
    deleterious_values = deleterious_genes.loc[gene].drop('average').values
    v617f_values = v617f_genes.loc[gene].drop('average').values
    
    # Using trimmed mean expression as the control
    mean_value = mean_expression.loc[gene, 'trimmed_mean_expression']
    
    # Perform t-test (comparing deleterious/v617f to the mean expression)
    t_stat_deleterious, p_val_deleterious = ttest_ind(deleterious_values, [mean_value]*len(deleterious_values))
    t_stat_v617f, p_val_v617f = ttest_ind(v617f_values, [mean_value]*len(v617f_values))
    
    p_values_deleterious.append(p_val_deleterious)
    p_values_v617f.append(p_val_v617f)

# Convert to numpy arrays for easier handling
p_values_deleterious = np.array(p_values_deleterious)
p_values_v617f = np.array(p_values_v617f)


KeyError: 'ENSG00000000003.15'