In [80]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Input tables: the baseline expression summary plus one table per mutation group.
baseline_file = 'baseline/gene_mean_expression.tsv'
deleterious_file = 'Deleterious/deleterious.tsv'
v617f_file = 'V617F/v617f.tsv'

# Read each whitespace-delimited table, forcing gene_id to be parsed as a string.
# The separator is written as a raw string so `\s` is a regex token instead of
# an invalid string escape (which Python warns about).
baseline_df = pd.read_csv(baseline_file, sep=r'\s+', dtype={'gene_id': str}, engine='python')
deleterious_df = pd.read_csv(deleterious_file, sep=r'\s+', dtype={'gene_id': str}, engine='python')
v617f_df = pd.read_csv(v617f_file, sep=r'\s+', dtype={'gene_id': str}, engine='python')

# Cast every column after the first one to float.
# Columns are replaced by label on purpose: `df.iloc[:, 1:] = ...` writes into
# the existing columns in place on pandas >= 2.0 and so keeps their old dtype,
# which silently turns the cast into a no-op for int/object columns.
for df in [baseline_df, deleterious_df, v617f_df]:
    for column in df.columns[1:]:
        df[column] = df[column].astype(float)

# Join each mutation table to the baseline table on gene_id (pd.merge defaults
# to how='inner'). The suffixes only apply to column names present in both inputs.
merged_deleterious = pd.merge(baseline_df, deleterious_df, on='gene_id', suffixes=('_baseline', '_deleterious'))
merged_v617f = pd.merge(baseline_df, v617f_df, on='gene_id', suffixes=('_baseline', '_v617f'))

# Row-wise mean over every column from position 3 onward.
# NOTE(review): the offset 3 presumes the merged frame starts with three
# non-sample columns (gene_id plus two baseline columns) — confirm against the
# baseline table layout; a different width would shift which columns are averaged.
merged_deleterious['avg_deleterious'] = merged_deleterious.iloc[:, 3:].mean(axis=1)
merged_v617f['avg_v617f'] = merged_v617f.iloc[:, 3:].mean(axis=1)

# Differential expression: group average minus the baseline trimmed mean.
merged_deleterious['diff_expression_deleterious'] = merged_deleterious['avg_deleterious'] - merged_deleterious['trimmed_mean_expression']
merged_v617f['diff_expression_v617f'] = merged_v617f['avg_v617f'] - merged_v617f['trimmed_mean_expression']

# Z-score of each difference, computed across all genes in the merged frame.
# nan_policy='omit' stops a single missing difference from turning the whole
# column into NaN (the default 'propagate' does exactly that); the result is
# unchanged when no NaN is present.
merged_deleterious['zscore_deleterious'] = zscore(merged_deleterious['diff_expression_deleterious'], nan_policy='omit')
merged_v617f['zscore_v617f'] = zscore(merged_v617f['diff_expression_v617f'], nan_policy='omit')

# Consistency check: per-gene variance across the columns in positions 3:-3.
# The trailing -3 excludes the three derived columns added earlier in this cell
# (average, difference, Z-score); the leading offset matches the one used for
# the averages.
deleterious_variance = merged_deleterious.iloc[:, 3:-3].var(axis=1)
v617f_variance = merged_v617f.iloc[:, 3:-3].var(axis=1)

# Percentile cut-offs for the filter. np.nanpercentile ignores NaN variances
# (a row with fewer than two non-missing values has NaN variance); np.percentile
# would return NaN in that case and the comparison below would drop every gene.
# NOTE(review): the cut-offs differ between groups (85th vs 75th percentile) —
# confirm this asymmetry is intentional.
deleterious_cutoff = np.nanpercentile(deleterious_variance, 85)
v617f_cutoff = np.nanpercentile(v617f_variance, 75)

# Keep only genes whose variance is at or below the group's cut-off.
consistent_deleterious = merged_deleterious[deleterious_variance <= deleterious_cutoff]
consistent_v617f = merged_v617f[v617f_variance <= v617f_cutoff]

# Rank the variance-filtered genes by Z-score, highest first, and keep ten per group.
# The sort uses the signed Z-score, so only the positive end of the ranking is shown.
top_deleterious = consistent_deleterious.sort_values(
    by='zscore_deleterious',
    ascending=False,
).iloc[:10]
top_v617f = consistent_v617f.sort_values(
    by='zscore_v617f',
    ascending=False,
).iloc[:10]

# Report the gene id, raw difference and Z-score for each group.
deleterious_report_columns = ['gene_id', 'diff_expression_deleterious', 'zscore_deleterious']
v617f_report_columns = ['gene_id', 'diff_expression_v617f', 'zscore_v617f']

print("Top Differentially Expressed Genes in Deleterious Mutation:")
print(top_deleterious[deleterious_report_columns])

print("\nTop Differentially Expressed Genes in V617F Mutation:")
print(top_v617f[v617f_report_columns])


Top Differentially Expressed Genes in Deleterious Mutation:
                  gene_id  diff_expression_deleterious  zscore_deleterious
8393   ENSG00000143434.16                    13.157091            0.018211
12062  ENSG00000167291.16                    12.817516            0.017711
1568   ENSG00000082458.12                    12.676184            0.017503
11708  ENSG00000165905.18                    12.157907            0.016741
12819  ENSG00000170430.10                    11.030511            0.015083
16993  ENSG00000196141.14                    11.009963            0.015053
2731   ENSG00000102935.12                    10.842523            0.014806
128    ENSG00000006194.10                    10.640413            0.014509
6797   ENSG00000133818.14                    10.537445            0.014358
10137  ENSG00000157800.18                    10.112622            0.013733

Top Differentially Expressed Genes in V617F Mutation:
                  gene_id  diff_expression_v617f  zscore_v61