## Post processing TPM tables to obtain a delta table.

In [65]:
import pandas as pd # for most table operaions
import glob # for getting list of all files in a folder
import os

### Reading multiple TPM tables to get one dataframe

In [76]:
# get list of all processed files
tpm_tbls = glob.glob(os.path.join("..","tpms","*.tsv"))
len(tpm_tbls)

27

In [77]:
# Read all the files and create on TPM table
dfs = list()
all_tpms = pd.DataFrame(columns=["id"])
for f in tpm_tbls:
    samp_name = f.split("/")[-1].split("_")[0]
    data = pd.read_csv(f, sep="\t", names=["id", samp_name], header=0)
    all_tpms = all_tpms.merge(data, how="outer", left_on="id", right_on="id")

all_tpms

Unnamed: 0,id,SRR16676571,SRR16676523,SRR16676563,SRR16676531,SRR16676476,SRR16676562,SRR16676522,SRR16676577,SRR16676528,...,SRR16676492,SRR16676487,SRR16676480,SRR16676602,SRR16676479,SRR16676572,SRR16676575,SRR16676520,SRR16676526,SRR16676534
0,ENSG00000223972,0.000,0.000,0.000,0.000,0.000,0.000,0.101,0.035,0.000,...,0.000,0.000,0.000,0.000,0.000,0.035,0.103,0.000,0.000,0.000
1,ENSG00000227232,0.771,0.762,0.405,1.091,0.437,0.255,1.511,0.772,0.995,...,0.816,0.871,1.001,0.500,1.083,1.003,0.600,0.824,0.988,0.396
2,ENSG00000278267,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
3,ENSG00000243485,0.000,0.000,0.000,0.000,0.031,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.031,0.000,0.000,0.000,0.000,0.000,0.000
4,ENSG00000274890,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60670,ENSG00000275028,0.000,0.000,0.000,0.000,2.323,0.000,0.000,0.000,2.829,...,0.000,0.000,0.961,0.000,0.000,0.000,0.000,0.000,0.000,0.000
60671,ENSG00000278806,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
60672,ENSG00000274152,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
60673,ENSG00000276666,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [78]:
# Read in the metadata table
sra_mdata = pd.read_csv("../SraRunTable.csv", sep=",")
sra_mdata.columns
# Subset pertinent columns only
sub_mdata = sra_mdata[['Donor', 'Run','Experiment','BioSample', 'Condition', 'infection','Time_point']]
sub_mdata

Unnamed: 0,Donor,Run,Experiment,BioSample,Condition,infection,Time_point
0,623950,SRR16676454,SRX12877256,SAMN22818690,NHB,influenza A virus,18 hr
1,103224,SRR16676455,SRX12877255,SAMN22818691,COPD,control,4 hr
2,623950,SRR16676456,SRX12877254,SAMN22818692,NHB,influenza A virus,18 hr
3,436083,SRR16676457,SRX12877253,SAMN22818693,COPD,influenza A virus,18 hr
4,436083,SRR16676458,SRX12877252,SAMN22818694,COPD,influenza A virus,18 hr
...,...,...,...,...,...,...,...
157,626776,SRR16676515,SRX12877195,SAMN22818751,NHB,control,18 hr
158,440551,SRR16676517,SRX12877193,SAMN22818753,COPD,control,18 hr
159,655308,SRR16676562,SRX12877148,SAMN22818798,NHB,influenza A virus,4 hr
160,672447,SRR16676602,SRX12877096,SAMN22818835,NHB,control,4 hr


### Calculate the average TPMs of control samples for each gene

In [79]:
# Get a list of control samples for NHB patient that were collected at 4 hours.
NHB_ctrl_srr = sub_mdata[(sub_mdata['Condition'] == "NHB") & (sub_mdata['infection'] == "control") & (sub_mdata['Time_point'] == "4 hr")]['Run'].to_list()
NHB_ctrl_srr

['SRR16676476',
 'SRR16676487',
 'SRR16676490',
 'SRR16676492',
 'SRR16676518',
 'SRR16676528',
 'SRR16676529',
 'SRR16676531',
 'SRR16676534',
 'SRR16676559',
 'SRR16676572',
 'SRR16676575',
 'SRR16676577',
 'SRR16676579',
 'SRR16676602']

In [80]:
# Calculate average TPM for each control samples for NHB after 4 hours
NHB_ctrl_srr.append("id")
NHB_ctrl_srr
NHB_ctrl_df = all_tpms[NHB_ctrl_srr]
NHB_ctrl_df.index = NHB_ctrl_df['id']
NHB_ctrl_df = NHB_ctrl_df.drop("id",axis=1)
NHB_ctrl_df['ctrl_avg_tpm'] = NHB_ctrl_df.mean(axis=1)
NHB_ctrl_final = NHB_ctrl_df[['ctrl_avg_tpm']]
NHB_ctrl_final

Unnamed: 0_level_0,ctrl_avg_tpm
id,Unnamed: 1_level_1
ENSG00000223972,0.011533
ENSG00000227232,0.741200
ENSG00000278267,0.000000
ENSG00000243485,0.004133
ENSG00000274890,0.000000
...,...
ENSG00000275028,0.450733
ENSG00000278806,0.000000
ENSG00000274152,0.000000
ENSG00000276666,0.000000


In [71]:
# Get a list of influenza A virus samples for NHB patient that were collected at 4 hours after infection
NHB_inf_srr = sub_mdata[(sub_mdata['Condition'] == "NHB") & (sub_mdata['infection'] == "influenza A virus") & (sub_mdata['Time_point'] == "4 hr")]['Run'].to_list()
NHB_inf_srr

['SRR16676479',
 'SRR16676480',
 'SRR16676484',
 'SRR16676520',
 'SRR16676522',
 'SRR16676523',
 'SRR16676526',
 'SRR16676563',
 'SRR16676569',
 'SRR16676571',
 'SRR16676483',
 'SRR16676562']

In [81]:
# Get dataframe with only the infected samples
NHB_inf_srr.append("id")
NHB_inf_srr
NHB_inf_df = all_tpms[NHB_inf_srr]
NHB_inf_df.index = NHB_inf_df['id']
NHB_inf_df = NHB_inf_df.drop("id",axis=1)
NHB_inf_df

Unnamed: 0,SRR16676479,SRR16676480,SRR16676484,SRR16676520,SRR16676522,SRR16676523,SRR16676526,SRR16676563,SRR16676569,SRR16676571,SRR16676483,SRR16676562
"(ENSG00000223972, ENSG00000223972)",0.000,0.000,0.058,0.000,0.101,0.000,0.000,0.000,0.343,0.000,0.000,0.000
"(ENSG00000227232, ENSG00000227232)",1.083,1.001,0.614,0.824,1.511,0.762,0.988,0.405,0.829,0.771,0.962,0.255
"(ENSG00000278267, ENSG00000278267)",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"(ENSG00000243485, ENSG00000243485)",0.000,0.000,0.094,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"(ENSG00000274890, ENSG00000274890)",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...
"(ENSG00000275028, ENSG00000275028)",0.000,0.961,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"(ENSG00000278806, ENSG00000278806)",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"(ENSG00000274152, ENSG00000274152)",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"(ENSG00000276666, ENSG00000276666)",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [75]:
NHB_inf_df.merge(NHB_ctrl_df, how="outer", left_index=True, right_index=True)
NHB_diff = NHB_inf_df.sub(NHB_ctrl_df['ctrl_avg_tpm'], axis=0)
NHB_diff
NHB_diff.to_csv("PRJNA776746_NHB_InfA_4hr.tsv", sep="\t")

Unnamed: 0_level_0,SRR16676479,SRR16676480,SRR16676484,SRR16676520,SRR16676522,SRR16676523,SRR16676526,SRR16676563,SRR16676569,SRR16676571,SRR16676483,SRR16676562
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000223972,-0.011533,-0.011533,0.046467,-0.011533,0.089467,-0.011533,-0.011533,-0.011533,0.331467,-0.011533,-0.011533,-0.011533
ENSG00000227232,0.341800,0.259800,-0.127200,0.082800,0.769800,0.020800,0.246800,-0.336200,0.087800,0.029800,0.220800,-0.486200
ENSG00000278267,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000243485,-0.004133,-0.004133,0.089867,-0.004133,-0.004133,-0.004133,-0.004133,-0.004133,-0.004133,-0.004133,-0.004133,-0.004133
ENSG00000274890,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000275028,-0.450733,0.510267,-0.450733,-0.450733,-0.450733,-0.450733,-0.450733,-0.450733,-0.450733,-0.450733,-0.450733,-0.450733
ENSG00000278806,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000274152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000276666,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
