In [2]:
import pandas as pd # for most table operaions
import glob # for getting list of all files in a folder
import os

In [3]:
def read_tpms(tpm_fldr):
    """Read all the tpm tsv files from a folder, removes rows with 0 TPMs"""
    tpm_tbls = glob.glob(os.path.join(tpm_fldr,"*.tsv")) # get list of all processed files
    fs = list()
    all_tpms = pd.DataFrame(columns=["id"]) # Read all the files and create on TPM table
    for f in tpm_tbls:
        samp_name = f.split("/")[-1].split("_")[0]
        data = pd.read_csv(f, sep="\t", names=["id", samp_name], header=0)
        all_tpms = all_tpms.merge(data, how="outer", left_on="id", right_on="id")
    all_tpms.index = all_tpms['id']
    all_tpms['tpm_sum'] = all_tpms.sum(axis=1)
    all_tpms = all_tpms[all_tpms['tpm_sum'] > 0]
    all_tpms = all_tpms.drop(['id','tpm_sum'], axis=1)
    print("Number of genes processed = " + str (all_tpms.shape[0]))
    print("Number of SRA = " + str (all_tpms.shape[1]))
    return all_tpms


## PRJNA776746

In [4]:
PRJNA776746_tpms = os.path.join("..", "PRJNA776746", "tpms" )
tpm776746df = read_tpms(os.path.join("..", "PRJNA776746", "tpms"))

# Read in the metadata table
mfile776746 = os.path.join("..", "PRJNA776746", "SraRunTable.csv" )
mdata776746 = pd.read_csv(mfile776746, sep=",")
mdata776746.head()


# Get the list of all control samples collected at 4 hour
ctrl776746_4hr = mdata776746[(mdata776746['Time_point'] == "4 hr") & (mdata776746['infection'] == "control")]['Run'].to_list()
print("Number of control samples collected at 4 hours = " + str(len(ctrl776746_4hr)))
# Get the list of all control samples collected at 18 hour
ctrl776746_18hr = mdata776746[(mdata776746['Time_point'] == "18 hr") & (mdata776746['infection'] == "control")]['Run'].to_list()
print("Number of control samples collected at 18 hours = " + str(len(ctrl776746_18hr)))

# Calculate average TPM for each control samples for NHB after 4 hours
ctrl776746_4hr_df = tpm776746df[ctrl776746_4hr]
ctrl776746_18hr_df = tpm776746df[ctrl776746_18hr]

ctrl776746_4hr_df['ctrl_avg_tpm'] = ctrl776746_4hr_df.mean(axis=1) + 0.001
ctrl776746_18hr_df['ctrl_avg_tpm'] = ctrl776746_18hr_df.mean(axis=1) + 0.001

# Get a list of influenza A virus samples for NHB patient that were collected at 4 hours after infection
Influ776746_4h = mdata776746[(mdata776746['infection'] == "influenza A virus") & (mdata776746['Time_point'] == "4 hr")]['Run'].to_list()
Influ776746_18h = mdata776746[(mdata776746['infection'] == "influenza A virus") & (mdata776746['Time_point'] == "18 hr")]['Run'].to_list()
print("Number of samples at 4 hour after infection with Influenza = " + str(len(Influ776746_4h)))
print("Number of samples at 16 hour after infection with Influenza = " + str(len(Influ776746_18h)))


Influ_4h_df = tpm776746df[Influ776746_4h]
Influ_18h_df = tpm776746df[Influ776746_18h]

Influ776746_18h_no0 = Influ_18h_df + 0.001
Influ776746_4h_no0 = Influ_4h_df + 0.001

Influ776746_4h_ratio = Influ776746_4h_no0.truediv(ctrl776746_4hr_df['ctrl_avg_tpm'], axis=0)
Influ776746_18h_ratio = Influ776746_18h_no0.truediv(ctrl776746_18hr_df['ctrl_avg_tpm'], axis=0)


PRJNA776746_df = Influ776746_4h_ratio.merge(Influ776746_18h_ratio, how="outer", left_index=True, right_index=True)

PRJNA776746_df.head()

Number of genes processed = 36384
Number of SRA = 130
Number of control samples collected at 4 hours = 35
Number of control samples collected at 18 hours = 32
Number of samples at 4 hour after infection with Influenza = 32
Number of samples at 16 hour after infection with Influenza = 31


  all_tpms['tpm_sum'] = all_tpms.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctrl776746_4hr_df['ctrl_avg_tpm'] = ctrl776746_4hr_df.mean(axis=1) + 0.001
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctrl776746_18hr_df['ctrl_avg_tpm'] = ctrl776746_18hr_df.mean(axis=1) + 0.001


Unnamed: 0_level_0,SRR16676479,SRR16676480,SRR16676481,SRR16676484,SRR16676485,SRR16676486,SRR16676520,SRR16676521,SRR16676522,SRR16676523,...,SRR16676581,SRR16676582,SRR16676583,SRR16676584,SRR16676585,SRR16676496,SRR16676499,SRR16676453,SRR16676461,SRR16676494
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972,0.04,0.04,0.04,2.36,1.36,0.04,0.04,0.04,4.08,0.04,...,0.450704,0.450704,0.450704,0.450704,0.450704,24.338028,0.450704,37.408451,0.450704,0.450704
ENSG00000227232,1.509689,1.395488,0.821694,0.856512,1.043134,1.55147,1.148979,1.44284,2.105766,1.062632,...,0.392434,0.314687,0.749696,0.351709,0.945913,1.188407,0.449818,1.432753,0.908891,0.022213
ENSG00000243485,0.227273,0.227273,0.227273,21.590909,0.227273,0.227273,0.227273,0.227273,0.227273,0.227273,...,1.0,36.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ENSG00000237613,0.192308,0.192308,0.192308,0.192308,0.192308,0.192308,0.192308,0.192308,0.192308,0.192308,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ENSG00000240361,0.195531,0.195531,0.195531,30.698324,0.195531,0.195531,0.195531,0.195531,0.195531,0.195531,...,0.090395,0.090395,0.090395,0.090395,0.090395,0.090395,0.090395,20.248588,0.090395,0.090395


## PRJNA791847

In [5]:
PRJNA791847_tpms = os.path.join("..", "PRJNA791847", "tpms" )
tpm791847df = read_tpms(os.path.join("..", "PRJNA791847", "tpms"))

# Read in the metadata table
mfile791847 = os.path.join("..", "PRJNA791847", "SraRunTable.csv" )
mdata791847 = pd.read_csv(mfile791847, sep=",")
mdata791847.columns


# # Get the list of all control samples 
ctrl791847_A19 = mdata791847[(mdata791847['Patient_number'] == "A19") & (mdata791847['infection_group'].isin(["Uninfected", "Influenza A virus (IAV)"]))]['Run'].to_list()
ctrl791847_A19
ctrl791847_A20 = mdata791847[(mdata791847['Patient_number'] == "A20") & (mdata791847['infection_group'].isin(["Uninfected", "Influenza A virus (IAV)"]))]['Run'].to_list()
ctrl791847_A20
ctrl791847_A21 = mdata791847[(mdata791847['Patient_number'] == "A21") & (mdata791847['infection_group'].isin(["Uninfected", "Influenza A virus (IAV)"]))]['Run'].to_list()
ctrl791847_A21
ctrl791847_A22 = mdata791847[(mdata791847['Patient_number'] == "A22") & (mdata791847['infection_group'].isin(["Uninfected", "Influenza A virus (IAV)"]))]['Run'].to_list()
ctrl791847_A22
ctrl791847_A23 = mdata791847[(mdata791847['Patient_number'] == "A23") & (mdata791847['infection_group'].isin(["Uninfected", "Influenza A virus (IAV)"]))]['Run'].to_list()
ctrl791847_A23


def calc_ratio(samp_df, samp_list, mdata, pat_name):
    df = samp_df[samp_list]
    df['sum'] = df.sum(axis=1)
    df = df[df['sum'] > 0 ]
    df = df.drop(['sum'],axis=1)
    df = df + 0.001
    inf_id = mdata[(mdata['Patient_number'] == pat_name) & (mdata['infection_group'] == "Influenza A virus (IAV)")]['Run'].to_list()[0]
    ctrl_id = mdata[(mdata['Patient_number'] == pat_name) & (mdata['infection_group'] == "Uninfected")]['Run'].to_list()[0]
    df_ratio = pd.DataFrame()
    df_ratio[inf_id] = df[inf_id]/df[ctrl_id]
    return df_ratio

A19 = calc_ratio(tpm791847df, ctrl791847_A19, mdata791847, "A19")
A20 = calc_ratio(tpm791847df, ctrl791847_A20, mdata791847,"A20")
A21 = calc_ratio(tpm791847df, ctrl791847_A21, mdata791847,"A21")
A22 = calc_ratio(tpm791847df, ctrl791847_A22, mdata791847,"A22")
A23 = calc_ratio(tpm791847df, ctrl791847_A23, mdata791847,"A23")

A23
A19_20 = A19.merge(A20, how="outer", left_index=True, right_index=True)
A19_20_21 = A19_20.merge(A21, how="outer", left_index=True, right_index=True)
A19_20_21_22 = A19_20_21.merge(A22, how="outer", left_index=True, right_index=True)
PRJNA791847_df = A19_20_21_22.merge(A23, how="outer", left_index=True, right_index=True)

PRJNA791847_df

Number of genes processed = 34277
Number of SRA = 10


  all_tpms['tpm_sum'] = all_tpms.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sum'] = df.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sum'] = df.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sum'] = df.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Unnamed: 0_level_0,SRR17319833,SRR17319829,SRR17319825,SRR17319821,SRR17319817
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,1.026849,0.976156,0.706399,1.050991,0.571571
ENSG00000000005,3.849558,0.662500,0.370213,0.414414,2.906667
ENSG00000000419,1.262767,1.440034,0.854995,1.096569,0.880698
ENSG00000000457,0.747597,1.182103,0.971384,1.620637,1.430502
ENSG00000000460,0.772554,0.945873,0.915416,1.462006,1.058366
...,...,...,...,...,...
ENSG00000283098,1.111842,0.771357,1.186667,1.491228,0.441489
ENSG00000283103,0.952542,0.616642,1.157315,0.976667,1.264329
ENSG00000283108,0.709172,2.805461,1.630233,2.037383,0.689507
ENSG00000283117,3.636364,0.766520,1.648649,0.740175,1.121212


## PRJNA809199

In [6]:
PRJNA809199_tpms = os.path.join("..", "PRJNA809199", "tpms" )
tpm809199df = read_tpms(os.path.join("..", "PRJNA809199", "tpms"))

# Read in the metadata table
mfile809199 = os.path.join("..", "PRJNA809199", "SraRunTable.csv" )
mdata809199 = pd.read_csv(mfile809199, sep=",")
mdata809199.columns
mdata809199

# get accesion ID of mock/control samples
mock_runs = mdata809199[mdata809199['infection'] == "mock"]['Run'].to_list()
mock_runs

# get accesion ID of mock/control samples
infu_runs = mdata809199[mdata809199['infection'] != "mock"]['Run'].to_list()
infu_runs


# get df of TPMs for control samples and average value
mock_tpms = tpm809199df[mock_runs]
mock_tpms['mean'] = mock_tpms.mean(axis=1)
mock_tpms = mock_tpms + 0.001

# get df of infected samples
infu_tpms = tpm809199df[infu_runs]
infu_tpms = infu_tpms + 0.001
infu_tpms

# divide by mean
PRJNA809199_df =  infu_tpms.truediv(mock_tpms['mean'], axis=0)
PRJNA809199_df


Number of genes processed = 32315
Number of SRA = 15


  all_tpms['tpm_sum'] = all_tpms.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mock_tpms['mean'] = mock_tpms.mean(axis=1)


Unnamed: 0_level_0,SRR18097076,SRR18097077,SRR18097079,SRR18097080,SRR18097081,SRR18097083,SRR18097084,SRR18097085,SRR18097086,SRR18097088,SRR18097074,SRR18097075
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000223972,0.501672,3.672241,0.010033,0.010033,0.361204,2.849498,0.010033,0.401338,0.010033,0.501672,1.294314,0.010033
ENSG00000227232,0.835458,0.891956,0.877740,0.870127,0.858846,0.955242,0.948271,1.026231,0.810969,0.736586,1.159773,1.000642
ENSG00000243485,0.040541,0.040541,0.040541,0.040541,0.040541,0.040541,0.040541,2.635135,0.040541,0.040541,0.040541,0.040541
ENSG00000237613,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,72.000000,1.000000,1.000000,1.000000,1.000000,68.000000
ENSG00000238009,1.188679,0.754717,1.188679,0.792453,0.773585,0.452830,0.566038,0.735849,0.830189,0.867925,0.754717,0.981132
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000271254,0.640000,0.596800,1.355200,0.739600,1.159600,0.825200,1.149600,0.750800,1.183200,0.977200,0.846400,0.563600
ENSG00000275405,2.564624,3.791592,0.541369,3.931798,42.251565,0.000671,0.446109,2.038685,2.977862,17.885286,1.778399,5.054785
ENSG00000275987,0.002068,0.002068,1.900069,1.381116,0.002068,0.002068,0.002068,5.725017,0.002068,0.002068,3.121985,0.002068
ENSG00000277475,0.728296,0.449357,1.151929,0.186495,0.261254,0.397910,0.632637,0.290193,0.352894,0.242765,0.210611,0.200161


## PRJNA849574

In [7]:
PRJNA849574_tpms = os.path.join("..", "PRJNA849574", "tpms" )
tpm849574df = read_tpms(os.path.join("..", "PRJNA849574", "tpms"))

# Read in the metadata table
mfile849574 = os.path.join("..", "PRJNA849574", "SraRunTable.csv" )
mdata849574 = pd.read_csv(mfile849574, sep=",")
mdata849574.columns
mdata849574


mock_list = mdata849574[(mdata849574['Genotype'] == "WT" ) & (mdata849574['virus_infection'] == "Mock")]['Run'].to_list()
mock_list

inf_list = mdata849574[(mdata849574['Genotype'] == "WT" ) & (mdata849574['virus_infection'] == "PR8")]['Run'].to_list()
inf_list

mock_df = tpm849574df[mock_list]
mock_df['mean'] = mock_df.mean(axis=1)
mock_df = mock_df + 0.001
mock_df


# get df of infected samples
infu_tpms = tpm849574df[inf_list]
infu_tpms = infu_tpms + 0.001
infu_tpms

# divide by mean
PRJNA849574_df =  infu_tpms.truediv(mock_df['mean'], axis=0)
PRJNA849574_df



Number of genes processed = 26973
Number of SRA = 12


  all_tpms['tpm_sum'] = all_tpms.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mock_df['mean'] = mock_df.mean(axis=1)


Unnamed: 0_level_0,SRR19663483,SRR19663484,SRR19663485
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000223972,0.492453,1.420755,1.675472
ENSG00000227232,1.115067,0.982937,0.829370
ENSG00000243485,0.020979,0.020979,0.020979
ENSG00000238009,0.093750,0.375000,1.125000
ENSG00000233750,1.000000,34.000000,20.000000
...,...,...,...
ENSG00000276345,0.134585,0.649120,0.719477
ENSG00000271254,0.876256,0.898891,0.798491
ENSG00000275405,0.000334,1.035587,1.089969
ENSG00000275987,1.158261,2.305217,1.832826


In [8]:
# A PRJNA776746_df.merge(PRJNA791847_df, how="outer", left_index=True, right_index=True)

PRJNA809199_df
PRJNA849574_df

all_df = pd.concat([PRJNA776746_df, PRJNA791847_df,PRJNA809199_df, PRJNA849574_df  ], axis=1)
all_df.to_csv("Nov4_df.tsv", sep="\t")
all_df.shape

(37913, 83)