## Post processing TPM tables to obtain a delta table.
### October 30, 2022


In [1]:
import pandas as pd # for most table operaions
import glob # for getting list of all files in a folder
import os

### Reading multiple TPM tables to get one dataframe

In [2]:
# get list of all processed files
tpm_tbls = glob.glob(os.path.join("..","tpms","*.tsv"))
len(tpm_tbls)

10

In [3]:
# Read all the files and create on TPM table
dfs = list()
all_tpms = pd.DataFrame(columns=["id"])
for f in tpm_tbls:
    samp_name = f.split("/")[-1].split("_")[0]
    data = pd.read_csv(f, sep="\t", names=["id", samp_name], header=0)
    all_tpms = all_tpms.merge(data, how="outer", left_on="id", right_on="id")

all_tpms.index = all_tpms['id']
all_tpms = all_tpms.drop('id',axis=1)
all_tpms


Unnamed: 0_level_0,SRR17319825,SRR17319824,SRR17319829,SRR17319828,SRR17319817,SRR17319833,SRR17319821,SRR17319816,SRR17319820,SRR17319832
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000223972,0.000,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000
ENSG00000227232,1.075,1.48,2.472,1.845,0.723,1.271,0.961,1.684,0.91,1.939
ENSG00000278267,0.000,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000
ENSG00000243485,0.000,0.00,0.000,0.170,0.000,0.000,0.000,0.000,0.00,0.000
ENSG00000274890,0.000,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000
...,...,...,...,...,...,...,...,...,...,...
ENSG00000275028,0.000,0.00,0.000,0.000,0.000,0.000,5.339,0.000,0.00,0.000
ENSG00000278806,0.000,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000
ENSG00000274152,0.000,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000
ENSG00000276666,0.000,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.00,0.000


In [4]:
print("Number of genes processed = " + str (all_tpms.shape[0]))
print("Number of SRA = " + str (all_tpms.shape[1]))

Number of genes processed = 60675
Number of SRA = 10


In [5]:
# Read in the metadata table
sra_mdata = pd.read_csv("../SraRunTable.csv", sep=",")
sra_mdata.columns
# Subset pertinent columns only
# sub_mdata = sra_mdata[['Donor', 'Run','Experiment','BioSample', 'Condition', 'infection','Time_point']]
sub_mdata = sra_mdata[['source_name', 'Run','Experiment','BioSample', 'Patient_number', 'infection_group']]
sub_mdata.sort_values('Patient_number')


Unnamed: 0,source_name,Run,Experiment,BioSample,Patient_number,infection_group
16,lung,SRR17319835,SRX13496333,SAMN24365620,A19,Mycobacterium bovis (BCG)
15,lung,SRR17319834,SRX13496334,SAMN24365619,A19,Pseudomonas aeruginosa (PA)
14,lung,SRR17319833,SRX13496335,SAMN24365618,A19,Influenza A virus (IAV)
13,lung,SRR17319832,SRX13496336,SAMN24365617,A19,Uninfected
9,lung,SRR17319828,SRX13496340,SAMN24365613,A20,Uninfected
12,lung,SRR17319831,SRX13496337,SAMN24365616,A20,Mycobacterium bovis (BCG)
11,lung,SRR17319830,SRX13496338,SAMN24365615,A20,Pseudomonas aeruginosa (PA)
10,lung,SRR17319829,SRX13496339,SAMN24365614,A20,Influenza A virus (IAV)
19,lung,SRR17319824,SRX13496344,SAMN24365609,A21,Uninfected
7,lung,SRR17319826,SRX13496342,SAMN24365611,A21,Pseudomonas aeruginosa (PA)


In [6]:
# Subset samples that are uninfected and influenza
Influ_only = sub_mdata[(sub_mdata['infection_group'].isin(['Uninfected', 'Influenza A virus (IAV)']))]
Influ_only
# sub_mdata['infection_group']

Unnamed: 0,source_name,Run,Experiment,BioSample,Patient_number,infection_group
0,lung,SRR17319817,SRX13496351,SAMN24365602,A23,Influenza A virus (IAV)
2,lung,SRR17319820,SRX13496348,SAMN24365605,A22,Uninfected
3,lung,SRR17319821,SRX13496347,SAMN24365606,A22,Influenza A virus (IAV)
6,lung,SRR17319825,SRX13496343,SAMN24365610,A21,Influenza A virus (IAV)
9,lung,SRR17319828,SRX13496340,SAMN24365613,A20,Uninfected
10,lung,SRR17319829,SRX13496339,SAMN24365614,A20,Influenza A virus (IAV)
13,lung,SRR17319832,SRX13496336,SAMN24365617,A19,Uninfected
14,lung,SRR17319833,SRX13496335,SAMN24365618,A19,Influenza A virus (IAV)
17,lung,SRR17319816,SRX13496352,SAMN24365601,A23,Uninfected
19,lung,SRR17319824,SRX13496344,SAMN24365609,A21,Uninfected


In [7]:
all_tpms['sum'] = all_tpms.sum(axis=1)
non0_tpms = all_tpms[all_tpms['sum']> 0]
non0_tpms


# Get ratios Influenza/control

# Influ_4h_ratio = Influ_4h_no0.truediv(ctrl_4hr_no0['ctrl_avg_tpm'], axis=0)
# Influ_18h_ratio = Influ_18h_df.truediv(ctrl_18hr_no0['ctrl_avg_tpm'], axis=0)
# Influ_18h_ratio

Unnamed: 0_level_0,SRR17319825,SRR17319824,SRR17319829,SRR17319828,SRR17319817,SRR17319833,SRR17319821,SRR17319816,SRR17319820,SRR17319832,sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000227232,1.075,1.480,2.472,1.845,0.723,1.271,0.961,1.684,0.910,1.939,14.360
ENSG00000243485,0.000,0.000,0.000,0.170,0.000,0.000,0.000,0.000,0.000,0.000,0.170
ENSG00000240361,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.360,0.360
ENSG00000238009,0.032,0.012,0.009,0.039,0.016,0.037,0.013,0.003,0.024,0.038,0.223
ENSG00000233750,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.031,0.044,0.075
...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000271254,1.278,1.244,3.174,4.945,2.203,2.775,2.855,5.490,3.606,4.939,32.509
ENSG00000275405,6.313,3.234,3.969,2.415,1.736,3.073,6.837,15.318,5.806,3.092,51.793
ENSG00000275987,0.000,0.000,0.000,1.834,0.000,0.000,0.000,2.908,0.000,0.000,4.742
ENSG00000277475,0.000,0.000,0.000,0.000,1.028,0.152,0.000,0.840,0.000,2.237,4.257


In [8]:
A21_srr = Influ_only[Influ_only['Patient_number'] == "A21"]['Run'].to_list()
A19_srr = Influ_only[Influ_only['Patient_number'] == "A19"]['Run'].to_list()
A20_srr = Influ_only[Influ_only['Patient_number'] == "A20"]['Run'].to_list()
A22_srr = Influ_only[Influ_only['Patient_number'] == "A22"]['Run'].to_list()
A23_srr = Influ_only[Influ_only['Patient_number'] == "A23"]['Run'].to_list()

In [9]:
A19 = non0_tpms[A19_srr]
A20 = non0_tpms[A20_srr]
A21 = non0_tpms[A21_srr]
A22 = non0_tpms[A22_srr]
A23 = non0_tpms[A23_srr]

A19['sum'] = A19.sum(axis=1)
A19= A19[A19['sum'] > 0 ]

A20['sum'] = A20.sum(axis=1)
A20 = A20[A20['sum'] > 0 ]

A21['sum'] = A21.sum(axis=1)
A21 = A21[A21['sum'] > 0 ]

A22['sum'] = A22.sum(axis=1)
A22 = A22[A22['sum'] > 0 ]

A23['sum'] = A23.sum(axis=1)
A23 = A23[A23['sum'] > 0 ]

A23


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A19['sum'] = A19.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A20['sum'] = A20.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A21['sum'] = A21.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

Unnamed: 0_level_0,SRR17319817,SRR17319816,sum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000227232,0.723,1.684,2.407
ENSG00000238009,0.016,0.003,0.019
ENSG00000268903,1.886,0.555,2.441
ENSG00000269981,1.003,2.457,3.460
ENSG00000241860,0.018,0.009,0.027
...,...,...,...
ENSG00000275063,152.904,202.421,355.325
ENSG00000271254,2.203,5.490,7.693
ENSG00000275405,1.736,15.318,17.054
ENSG00000275987,0.000,2.908,2.908


In [10]:
# Influenza/Uninfected
A19['fc'] = A19['SRR17319833']/A19['SRR17319832']
A20['fc'] = A20['SRR17319829']/A20['SRR17319828']
A21['fc'] = A21['SRR17319825']/A21['SRR17319824']
A22['fc'] = A22['SRR17319821']/A22['SRR17319820']
A23['fc'] = A23['SRR17319817']/A23['SRR17319816']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A19['fc'] = A19['SRR17319833']/A19['SRR17319832']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A20['fc'] = A20['SRR17319829']/A20['SRR17319828']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A21['fc'] = A21['SRR17319825']/A21['SRR17319824']
A value is trying to be set on a copy of a slice from a

In [14]:
A19_srr.append('sum')
A19_fc = A19.drop(A19_srr, axis=1)
A19_fc = A19_fc.rename(columns={'fc':'SRR17319833'})
A19_fc


Unnamed: 0_level_0,SRR17319833
id,Unnamed: 1_level_1
ENSG00000227232,0.655493
ENSG00000240361,0.000000
ENSG00000238009,0.973684
ENSG00000233750,0.000000
ENSG00000268903,1.277601
...,...
ENSG00000277856,1.180270
ENSG00000275063,0.447251
ENSG00000271254,0.561855
ENSG00000275405,0.993855


In [12]:
# A19_fc = A19.drop(A19_srr, axis=1)
# A19_fc
A19_rem_list
A19_srr
A19_rem_list

NameError: name 'A19_rem_list' is not defined

In [None]:
A19.merge(A20, left_index=True, right_index=True, how="outer")