In [207]:
import pandas as pd
import json
import os
import ipdb
import numpy as np

In [370]:
# Read the path of the JSON configuration file from standard input and parse it.
config_file_path = input()
# The context manager closes the handle even when json.load raises.
with open(config_file_path) as config_file:
    config = json.load(config_file)

In [319]:
# All results of this run are written below <output_folder>/<experiment_name>.
output_folder = os.path.join(config['output_folder'], config['experiment_name'])
# exist_ok=True removes the check-then-create race of a separate exists() test.
os.makedirs(output_folder, exist_ok=True)

In [320]:
def save_df_to_csv(df, output_file):
    """Write ``df`` to ``output_file`` as CSV, creating missing parent folders.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write; the index is not written.
    output_file : str or path-like
        Destination path. It may be a bare file name, in which case the file
        is written to the current working directory.
    """
    parent_dir = os.path.dirname(output_file)
    # dirname() is "" for a bare file name and os.makedirs("") raises, so only
    # create the folder when there actually is a directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write DataFrame to csv file
    df.to_csv(output_file, index=False)

In [321]:
def read_experimental_dfs(config):
    """Load every CSV listed under ``config["files"]``.

    The first column of each table is renamed to ``unique_17`` so that all
    tables share the same key column name.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``config["files"]``, in the same order.
    """
    def _load(path):
        table = pd.read_csv(path)
        first_column = table.columns[0]
        return table.rename(columns={first_column: "unique_17"})

    return [_load(path) for path in config["files"]]

In [322]:
# Load the library table from the CSV path given in the config; its
# "unique_17", "TF", "Sequence" and "ID" columns are joined onto the results later.
lib_csv_path = config["lib_csv_path"]
lib_df = pd.read_csv(lib_csv_path)

In [371]:
# One DataFrame per path in config["files"], each keyed by a "unique_17" column.
dfs = read_experimental_dfs(config)

In [362]:
# Preview the first rows of the first experimental table.
dfs[0].head()

Unnamed: 0,unique_17,CD4_mean,TREGS_mean,M,D,prob,ranking
0,AGTTATCTGGACACGTT,6.468266,262.079474,-5.340482,255.611209,0.907912,-255.666992
1,ACCCGTAGACTCGGACA,0.054816,120.362425,-11.100505,120.307609,0.902661,-120.818633
2,CCCGGAACTTATGATTT,0.054816,116.778432,-11.056894,116.723617,0.899031,-117.246141
3,AAAGTTGGGCAGATATG,29.600537,488.617687,-4.045011,459.017149,0.898235,-459.034972
4,TAATAGCCTACGTACAC,12.607636,298.666068,-4.566164,286.058431,0.895149,-286.094872


In [363]:
# Number of rows in the first experimental table.
dfs[0].shape[0]

1152

In [327]:
def run_experiment_analysis_list(dfs):
    """Return the ``unique_17`` values that occur in every frame of ``dfs``.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Each frame must have a ``unique_17`` column.

    Returns
    -------
    pandas.Series
        The shared values, de-duplicated, in the order of their first
        appearance in ``dfs[0]``, with a fresh 0..n-1 index. The result is
        empty when ``dfs`` is empty or nothing is shared.
    """
    if len(dfs) == 0:
        return pd.Series([], dtype=object, name='unique_17')

    first = dfs[0]['unique_17']
    common = set(first)
    for df in dfs[1:]:
        common &= set(df['unique_17'])

    # Filtering the first column (instead of listing the set) keeps the result
    # order deterministic; set iteration order is arbitrary.
    shared_de = first[first.isin(common)].drop_duplicates().reset_index(drop=True)
    return shared_de

In [328]:
def run_experiment_analysis_df(dfs):
    """Average ``prob`` over the ``unique_17`` keys present in every frame.

    The frames are inner-joined one after another on ``unique_17`` while the
    ``prob`` values are summed; the sum is finally divided by ``len(dfs)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``unique_17`` and ``prob`` (the mean), sorted by ``prob`` in
        descending order with a fresh 0..n-1 index.
    """
    wanted = ['unique_17', 'prob']
    running = dfs[0][wanted].copy()

    for other in dfs[1:]:
        # The right-hand "prob" arrives as "prob_y"; the left one keeps its name.
        merged = running.merge(other[wanted], how="inner", on="unique_17", suffixes=("", "_y"))
        summed = merged['prob'] + merged['prob_y']
        running = merged.assign(prob=summed).drop(columns=['prob_y'])

    # Turn the accumulated sum into a mean over all input frames.
    running['prob'] = running['prob'] / len(dfs)

    ranked = running.sort_values(by="prob", ascending=False)
    return ranked.reset_index(drop=True)

In [372]:
# Mean "prob" per "unique_17" key shared by all experimental tables, highest first.
overall_de = run_experiment_analysis_df(dfs)

In [373]:
# Display the averaged table.
overall_de

Unnamed: 0,unique_17,prob
0,TTTGCGTCTTTGTCGAT,0.772593
1,CACGTTTGCGAACCCTC,0.770128
2,ACGATAGCGCACAAGAG,0.757474
3,AATGTGGCGGGCAGGAT,0.753125
4,AAGTGCACCTCAGTTTA,0.740745
5,GCTTCTGGATTCGTTTG,0.73541
6,TCGCCGTTCTCAACCTC,0.733402
7,AATGGGTGTTGACACCT,0.727374
8,ACTGATGCAGTCCTGCA,0.725756
9,CCGGATAGTAGCATCCC,0.725592


In [178]:
def reads_per_million_norm(df, scalling=1e6):
    """Rescale every column of ``df`` so that it sums to ``scalling``.

    Each value is divided by ``column_total / scalling``; with the default of
    1e6 this gives reads per million.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table. It is not modified.
    scalling : float, optional
        Target total of each column (default 1e6).

    Returns
    -------
    pandas.DataFrame
        A normalised copy of ``df``. A column whose total is 0 is returned as
        all 0.0 rather than being divided by zero, which would fill it with NaN.
    """
    norm_df = df.copy()
    for column in norm_df:
        total_read_num = df[column].sum()
        if total_read_num == 0:
            # Nothing to scale: avoid 0/0 -> NaN for a column with no reads.
            norm_df[column] = 0.0
            continue
        scalling_factor = total_read_num / scalling
        norm_df[column] = norm_df[column] / scalling_factor

    return norm_df

In [297]:
# Load the count table named in the config and index it by its "unique_17" column.
count_data = pd.read_csv(config["count_data_file"]).set_index("unique_17")

In [298]:
# Per-column normalisation with the default scale of 1e6 (reads per million).
norm_cout_data = reads_per_million_norm(count_data)

In [299]:
# Preview the first rows of the normalised count table.
norm_cout_data.head()

Unnamed: 0_level_0,H7_R1_bio_1,H7_R1_bio_1_tech_2,H7_R1_bio_1_tech_3,RL_R1_bio_1,RL_R1_bio_1_tech_2,RL_R1_bio_1_tech_3,DS_R1_bio_1,DS_R1_bio_1_tech_2,DS_R1_bio_1_tech_3,U2_R1_bio_1,...,B-CELLS_R2_bio_1_tech_3,MONOCYTES_R2_bio_1,MONOCYTES_R2_bio_1_tech_2,MONOCYTES_R2_bio_1_tech_3,JURKAT_R2_bio_1,JURKAT_R2_bio_1_tech_2,JURKAT_R2_bio_1_tech_3,H1_R2_bio_1,H1_R2_bio_1_tech_2,H1_R2_bio_1_tech_3
unique_17,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACTACCTGAAGAACCTT,0.0,0.0,0.0,0.464763,0.0,1.635681,0.0,0.0,0.0,0.0,...,27.100694,40.989127,90.048938,0.0,3.940888,7.370367,1.568671,0.257135,0.681808,0.0
GAGCTAAATGGCTGATT,93.401123,100.849802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27.738358,0.0,0.0,0.0,0.927268,1.133903,0.784336,0.514271,0.0,0.825657
GTGACCACACTTACAGT,0.0,0.0,0.0,0.0,0.0,0.0,2.857099,4.095608,0.0,0.0,...,34.752655,3.328177,7.311666,0.0,9.736311,10.772074,9.019859,11.571092,5.454466,15.274651
TTGTTGGCGAGCAGTGT,0.0,0.0,0.0,0.0,0.0,0.0,3.673412,3.510521,4.049184,0.0,...,0.637663,0.0,0.0,0.0,4.636338,3.968659,5.098181,0.0,0.0,0.0
CAATATCGGCGAGCTCT,1.762285,0.0,23.860084,31.603888,31.163408,32.713628,16.326277,16.382431,16.196736,0.0,...,19.767565,0.350334,0.769649,0.0,39.87251,45.923053,35.687268,31.113381,32.726797,30.136473


In [374]:
# Work on a separate frame indexed by "unique_17" so the joins below align on it;
# set_index returns a new DataFrame, so overall_de itself is left unchanged.
df = overall_de.set_index("unique_17")

In [375]:
# Sample columns reported next to every result row (raw and normalised).
sample_columns = [
    "CD4_R1_bio_1_tech_2", "CD4_R1_bio_1_tech_3",
    "CD4_R2_bio_1_tech_2", "CD4_R2_bio_1_tech_3",
    "CD8_R1_bio_1_tech_2", "CD8_R1_bio_1_tech_3",
    "CD8_R2_bio_1_tech_2", "CD8_R2_bio_1_tech_3",
]
# Library annotation keyed by "unique_17", matching the index of df.
annotation = lib_df[["unique_17", "TF", "Sequence", "ID"]].set_index("unique_17")

df = (
    df
    .join(count_data[sample_columns])
    # Same column names as the raw counts, so the normalised ones get a suffix.
    .join(norm_cout_data[sample_columns], rsuffix="_cpm_norm")
    .join(annotation)
)

# Move the annotation columns from the end to positions 1-3, right after the first column.
tfs, seqs, ids = df.pop("TF"), df.pop("Sequence"), df.pop("ID")
df.insert(1, "TF", tfs)
df.insert(2, "Sequence", seqs)
df.insert(3, "ID", ids)

In [376]:
# Turn the "unique_17" index back into a regular column so it is written to the CSV
# (save_df_to_csv writes with index=False).
df = df.reset_index()

In [377]:
# Preview the first rows of the assembled result table.
df.head()

Unnamed: 0,unique_17,prob,TF,Sequence,ID,CD4_R1_bio_1_tech_2,CD4_R1_bio_1_tech_3,CD4_R2_bio_1_tech_2,CD4_R2_bio_1_tech_3,CD8_R1_bio_1_tech_2,...,CD8_R2_bio_1_tech_2,CD8_R2_bio_1_tech_3,CD4_R1_bio_1_tech_2_cpm_norm,CD4_R1_bio_1_tech_3_cpm_norm,CD4_R2_bio_1_tech_2_cpm_norm,CD4_R2_bio_1_tech_3_cpm_norm,CD8_R1_bio_1_tech_2_cpm_norm,CD8_R1_bio_1_tech_3_cpm_norm,CD8_R2_bio_1_tech_2_cpm_norm,CD8_R2_bio_1_tech_3_cpm_norm
0,TTTGCGTCTTTGTCGAT,0.772593,ZSCAN4|Hs;From_Old2506;From_Old5605;,TTTTCAGTGTGTGCA,D6M_5902,101.0,1.0,98.0,4.0,207.0,...,199.0,87.0,48.830297,0.986406,48.752387,0.99277,99.63851,18.6622,87.77331,24.885655
1,CACGTTTGCGAACCCTC,0.770128,ZNF547|Hs;,TGCTAATGCAGCAGGCATAC,D6M_5200,85.0,0.0,80.0,0.0,122.0,...,136.0,25.0,41.094804,0.0,39.797867,0.0,58.724146,6.998325,59.98578,7.15105
2,ACGATAGCGCACAAGAG,0.757474,RELB|Hs;Relb|Mm;From_Old1949;From_Old5048;,GGGAAATCCCCC,D6M_3733,807.0,343.0,789.0,1440.0,794.0,...,911.0,2439.0,390.158906,338.337373,392.50646,357.397343,382.188293,738.323295,401.816511,697.656458
3,AATGTGGCGGGCAGGAT,0.753125,BARHL1|Hs;,TTAAGAGCATTTA,D6M_5424,0.0,0.0,0.0,0.0,21.0,...,32.0,0.0,0.0,0.0,0.0,0.0,10.108255,0.0,14.114301,0.0
4,AAGTGCACCTCAGTTTA,0.740745,HOXC10|Hs;From_Old0311;From_Old3410;,TTTTTATGGG,D6M_5940,81.0,0.0,97.0,0.0,258.0,...,228.0,15.0,39.160931,0.0,48.254913,0.0,124.187128,2.332775,100.564396,4.29063


In [378]:
# Write the assembled table to <output_folder>/<output_DEP_file_name from the config>.
final_result_file = os.path.join(output_folder, config["output_DEP_file_name"])
save_df_to_csv(df, final_result_file)

In [None]:
# T - 0.8
# 0    CTTAGTGCCGCCGTTTT
# 1    TAACCGAAACCCAACCG
# 2    AAAGAGCTGGGGGCATT
# 3    CAACCCCGGACCCATAT
# 4    CTCTCTTGTCCCGCGAT
# 5    CCTCTTAACCTCAGCCC
# 6    AGGGTCATAACACCCAG

In [None]:
# T - 0.7
# 0     CTTAGTGCCGCCGTTTT
# 1     GCTTAGGAGCACTTGTA
# 2     TACGCGATGTCAGAGCT
# 3     TAACCGAAACCCAACCG
# 4     CTCTCTTGTCCCGCGAT
# 5     AAAGAGCTGGGGGCATT
# 6     CAACCCCGGACCCATAT
# 7     CCTCTTAACCTCAGCCC
# 8     GGAATCCTGATAACAGG
# 9     AGGGTCATAACACCCAG
# 10    AACAACAAATAGAGCCC

In [None]:
# T - 0.6
# 0     CTTAGTGCCGCCGTTTT
# 1     GCTTAGGAGCACTTGTA
# 2     TACGCGATGTCAGAGCT
# 3     ATGATCACGTAGCCTAT
# 4     TAACCGAAACCCAACCG
# 5     CAGATCCGGCAGGAAGA
# 6     CTCTCTTGTCCCGCGAT
# 7     TCGTGCTTAAACTCCTG
# 8     AAAGAGCTGGGGGCATT
# 9     GGAGTGGCTAAATCTAA
# 10    CAACCCCGGACCCATAT
# 11    CGTCCTGTGATCAAGGG
# 12    ACGTCTGGCATGCAACC
# 13    AGTCCAGGTTTTATCTC
# 14    CCTCTTAACCTCAGCCC
# 15    GGAATCCTGATAACAGG
# 16    AGGGTCATAACACCCAG
# 17    AACAACAAATAGAGCCC

In [None]:
# T - 0.55
# 0     TAACCGAAACCCAACCG
# 1     CAGATCCGGCAGGAAGA
# 2     GGTTTTCTGCTGACTAA
# 3     GAAGCTCTGTGAGATTT
# 4     AGTCCAGGTTTTATCTC
# 5     AAACCCTGTATGCCAAG
# 6     GCTTAGGAGCACTTGTA
# 7     TACGCGATGTCAGAGCT
# 8     CTCTCTTGTCCCGCGAT
# 9     GGAGTGGCTAAATCTAA
# 10    TTCGCAATGCTCTTCGG
# 11    AACGATCTGCCGACATA
# 12    AGGGTCATAACACCCAG
# 13    ATGATCACGTAGCCTAT
# 14    TCGTGCTTAAACTCCTG
# 15    AAAGAGCTGGGGGCATT
# 16    ACGTCTGGCATGCAACC
# 17    CCTCTTAACCTCAGCCC
# 18    CATCAGAGTGTTTGATC
# 19    GGAATCCTGATAACAGG
# 20    TGACACTCGAGCTATAC
# 21    CTTAGTGCCGCCGTTTT
# 22    CAGGAGCCGGAACTTAG
# 23    CAACCCCGGACCCATAT
# 24    CGTCCTGTGATCAAGGG
# 25    AACAACAAATAGAGCCC