In [None]:
import os
import pandas as pd

Import the GSEA only arm files.

In [None]:
# GSEA-only arm: locations of the three result tables
# (network meta, network top 500, and overlap).
GSEA_net_meta_file = "/Users/anb/Documents/CMEB-Lab/ESCC-Drug-Repositioning/DR-escc-data/output/GSE23400/1st-dual-arm-analysis/GSEA-only-arm/network-meta/Meta-human-genes-15-march-redo/xaR2n7BXuo_app2result.txt"
GSEA_net_500_file = "/Users/anb/Documents/CMEB-Lab/ESCC-Drug-Repositioning/DR-escc-data/output/GSE23400/1st-dual-arm-analysis/GSEA-only-arm/network-top500/20j2lKkkjD-Mergeomics-top500-updated.tsv"
GSEA_overlap_file = "/Users/anb/Documents/CMEB-Lab/ESCC-Drug-Repositioning/DR-escc-data/output/GSE23400/1st-dual-arm-analysis/GSEA-only-arm/overlap/f5gFthEq8G_app3result.txt"

# All three tables are parsed identically: tab-separated, header on the first
# parsed row, and everything from a "#" onwards treated as a comment.
GSEA_net_meta = pd.read_csv(GSEA_net_meta_file, sep="\t", header=0, comment="#", low_memory=False)
GSEA_net_500 = pd.read_csv(GSEA_net_500_file, sep="\t", header=0, comment="#", low_memory=False)
GSEA_overlap = pd.read_csv(GSEA_overlap_file, sep="\t", header=0, comment="#", low_memory=False)


Import the wKDA arm files.

In [None]:
# wKDA arm: locations of the three result tables
# (network meta, network top 500, and overlap).
wkda_net_meta_file = "/Users/anb/Documents/CMEB-Lab/ESCC-Drug-Repositioning/DR-escc-data/output/GSE23400/1st-dual-arm-analysis/wKDA-arm/network-meta/Cru23plOJu_app2result.txt"
wkda_net_500_file = "/Users/anb/Documents/CMEB-Lab/ESCC-Drug-Repositioning/DR-escc-data/output/GSE23400/1st-dual-arm-analysis/wKDA-arm/network-top500/wDlxSsVxjv-wkda-13-03-22-updated.tsv"
wkda_overlap_file = "/Users/anb/Documents/CMEB-Lab/ESCC-Drug-Repositioning/DR-escc-data/output/GSE23400/1st-dual-arm-analysis/wKDA-arm/overlap/gklPh0nN48.KDA2PHARM_app3result.txt"

# All three tables are parsed identically: tab-separated, header on the first
# parsed row, and everything from a "#" onwards treated as a comment.
wkda_net_meta = pd.read_csv(wkda_net_meta_file, sep="\t", header=0, comment="#", low_memory=False)
wkda_net_500 = pd.read_csv(wkda_net_500_file, sep="\t", header=0, comment="#", low_memory=False)
wkda_overlap = pd.read_csv(wkda_overlap_file, sep="\t", header=0, comment="#", low_memory=False)


Filter the loaded in datasets:
- GSEA_net_meta
- GSEA_net_500
- GSEA_overlap
- wkda_net_meta
- wkda_net_500
- wkda_overlap

Starting with GSEA_net_meta, make sure the data is loaded in correctly.

In [None]:
GSEA_net_meta

As we are only interested in human data, first check for non-human data and remove if needed.

In [None]:
GSEA_net_meta[GSEA_net_meta["Species"] != "Homo sapiens"]

Sort the df according to drug and rank

In [None]:
# Keep human entries only (the cell above merely displays non-human rows, it does
# not remove them), then order by drug name and Rank, both descending, so the
# highest-Rank row of every drug comes first within that drug's rows.
GSEA_net_meta_sorted_drug = (
    GSEA_net_meta[GSEA_net_meta["Species"] == "Homo sapiens"]
    .sort_values(by=["Drug", "Rank"], ascending=False)
)

In [None]:
GSEA_net_meta_sorted_drug

Remove duplicate drugs, keeping only the first occurrence of each drug, as it has the highest Rank after the descending sort.

In [None]:
# One row per drug: the first row of each drug (in the current sort order) is retained.
GSEA_net_meta_sorted_drug = GSEA_net_meta_sorted_drug.drop_duplicates(subset="Drug", keep="first")

In [None]:
GSEA_net_meta_sorted_drug

Sort the df according to Rank.

In [None]:
# Re-order the de-duplicated drugs by Rank, largest value first.
GSEA_net_meta_sorted_rank = GSEA_net_meta_sorted_drug.sort_values(by="Rank", ascending=False)

In [None]:
GSEA_net_meta_sorted_rank

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index; the old index is discarded.
GSEA_net_meta_sorted_rank = GSEA_net_meta_sorted_rank.reset_index(drop=True)

In [None]:
GSEA_net_meta_sorted_rank

In [None]:
# Independent copy of the filtered network-meta table for the downstream steps.
GSEA_net_meta_filt_results = GSEA_net_meta_sorted_rank.copy(deep=True)

Next, with GSEA_net_500, make sure the data is loaded in correctly.

In [None]:
GSEA_net_500

Keep only human (Homo sapiens) entries and remove non-significant results (P value > 0.05).

In [None]:
# Two successive row filters: human entries first, then P value at or below 0.05.
GSEA_net_500_sig = (
    GSEA_net_500
    .loc[lambda frame: frame["Species"] == "Homo sapiens"]
    .loc[lambda frame: frame["P value"] <= 0.05]
)

Remove entries with a z score equal to zero, as they are problematic, and keep only entries with a z score below -2.

In [None]:
# Step 1: discard rows whose z score is exactly zero.
GSEA_net_500_sig_z0 = GSEA_net_500_sig[GSEA_net_500_sig["Z score"] != 0]
# Step 2: keep only z scores below -2. This is applied to the result of step 1;
# previously it re-filtered GSEA_net_500_sig and overwrote step 1, leaving that
# first assignment as dead code.
GSEA_net_500_sig_z0 = GSEA_net_500_sig_z0[GSEA_net_500_sig_z0["Z score"] < -2]

In [None]:
GSEA_net_500_sig_z0

Sort the df according to drug, p-value and z score (all ascending).

In [None]:
# Ascending sort by drug, then P value, then Z score: within each drug the row
# with the smallest P value comes first.
GSEA_net_500_sig_z0_sorted_drug = GSEA_net_500_sig_z0.sort_values(
    by=["Drug", "P value", "Z score"]
)

In [None]:
GSEA_net_500_sig_z0_sorted_drug

View the rows for the top result, Etoposide, to make sure the df is sorted correctly and that the most significant result is at the top of the Etoposide rows.

In [None]:
GSEA_net_500_sig_z0_sorted_drug[GSEA_net_500_sig_z0_sorted_drug['Drug'] == 'Etoposide']

Remove duplicate drugs, keeping only the first occurrence of each drug, as it has the lowest p-value (the most significant result) after the ascending sort.

In [None]:
# One row per drug: the first row of each drug (in the current sort order) is retained.
GSEA_net_500_sig_z0_sorted_drug = GSEA_net_500_sig_z0_sorted_drug.drop_duplicates(
    subset="Drug", keep="first"
)

In [None]:
GSEA_net_500_sig_z0_sorted_drug

Sort the df according to z score, z score rank and p-value (all ascending).

In [None]:
# Ascending sort, so the most negative Z score is listed first; ties fall back to
# 'Z score rank' and then 'P value'.
GSEA_net_500_sig_z0_sorted_sorted_rank = GSEA_net_500_sig_z0_sorted_drug.sort_values(
    by=["Z score", "Z score rank", "P value"]
)

In [None]:
GSEA_net_500_sig_z0_sorted_sorted_rank

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index; the old index is discarded.
GSEA_net_500_sig_z0_sorted_sorted_rank = GSEA_net_500_sig_z0_sorted_sorted_rank.reset_index(drop=True)

In [None]:
# Independent copy of the filtered top-500 network table for the downstream steps.
GSEA_net_500_filt_results = GSEA_net_500_sig_z0_sorted_sorted_rank.copy(deep=True)

In [None]:
GSEA_net_500_filt_results

Finally, for GSEA, filter overlap

In [None]:
GSEA_overlap

In [None]:
# Significance filter: keep overlap rows with P value at or below 0.05.
GSEA_overlap_sig = GSEA_overlap.loc[lambda frame: frame["P value"] <= 0.05]

In [None]:
# Keep rows whose 'Within Species Rank' is non-negative.
GSEA_overlap_sig = GSEA_overlap_sig.loc[lambda frame: frame["Within Species Rank"] >= 0]

In [None]:

# Restrict the significant overlap rows to human entries.
GSEA_overlap_sig_hs = GSEA_overlap_sig.loc[lambda frame: frame["Species"] == "Homo sapiens"]

In [None]:
GSEA_overlap_sig_hs

Filter the results to select only those with a Jaccard score above the mean.

In [None]:
GSEA_overlap_sig_hs['Jaccard Score'].mean()

In [None]:
# Keep rows whose Jaccard score is strictly above the mean of the filtered human rows.
# The mean is computed here rather than pasted in as a literal, so the threshold
# always matches the data actually loaded.
GSEA_overlap_sig_hs_js = GSEA_overlap_sig_hs[
    GSEA_overlap_sig_hs['Jaccard Score'] > GSEA_overlap_sig_hs['Jaccard Score'].mean()
]

In [None]:
GSEA_overlap_sig_hs_js

In [None]:
# Descending sort on every key, so within each drug the row with the largest
# 'Within Species Rank' comes first (ties: larger Jaccard score, then larger P value).
# NOTE(review): a descending P value tie-break favours the less significant row;
# confirm that this is intended.
GSEA_overlap_sig_hs_sorted_drug = GSEA_overlap_sig_hs_js.sort_values(
    by=["Drug", "Within Species Rank", "Jaccard Score", "P value"],
    ascending=False,
)

In [None]:
GSEA_overlap_sig_hs_sorted_drug

In [None]:
# One row per drug: the first row of each drug (in the current sort order) is retained.
GSEA_overlap_sig_hs_sorted_drug = GSEA_overlap_sig_hs_sorted_drug.drop_duplicates(
    subset="Drug", keep="first"
)

In [None]:
# Order the de-duplicated drugs by rank, Jaccard score and P value, all descending.
GSEA_overlap_sig_hs_sorted_rank = GSEA_overlap_sig_hs_sorted_drug.sort_values(
    by=["Within Species Rank", "Jaccard Score", "P value"],
    ascending=False,
)

In [None]:
GSEA_overlap_sig_hs_sorted_rank

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index; the old index is discarded.
GSEA_overlap_sig_hs_sorted_rank = GSEA_overlap_sig_hs_sorted_rank.reset_index(drop=True)

In [None]:
GSEA_overlap_sig_hs_sorted_rank

In [None]:
# Independent copy of the filtered overlap table for the downstream steps.
GSEA_overlap_filt_results = GSEA_overlap_sig_hs_sorted_rank.copy(deep=True)

Filtered result dfs used in the subsequent steps:
- GSEA_net_meta_filt_results
- GSEA_net_500_filt_results
- GSEA_overlap_filt_results

In [None]:
GSEA_net_meta_filt_results.head()

In [None]:
GSEA_net_500_filt_results.head()

In [None]:
GSEA_overlap_filt_results.head()

In [None]:
# Working copies for the merge steps: the overlap table keeps every column, the
# two network tables keep only their identifying columns.
GSEA_overlap_results = GSEA_overlap_filt_results.copy()
GSEA_net_500_results = GSEA_net_500_filt_results.loc[:, ["Drug", "Species", "Tissue"]].copy()
GSEA_net_meta_results = GSEA_net_meta_filt_results.loc[:, ["Drug", "Species", "Tissue", "Dataset"]].copy()

In [None]:
# Tag the network-result columns with their origin so they remain distinguishable
# after merging. GSEA_overlap_results deliberately keeps its original column names.
# Both frames are rebuilt from the *_filt_results tables, so re-running this cell
# cannot stack the suffix twice (which would break the 'Drug_frm_G_*' merge keys).
GSEA_net_500_results = (
    GSEA_net_500_filt_results[['Drug', 'Species', 'Tissue']]
    .add_suffix('_frm_G_nt500')
)
GSEA_net_meta_results = (
    GSEA_net_meta_filt_results[['Drug', 'Species', 'Tissue', 'Dataset']]
    .add_suffix('_frm_G_ntmeta')
)

In [None]:
GSEA_overlap_results.head()

In [None]:
GSEA_net_500_results.head()

In [None]:
GSEA_net_meta_results.head()

In [None]:
# Inner join: drugs present in both the top-500 and the meta network results.
GSEA_common_network_drugs = GSEA_net_500_results.merge(
    GSEA_net_meta_results,
    how="inner",
    left_on="Drug_frm_G_nt500",
    right_on="Drug_frm_G_ntmeta",
)

In [None]:
GSEA_common_network_drugs

In [None]:
# Inner join: overlap drugs that also appear in both network result sets.
GSEA_common_drugs_temp = GSEA_overlap_results.merge(
    GSEA_common_network_drugs,
    how="inner",
    left_on="Drug",
    right_on="Drug_frm_G_nt500",
)

In [None]:
# Retain only the overlap-table columns; the suffixed network columns are dropped.
GSEA_common_drugs = GSEA_common_drugs_temp.loc[
    :,
    [
        "Database", "Method", "Drug", "Species", "Tissue or Cell Line",
        "Study", "Dose", "Time", "Jaccard Score", "Odds Ratio",
        "P value", "Within Species Rank", "Overlap",
    ],
].copy()

In [None]:
GSEA_common_drugs

Now we will move onto the wkda arm.

Starting with wkda_net_meta, make sure the data is loaded in correctly.

In [None]:
wkda_net_meta

As we are only interested in human data, first check for non-human data and remove if needed.

In [None]:
wkda_net_meta[wkda_net_meta["Species"] != "Homo sapiens"]

Sort the df according to drug and rank

In [None]:
# Keep human entries only (the cell above merely displays non-human rows, it does
# not remove them), then order by drug name and Rank, both descending, so the
# highest-Rank row of every drug comes first within that drug's rows.
wkda_net_meta_sorted_drug = (
    wkda_net_meta[wkda_net_meta["Species"] == "Homo sapiens"]
    .sort_values(by=["Drug", "Rank"], ascending=False)
)

In [None]:
wkda_net_meta_sorted_drug

Remove duplicate drugs, keeping only the first occurrence of each drug, as it has the highest Rank after the descending sort.

In [None]:
# One row per drug: the first row of each drug (in the current sort order) is retained.
wkda_net_meta_sorted_drug = wkda_net_meta_sorted_drug.drop_duplicates(subset="Drug", keep="first")

In [None]:
wkda_net_meta_sorted_drug

Sort the df according to Rank.

In [None]:
# Re-order the de-duplicated drugs by Rank, largest value first.
wkda_net_meta_sorted_rank = wkda_net_meta_sorted_drug.sort_values(by="Rank", ascending=False)

In [None]:
wkda_net_meta_sorted_rank

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index; the old index is discarded.
wkda_net_meta_sorted_rank = wkda_net_meta_sorted_rank.reset_index(drop=True)

In [None]:
wkda_net_meta_sorted_rank

In [None]:
# Independent copy of the filtered network-meta table for the downstream steps.
wkda_net_meta_filt_results = wkda_net_meta_sorted_rank.copy(deep=True)

Next, with wkda_net_500, make sure the data is loaded in correctly.

In [None]:
wkda_net_500

Keep only human (Homo sapiens) entries and remove non-significant results (P value > 0.05).

In [None]:
# Two successive row filters: human entries first, then P value at or below 0.05.
wkda_net_500_sig = (
    wkda_net_500
    .loc[lambda frame: frame["Species"] == "Homo sapiens"]
    .loc[lambda frame: frame["P value"] <= 0.05]
)

Remove entries with a z score equal to zero, as they are problematic, and keep only entries with a z score below -2.

In [None]:
# Step 1: discard rows whose z score is exactly zero.
wkda_net_500_sig_z0 = wkda_net_500_sig[wkda_net_500_sig["Z score"] != 0]
# Step 2: keep only z scores below -2. This is applied to the result of step 1;
# previously it re-filtered wkda_net_500_sig and overwrote step 1, leaving that
# first assignment as dead code.
wkda_net_500_sig_z0 = wkda_net_500_sig_z0[wkda_net_500_sig_z0["Z score"] < -2]

In [None]:
wkda_net_500_sig_z0

Sort the df according to drug, p-value and z score (all ascending).

In [None]:
# Ascending sort by drug, then P value, then Z score: within each drug the row
# with the smallest P value comes first.
wkda_net_500_sig_z0_sorted_drug = wkda_net_500_sig_z0.sort_values(
    by=["Drug", "P value", "Z score"]
)

In [None]:
wkda_net_500_sig_z0_sorted_drug

View the rows for the top result, Etoposide, to make sure the df is sorted correctly and that the most significant result is at the top of the Etoposide rows.

In [None]:
wkda_net_500_sig_z0_sorted_drug[wkda_net_500_sig_z0_sorted_drug['Drug'] == 'Etoposide']

Remove duplicate drugs, keeping only the first occurrence of each drug, as it has the lowest p-value (the most significant result) after the ascending sort.

In [None]:
# One row per drug: the first row of each drug (in the current sort order) is retained.
wkda_net_500_sig_z0_sorted_drug = wkda_net_500_sig_z0_sorted_drug.drop_duplicates(
    subset="Drug", keep="first"
)

In [None]:
wkda_net_500_sig_z0_sorted_drug

Sort the df according to z score, z score rank and p-value (all ascending).

In [None]:
# Ascending sort, so the most negative Z score is listed first; ties fall back to
# 'Z score rank' and then 'P value'.
wkda_net_500_sig_z0_sorted_sorted_rank = wkda_net_500_sig_z0_sorted_drug.sort_values(
    by=["Z score", "Z score rank", "P value"]
)

In [None]:
wkda_net_500_sig_z0_sorted_sorted_rank

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index; the old index is discarded.
wkda_net_500_sig_z0_sorted_sorted_rank = wkda_net_500_sig_z0_sorted_sorted_rank.reset_index(drop=True)

In [None]:
# Independent copy of the filtered top-500 network table for the downstream steps.
wkda_net_500_filt_results = wkda_net_500_sig_z0_sorted_sorted_rank.copy(deep=True)

In [None]:
wkda_net_500_filt_results

Finally, for wkda, filter overlap

In [None]:
wkda_overlap

In [None]:
# Significance filter: keep overlap rows with P value at or below 0.05.
wkda_overlap_sig = wkda_overlap.loc[lambda frame: frame["P value"] <= 0.05]

In [None]:
# Keep rows whose 'Within Species Rank' is non-negative.
wkda_overlap_sig = wkda_overlap_sig.loc[lambda frame: frame["Within Species Rank"] >= 0]

In [None]:

# Restrict the significant overlap rows to human entries.
wkda_overlap_sig_hs = wkda_overlap_sig.loc[lambda frame: frame["Species"] == "Homo sapiens"]

In [None]:
wkda_overlap_sig_hs

Filter the results to select only those with a Jaccard score above the mean.

In [None]:
wkda_overlap_sig_hs['Jaccard Score'].mean()

In [None]:
# Keep rows whose Jaccard score is strictly above the mean of the filtered human rows.
# The mean is computed here rather than pasted in as a literal, so the threshold
# always matches the data actually loaded.
wkda_overlap_sig_hs_js = wkda_overlap_sig_hs[
    wkda_overlap_sig_hs['Jaccard Score'] > wkda_overlap_sig_hs['Jaccard Score'].mean()
]

In [None]:
wkda_overlap_sig_hs_js

In [None]:
# Descending sort on every key, so within each drug the row with the largest
# 'Within Species Rank' comes first (ties: larger Jaccard score, then larger P value).
# NOTE(review): a descending P value tie-break favours the less significant row;
# confirm that this is intended.
wkda_overlap_sig_hs_sorted_drug = wkda_overlap_sig_hs_js.sort_values(
    by=["Drug", "Within Species Rank", "Jaccard Score", "P value"],
    ascending=False,
)

In [None]:
wkda_overlap_sig_hs_sorted_drug

In [None]:
# One row per drug: the first row of each drug (in the current sort order) is retained.
wkda_overlap_sig_hs_sorted_drug = wkda_overlap_sig_hs_sorted_drug.drop_duplicates(
    subset="Drug", keep="first"
)

In [None]:
# Order the de-duplicated drugs by rank, Jaccard score and P value, all descending.
wkda_overlap_sig_hs_sorted_rank = wkda_overlap_sig_hs_sorted_drug.sort_values(
    by=["Within Species Rank", "Jaccard Score", "P value"],
    ascending=False,
)

In [None]:
wkda_overlap_sig_hs_sorted_rank

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index; the old index is discarded.
wkda_overlap_sig_hs_sorted_rank = wkda_overlap_sig_hs_sorted_rank.reset_index(drop=True)

In [None]:
wkda_overlap_sig_hs_sorted_rank

In [None]:
# Independent copy of the filtered overlap table for the downstream steps.
wkda_overlap_filt_results = wkda_overlap_sig_hs_sorted_rank.copy(deep=True)

testing start

In [None]:
GSEA_overlap_filt_results

In [None]:
wkda_overlap_filt_results

In [None]:
# Scratch check: drugs present in the filtered overlap results of both arms.
overlapcommon = GSEA_overlap_filt_results.merge(
    wkda_overlap_filt_results,
    how="inner",
    left_on="Drug",
    right_on="Drug",
)

In [None]:
overlapcommon

testing end

Filtered result dfs used in the subsequent steps:
- wkda_net_meta_filt_results
- wkda_net_500_filt_results
- wkda_overlap_filt_results

In [None]:
wkda_net_meta_filt_results.head()

In [None]:
wkda_net_500_filt_results.head()

In [None]:
wkda_overlap_filt_results.head()

In [None]:
# Working copies for the merge steps: the overlap table keeps every column, the
# two network tables keep only their identifying columns.
wkda_overlap_results = wkda_overlap_filt_results.copy()
wkda_net_500_results = wkda_net_500_filt_results.loc[:, ["Drug", "Species", "Tissue"]].copy()
wkda_net_meta_results = wkda_net_meta_filt_results.loc[:, ["Drug", "Species", "Tissue", "Dataset"]].copy()

In [None]:
# Tag the network-result columns with their origin so they remain distinguishable
# after merging. wkda_overlap_results deliberately keeps its original column names.
# Both frames are rebuilt from the *_filt_results tables, so re-running this cell
# cannot stack the suffix twice (which would break the 'Drug_frm_w_*' merge keys).
wkda_net_500_results = (
    wkda_net_500_filt_results[['Drug', 'Species', 'Tissue']]
    .add_suffix('_frm_w_nt500')
)
wkda_net_meta_results = (
    wkda_net_meta_filt_results[['Drug', 'Species', 'Tissue', 'Dataset']]
    .add_suffix('_frm_w_ntmeta')
)

In [None]:
wkda_overlap_results.head()

In [None]:
wkda_net_500_results.head()

In [None]:
wkda_net_meta_results.head()

In [None]:
# Inner join: drugs present in both the top-500 and the meta network results.
wkda_common_network_drugs = wkda_net_500_results.merge(
    wkda_net_meta_results,
    how="inner",
    left_on="Drug_frm_w_nt500",
    right_on="Drug_frm_w_ntmeta",
)

In [None]:
wkda_common_network_drugs

In [None]:
# Inner join: overlap drugs that also appear in both network result sets.
wkda_common_drugs_temp = wkda_overlap_results.merge(
    wkda_common_network_drugs,
    how="inner",
    left_on="Drug",
    right_on="Drug_frm_w_nt500",
)

In [None]:
# Retain only the overlap-table columns; the suffixed network columns are dropped.
wkda_common_drugs = wkda_common_drugs_temp.loc[
    :,
    [
        "Database", "Method", "Drug", "Species", "Tissue or Cell Line",
        "Study", "Dose", "Time", "Jaccard Score", "Odds Ratio",
        "P value", "Within Species Rank", "Overlap",
    ],
].copy()

Now that the drugs from each arm of the analysis have been extracted, the results from both arms will be compared to extract only drugs common to both arms

In [None]:
GSEA_common_drugs

In [None]:
wkda_common_drugs

In [None]:
# Suffix the GSEA-arm columns so they stay distinguishable once merged with the wKDA arm.
# The guard makes the cell idempotent: re-running it no longer yields "_GSEA_GSEA"
# columns, which would break the 'Drug_GSEA' merge key used in the following merge.
if not all(str(column).endswith('_GSEA') for column in GSEA_common_drugs.columns):
    GSEA_common_drugs = GSEA_common_drugs.add_suffix('_GSEA')

In [None]:
wkda_common_drugs.head()

In [None]:
# Inner join of the two arms: only drugs found by both wKDA and GSEA remain.
common_drugs_temp = wkda_common_drugs.merge(
    GSEA_common_drugs,
    how="inner",
    left_on="Drug",
    right_on="Drug_GSEA",
)

In [None]:
# Row-wise mean of the wKDA-arm rank (unsuffixed) and the GSEA-arm rank (_GSEA).
common_drugs_temp["Average Rank"] = common_drugs_temp.loc[
    :, ["Within Species Rank", "Within Species Rank_GSEA"]
].mean(axis="columns")

In [None]:
common_drugs_temp

In [None]:
# Keep the unsuffixed (wKDA-arm) descriptive columns plus the averaged rank.
common_drugs = common_drugs_temp.loc[
    :,
    [
        "Database", "Method", "Drug", "Average Rank", "Species",
        "Tissue or Cell Line", "Study", "Dose", "Time",
        "Jaccard Score", "Odds Ratio", "P value",
    ],
].copy()

In [None]:
common_drugs

In [None]:
# Final report columns, in presentation order.
Results = common_drugs.loc[
    :,
    ["Drug", "P value", "Jaccard Score", "Odds Ratio", "Average Rank", "Method", "Dose", "Time"],
].copy()

In [None]:
# Smallest P value first, then renumber the rows from zero.
Results = Results.sort_values(by="P value", ascending=True).reset_index(drop=True)

In [None]:
Results

In [None]:
# Destination folder for the combined (both-arm) results.
out_dir = "/Users/anb/Documents/CMEB-Lab/ESCC-Drug-Repositioning/DR-escc-data/output/GSE23400/1st-dual-arm-analysis/analysis-both-arms"

# Create the folder if it is missing; otherwise to_csv fails on a fresh checkout
# or a new analysis run where the directory does not exist yet.
os.makedirs(out_dir, exist_ok=True)

# Tab-separated output without the positional index column.
Results.to_csv(os.path.join(out_dir, "Common_Drugs.tsv"), sep='\t', index=False)