# Notebook that prepares the file to be read by the pipeline. It requires the output of HotMAPS.  

(default: hotspot_regions_gene_0.05.txt). 


## Imports

In [1]:
import pandas as pd
import numpy as np
import statsmodels.sandbox.stats.multicomp as mp

## Default files

In [2]:
file_hotspots_gene = "/home/fran/Documents/clusters3D/output_filtered2/hotspot_regions_gene_0.05.txt"
f_info_density = "/home/fran/Documents/clusters3D/output_filtered2/mtc_output_min_0.05.txt"


# 1. Read the input file

In [3]:
def find_hotspots_gene_cancer(file_hotspots_gene):
    """Summarise the hotspot regions listed for each gene/cancer-type pair.

    Every non-blank line of the tab-separated input is read as::

        GENE<TAB>Cancer_Type<TAB>HotSpot1<TAB>HotSpot2...

    where each hotspot is a ``;``-separated list of ``Transcript:RES``
    entries, e.g. ``ENST00000263967:1047;ENST00000263967:1043``.

    Parameters
    ----------
    file_hotspots_gene : str
        Path of the hotspot regions file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank input line with the columns ``GENE``,
        ``Cancer_Type``, ``TOTAL_HSP`` (number of hotspots on the line),
        ``TOTAL_RES_HSP`` (number of distinct residues over all hotspots of
        the line) and ``MEDIAN_RES_HSP`` (median number of residues per
        hotspot).
    """
    list_hotspots = []

    # The context manager closes the file even when a malformed line raises.
    with open(file_hotspots_gene) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing empty line) hold no record.
                continue
            data = line.split('\t')
            gene = data[0]
            cancer = data[1]
            hotspots = data[2:]

            # Residues seen in any hotspot of this line; a residue shared by
            # several hotspots counts once here but in every hotspot's size.
            residues = set()
            sizes_hotspots = []
            for hs in hotspots:
                aas = hs.split(";")
                sizes_hotspots.append(len(aas))
                residues.update(aas)

            list_hotspots.append(
                [gene, cancer, len(hotspots), len(residues), np.median(sizes_hotspots)]
            )

    return pd.DataFrame(list_hotspots,columns=["GENE","Cancer_Type","TOTAL_HSP","TOTAL_RES_HSP","MEDIAN_RES_HSP"])
# Build the per gene / cancer-type hotspot summary from the default hotspot file.
df_hotspots = find_hotspots_gene_cancer(file_hotspots_gene)

# 2. Read the density file

In [6]:
def get_density_and_pvalue_gene(file_info_density_gene):
    """Load the tab-separated density / p-value table.

    Parameters
    ----------
    file_info_density_gene : str
        Path of the tab-separated file; its first line is used as the header.

    Returns
    -------
    pandas.DataFrame
        The file contents, unmodified.

    Notes
    -----
    NOTE(review): the original comment listed the columns as
    "Structure, Tumor Type, Model, Chain, Mutation Residues, Residue Mutation
    Count, Mutation Density, Hotspot P-value", but the code that consumes this
    table in the notebook reads "HUGO Symbol", "Sequence Ontology Transcript",
    "Tumor Type", "Min p-value" and "q-value" -- confirm the actual schema.
    """
    return pd.read_csv(file_info_density_gene, sep="\t")

# Load the density table from the default path.
df_density_aa = get_density_and_pvalue_gene(f_info_density)
# Keep the smallest p-value and q-value for every gene / transcript / tumor
# type combination. The string alias "min" selects the same groupby reduction
# that pandas substitutes for np.min, without the deprecation warning newer
# pandas versions emit when a NumPy callable is passed to agg.
df_density_aa = df_density_aa.groupby(
    ["HUGO Symbol", "Sequence Ontology Transcript", "Tumor Type"],
    as_index=False,
).agg({"Min p-value": "min", "q-value": "min"})

# 3. Get the hotspots data, match it with the density file

In [7]:
def _min_density_value(row, df_density_aa, column):
    """Return the smallest ``column`` value recorded for the row's gene and cancer type.

    Rows whose ``Cancer_Type`` is ``"REF"`` always get ``0.0``. When the
    density table holds no entry for the gene / cancer-type pair, NaN is
    returned instead of letting ``np.min`` fail on an empty array.
    """
    if row["Cancer_Type"] == "REF":
        return 0.0
    same_gene = df_density_aa["HUGO Symbol"] == row["GENE"]
    same_tumor = df_density_aa["Tumor Type"] == row["Cancer_Type"]
    values = df_density_aa[same_gene & same_tumor][column].values
    if values.size == 0:
        # No density record for this pair: report a missing value.
        return np.nan
    return np.min(values)


def get_pvalue(row,df_density_aa):
    """Return the minimum "Min p-value" of the density table for ``row``.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``"GENE"`` and ``"Cancer_Type"``.
    df_density_aa : pandas.DataFrame
        Must provide the columns ``"HUGO Symbol"``, ``"Tumor Type"`` and
        ``"Min p-value"``.

    Returns
    -------
    float
        ``0.0`` for the ``"REF"`` cancer type, NaN when the pair is absent
        from ``df_density_aa``, otherwise the smallest matching value.
    """
    return _min_density_value(row, df_density_aa, "Min p-value")


def get_qvalue(row,df_density_aa):
    """Return the minimum "q-value" of the density table for ``row``.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``"GENE"`` and ``"Cancer_Type"``.
    df_density_aa : pandas.DataFrame
        Must provide the columns ``"HUGO Symbol"``, ``"Tumor Type"`` and
        ``"q-value"``.

    Returns
    -------
    float
        ``0.0`` for the ``"REF"`` cancer type, NaN when the pair is absent
        from ``df_density_aa``, otherwise the smallest matching value.
    """
    return _min_density_value(row, df_density_aa, "q-value")
# Annotate each hotspot row with the minimum p-value and q-value found in the
# density table for the same gene and cancer type.
df_hotspots["Min p-value"] = df_hotspots.apply(get_pvalue, axis=1, args=(df_density_aa,))
df_hotspots["q-value"] = df_hotspots.apply(get_qvalue, axis=1, args=(df_density_aa,))
    

# 4. Include the data of the non-significant genes

In [8]:
# Gene / cancer-type pairs that already have a hotspot row. Building the set
# once replaces a full scan of df_hotspots for every density row.
hotspot_pairs = set(zip(df_hotspots["GENE"], df_hotspots["Cancer_Type"]))

# Every density row without a hotspot entry becomes a row with zero hotspots,
# zero residues and a zero median, keeping its own p-value and q-value.
rows = [
    [gene, tumor_type, 0, 0, 0, p_value, q_value]
    for gene, tumor_type, p_value, q_value in zip(
        df_density_aa["HUGO Symbol"],
        df_density_aa["Tumor Type"],
        df_density_aa["Min p-value"],
        df_density_aa["q-value"],
    )
    if (gene, tumor_type) not in hotspot_pairs
]
# Reuse the column names of df_hotspots so both frames can be concatenated.
df_non_significant = pd.DataFrame(rows,columns=df_hotspots.columns.values)

# 5. Concat both DataFrames

In [9]:
# Stack the hotspot rows on top of the non-significant rows. The index is not
# reset, so index labels of the two frames may repeat in the result.
df_hotspots_all = pd.concat([df_hotspots,df_non_significant])

# 6. Save it

In [8]:
# Reload a previously saved table from disk.
# NOTE(review): this rebinds df_hotspots_all. If the cells are run top to
# bottom, the table built by the concat step is discarded and the to_csv call
# that follows writes the old file contents back to this same path -- confirm
# whether this cell should be kept.
df_hotspots_all = pd.read_csv("/home/fran/Documents/clusters3D/output_filtered2/parsed_hotspot_output_signatures_filtered.csv",sep="\t")

In [10]:
# Write the table as a tab-separated file, without the index column.
df_hotspots_all.to_csv("/home/fran/Documents/clusters3D/output_filtered2/parsed_hotspot_output_signatures_filtered.csv",sep="\t",index=False)

In [18]:
# Load the table stored under the "output" directory (not "output_filtered2")
# into a separate variable.
# NOTE(review): df_hotspots_all2 is not used in the visible part of the notebook.
df_hotspots_all2 = pd.read_csv("/home/fran/Documents/clusters3D/output/parsed_hotspot_output_full.csv",sep="\t")

In [10]:
# Preview the 15 PAAD rows with the smallest q-values.
df_hotspots_all[df_hotspots_all["Cancer_Type"]=="PAAD"].sort_values("q-value").head(15)

Unnamed: 0,GENE,Cancer_Type,TOTAL_HSP,TOTAL_RES_HSP,MEDIAN_RES_HSP,Min p-value,q-value
160,TP53,PAAD,1,32,32.0,3e-06,0.001271
161,KRAS,PAAD,1,3,3.0,4e-06,0.001271
159,GNAS,PAAD,1,1,1.0,1.8e-05,0.002278
157,SMAD4,PAAD,1,4,4.0,5.3e-05,0.004549
158,KCNB2,PAAD,1,1,1.0,0.000655,0.039401
33011,DOCK1,PAAD,0,0,0.0,0.016762,0.788544
89320,PIK3CG,PAAD,0,0,0.0,0.019669,0.886737
88645,PHIP,PAAD,0,0,0.0,1.0,1.0
88678,PHKA2,PAAD,0,0,0.0,1.0,1.0
88702,PHKB,PAAD,0,0,0.0,1.0,1.0
