In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def save_secondary_file(filename,df):
    if not os.path.exists('../Output Files/Secondary TR List'):
        os.mkdir('../Output Files/Secondary TR List')
    
    df.to_csv("../Output Files/Secondary TR List/"+filename+".csv",index=False)

In [3]:
def save_primary_file(filename,df):
    if not os.path.exists('../Output Files/Primary TR List'):
        os.mkdir('../Output Files/Primary TR List')
    
    df.to_csv("../Output Files/Primary TR List/"+filename+".csv",index=False)

In [4]:
def save_TREA(filename,df):
    if not os.path.exists('../Output Files/TR List'):
        os.mkdir('../Output Files/TR List')
    
    df.to_csv("../Output Files/TR List/"+filename+".csv",index=False)

In [5]:
def create_logFC_mapping(df):
    gene_to_logFC = []
    genes = df["Gene.Symbol"].unique()
    for gene in genes:
        try:
            temp  = df.loc[df["Gene.Symbol"] == gene]
            gene_to_logFC.append((gene,temp["logFC"].sum()/len(temp)))
        except Exception as e:
            print(e)
    return pd.DataFrame.from_records(gene_to_logFC,columns=["Gene.Symbol","logFC"])

In [6]:
def clean_database_df(df):
    cleaned_data = []
    terms = df["term"].unique()
    for term in terms:
        try:
            temp  = df.loc[df["term"] == term]
            cleaned_data.append((term,';'.join(temp['genes'])))
        except Exception as e:
            print(e)
    return pd.DataFrame.from_records(cleaned_data,columns=["term","genes"])

In [7]:
def get_unique_genes(df):
    df['genes'] = df['genes'].apply(lambda x: ';'.join(list(set(x.split(';')))))
    df['genes'] = df['genes'].apply(lambda x: x[1:] if x[0]==';' else x)
    return df

In [8]:
def load_database_files(disease):
    try:
        df_temp = pd.read_csv('../Sample Files/Chea files/Chea TR_'+disease[0]+'_Clean.csv')[['term','genes']]
    except:
        df_temp = pd.read_excel('../Sample Files/Chea files/Chea TR_'+disease[0]+'_Clean.xlsx')[['term','genes']]
    df_chea = pd.DataFrame()
    df_chea['term'] = df_temp["term"].str.capitalize().str.strip()
    df_chea['genes'] = df_temp['genes']
    df_chea = clean_database_df(df_chea)
    df_chea['source'] = "Chea"
    
    try:
        df_temp = pd.read_csv('../Sample Files/ingenuity files/ingenuity TR_'+disease[0]+'_clean.csv')[['term','genes']]
    except:
        df_temp = pd.read_excel('../Sample Files/ingenuity files/ingenuity TR_'+disease[0]+'_clean.xlsx')[['term','genes']]
    df_ipa = pd.DataFrame()
    df_ipa['term'] = df_temp["term"].str.capitalize().str.strip()
    df_ipa['genes'] = df_temp['genes']
    df_ipa = clean_database_df(df_ipa)
    df_ipa['source'] = "IPA"
    
    if (disease[0] != "MC"):
        try:
            df_temp = pd.read_csv('../Sample Files/JASPAR-TRANSFAC files/JASPAR-TRANSFAC TR_'+disease[0]+'_Clean.csv')[['term','genes']]
        except:
            df_temp = pd.read_excel('../Sample Files/JASPAR-TRANSFAC files/JASPAR-TRANSFAC TR_'+disease[0]+'_Clean.xlsx')[['term','genes']]
        df_jasper = pd.DataFrame()
        df_jasper['term'] = df_temp["term"].str.capitalize().str.strip()
        df_jasper['genes'] = df_temp['genes']
        df_jasper = clean_database_df(df_jasper)
        df_jasper['source'] = "JASPAR-TRANSFAC"
    else:
        df_jasper = pd.DataFrame(columns=["term","genes","source"])
    return df_chea, df_ipa, df_jasper

In [9]:
def clean_genes(df):
    df_deg = pd.read_csv("../Sample Files/DEG/"+disease[0]+'.csv')[["logFC","Refseq_ID"]].rename(columns={'Refseq_ID':'term'})
    df = df.drop(['logFC'],axis=1)
    df["index1"] = df.index
    df = (df.set_index(['index1','term'])
    .stack()
    .str.split(';',expand=True)
    .stack()
    .unstack(-2)
    .reset_index(-1,drop = True)
    .reset_index()
    .drop("index1",axis = 1))

    df["target.uniq"] = df["genes"].str.capitalize().str.strip()
    df = df.drop(['genes','source'],axis=1)
    df = df.merge(df_deg, how='left', left_on='target.uniq', right_on='term').drop("term",axis=1)
    return df

In [10]:
def find_secondary_trs(disease,df_chea, df_ipa, df_jasper):
    df_union = pd.concat([df_chea,df_ipa,df_jasper])
    df_union = df_union.groupby(df_union['term']).aggregate(";".join)
    df_union.reset_index(inplace=True)
    df_union["term"] = df_union["term"].str.capitalize().str.strip()
    df_deg = pd.read_csv("../Sample Files/DEG/"+disease[0]+'.csv')[["logFC","Refseq_ID"]].rename(columns={'Refseq_ID':'term'})
    df_deg["term"] = df_deg["term"].str.capitalize().str.strip()
    return df_deg.merge(df_union, how='inner', left_on='term', right_on='term')

In [11]:
def find_primary_trs(disease, df_chea, df_ipa, df_jasper):
    df_ipa_chea = df_ipa.merge(df_chea, how='inner',left_on='term',right_on='term')
    df_ipa_chea['source'] = 'Chea'
    df_ipa_chea['genes'] = df_ipa_chea['genes_x'] +';' +df_ipa_chea['genes_y']
    df_ipa_chea['genes'] = df_ipa_chea['genes'].apply(lambda x: ';'.join(list(set(x.split(';')))))
    df_ipa_chea.drop(['genes_x', 'genes_y','source_y','source_x'], axis=1,inplace=True)
    
    df_ipa_jasper = df_ipa.merge(df_jasper, how='inner',left_on='term',right_on='term')
    df_ipa_jasper['source'] = 'JASPAR-TRANSFAC'
    df_ipa_jasper['genes'] = df_ipa_jasper['genes_x'] +';' +df_ipa_jasper['genes_y']
    df_ipa_jasper.fillna('', inplace=True)
    df_ipa_jasper['genes'] = df_ipa_jasper['genes'].apply(lambda x: ';'.join(list(set(x.split(';')))))
    df_ipa_jasper.drop(['genes_x', 'genes_y','source_y','source_x'], axis=1,inplace=True)
    
    df_primary = df_ipa_jasper.merge(df_ipa_chea, how='outer',left_on='term',right_on='term')
    df_primary.fillna('', inplace=True)
    df_primary['IPA'] = 'IPA'
    df_primary['source'] = df_primary[['IPA','source_x', 'source_y']].apply(lambda x: ';'.join(x), axis=1)
    df_primary['genes'] = df_primary[['genes_x', 'genes_y']].apply(lambda x: ';'.join(x), axis=1)

    return df_primary.drop(['genes_x', 'genes_y','source_y','source_x','IPA'], axis=1)

In [12]:
def clean_primary(df_primary):
    df = df_primary[['term','source']]
    df["index1"] = df.index
    df = (df.set_index(['index1','term'])
    .stack()
    .str.split(';;',expand=True)
    .stack()
    .unstack(-2)
    .reset_index(-1,drop = True)
    .reset_index()
    .drop("index1",axis=1))

    df = df.groupby(df['term']).aggregate(";".join)
    df.reset_index(inplace=True)
    return (df_primary.merge(df,how='left', left_on='term', right_on='term').drop("source_x",axis=1)
            .rename(columns={'source_y':'source'}))


In [13]:
def find_trs(disease):
    df_chea, df_ipa, df_jasper = load_database_files(disease)
    df_secondary = find_secondary_trs(disease,df_chea, df_ipa, df_jasper)
    df_secondary['term'] = df_secondary['term'].str.capitalize().str.strip()
    df_secondary.drop_duplicates(inplace=True)
    
    df_primary = find_primary_trs(disease,df_chea, df_ipa, df_jasper)
    
    df_primary = df_primary.merge(df_secondary[['term','logFC']],how='left',left_on='term',right_on='term')
    df_primary['term'] = df_primary['term'].str.capitalize().str.strip()

    df_primary.fillna(0, inplace=True)
    df_primary['genes'] = df_primary['genes'].apply(lambda x: x[1:] if x[0]==';' else x)
    df_primary['source'] = df_primary['source'].apply(lambda x: x[1:] if x[0]==';' else x)
    
    df_primary = clean_primary(df_primary)
    df_primary = get_unique_genes(df_primary)
    df_primary.drop_duplicates(inplace=True)
    secondary_term = list(set(list(df_secondary['term'].unique())) - set(list(df_primary['term'].unique())))
    
    df_secondary = df_secondary.loc[df_secondary['term'].isin(secondary_term)]
    df_secondary.reset_index()
    df_secondary = get_unique_genes(df_secondary)
    
    return df_secondary,df_primary

In [14]:

diseases ={
      'LPS_Cord':'LPS_Cord',
}

In [15]:
for disease in diseases.items():
    print(disease[0])
    df_secondary,df_primary = find_trs(disease)
    save_secondary_file(disease[1],df_secondary)
    save_primary_file(disease[1],df_primary)
    
#     When Type I and II
    save_TREA(disease[1],pd.concat([df_primary,df_secondary])[['term','genes','logFC','source']])
    
#     When only Type I
#     df_secondary(Type II) is not needed for Astro-non-enriched
#     save_TREA(disease[1],df_primary[['term','genes','logFC','source']])    

LPS_Cord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
