In [None]:
#Import required functions
import ast
import json
import os
import time
import urllib.request
import warnings

import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import chi2
from scipy.stats.mstats import zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import quantile_transform
#Working directory holding the input files (metadata, annotation, gene tables) and the output.
#Change the default below, or set the AUTOPHAGY_WORKDIR environment variable to override it
#without editing the notebook. (The former one-second time.sleep() was removed: nothing
#in this cell needs to be waited on.)
WORKDIR = os.environ.get('AUTOPHAGY_WORKDIR', '/Users/MaayanLab/Desktop/Allie')

In [None]:
#Define merge function for combining sample lists
def merge(dict1, dict2):
    """Return a new dictionary holding the entries of dict1 followed by those of dict2.

    Neither input is modified. When a key is present in both, the value from
    dict2 is kept (the key stays at the position it had in dict1).
    """
    combined = dict(dict1)
    combined.update(dict2)
    return combined

In [None]:
#Define characteristic direction function
def chdir(data, sampleclass, genes, gamma=1., sort=True, calculate_sig=False, nnull=10, sig_only=False, norm_vector=True):
    """Compute the characteristic direction separating two classes of samples.

    Parameters
    ----------
    data : array with an ``astype`` method, indexed as ``data[gene, sample]``
        Expression matrix; rows line up with ``genes``, columns with ``sampleclass``.
    sampleclass : array-like of int, one entry per column of ``data``
        0 = sample is ignored, 1 = control, 2 = experiment.
    genes : iterable
        Gene labels, one per row of ``data``.
    gamma : float
        Shrinkage weight: the covariance estimate used is
        ``gamma*dd + sigma*(1-gamma)*I``.
    sort : bool
        If True, results are ordered by decreasing absolute coefficient.
    calculate_sig : bool
        If True, also build a null distribution from ``nnull`` random draws and
        attach a ratio-to-null value to each returned tuple.
    nnull : int
        Number of null directions drawn when ``calculate_sig`` is True.
    sig_only : bool
        With ``calculate_sig``, return only the leading entries up to the
        position where the ratio curve peaks.
    norm_vector : bool
        If True, the direction vector is scaled to unit Euclidean norm.

    Returns
    -------
    list of tuples
        ``(coefficient, gene)`` when ``calculate_sig`` is False, otherwise
        ``(coefficient, gene, ratio)``.

    Notes
    -----
    * The null draws use ``np.random`` without seeding, so ``calculate_sig``
      results vary between runs.
    * The ratios are computed from coefficients sorted by decreasing magnitude,
      so they only line up with the returned genes when ``sort=True``.
    """
    # Bug fix: the original called data.astype(float) and threw the result away,
    # so non-float input (e.g. integer or object arrays) was never converted.
    data = data.astype(float)
    # Accept plain lists as well as arrays for the class labels.
    sampleclass = np.asarray(sampleclass)
    m_non0 = sampleclass != 0
    m1 = sampleclass[m_non0] == 1
    m2 = sampleclass[m_non0] == 2

    data = data[:, m_non0]
    # zscore runs along axis 0 (the scipy default), i.e. down each sample column.
    data = zscore(data)

    n1 = m1.sum()  # number of controls
    n2 = m2.sum()  # number of experiments

    meanvec = data[:, m2].mean(axis=1) - data[:, m1].mean(axis=1)

    pca = PCA(n_components=None)
    pca.fit(np.array(data.T))

    # Keep only the components that each explain more than 0.1% of the variance.
    variance_ratios = pca.explained_variance_ratio_
    keepPC = len(variance_ratios[variance_ratios > 0.001])

    v = pca.components_[0:keepPC].T
    r = pca.transform(data.T)[:, 0:keepPC]
    # Pooled within-class scatter in PC space.
    dd = (np.dot(r[m1].T, r[m1]) + np.dot(r[m2].T, r[m2])) / float(n1 + n2 - 2)
    sigma = np.mean(np.diag(dd))

    shrunkMats = np.linalg.inv(gamma * dd + sigma * (1 - gamma) * np.eye(keepPC))
    b = np.dot(v, np.dot(np.dot(v.T, meanvec), shrunkMats))

    if norm_vector:
        b /= np.linalg.norm(b)

    # Materialised as a list so it can be iterated safely whichever branch follows.
    grouped = list(zip([abs(item) for item in b], b, genes))
    if sort:
        grouped = sorted(grouped, key=lambda x: x[0], reverse=True)

    if not calculate_sig:  # return b and genes only
        return [(item[1], item[2]) for item in grouped]

    # Generate a null distribution of characteristic directions.
    nu = n1 + n2 - 2
    y1 = np.random.multivariate_normal(np.zeros(keepPC), dd, nnull).T * np.sqrt(nu / chi2.rvs(nu, size=nnull))
    y2 = np.random.multivariate_normal(np.zeros(keepPC), dd, nnull).T * np.sqrt(nu / chi2.rvs(nu, size=nnull))
    y = y2 - y1

    nullchdirs = []
    for col in y.T:
        bn = np.dot(np.dot(np.dot(v, shrunkMats), v.T), np.dot(col, v.T))
        bn /= np.linalg.norm(bn)
        bn = bn ** 2
        bn.sort()
        bn = bn[::-1]  # descending order
        nullchdirs.append(bn)

    nullchdirs = np.array(nullchdirs).T
    nullchdirs = nullchdirs.mean(axis=1)
    b_s = b ** 2
    b_s.sort()
    b_s = b_s[::-1]  # squared coefficients in descending order
    relerr = b_s / nullchdirs  # relative error
    # Cumulative share of the observed/null ratio minus the uniform expectation.
    ratios = np.cumsum(relerr) / np.sum(relerr) - np.linspace(1. / len(meanvec), 1, len(meanvec))
    res = [(item[1], item[2], ratio) for item, ratio in zip(grouped, ratios)]
    n_significant = np.argmax(ratios) + 1
    print('Number of significant genes: %s'%(n_significant))
    if sig_only:
        return res[0:n_significant]
    return res

In [None]:
#Microarray Analysis Pipeline
def _load_probe_annotation(path):
    """Read a whitespace-delimited 'platform probe symbol' file into a {probe: symbol} dict."""
    probe_to_symbol = {}
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on the unpack below
            (platform, probe, symbol) = line.split()
            probe_to_symbol[probe] = symbol
    return probe_to_symbol


def _quantile_normalize(expression):
    """Quantile-normalize a probes x samples frame so all sample columns share one distribution.

    Expects a frame without missing values. Tied values receive the reference
    value of their lowest rank (rank method 'min').
    """
    values = expression.to_numpy(dtype=float)
    # Reference distribution: mean across samples of the k-th smallest value.
    rank_mean = np.sort(values, axis=0).mean(axis=1)
    ranks = expression.rank(method='min').to_numpy().astype(int) - 1
    return pd.DataFrame(rank_mean[ranks], index=expression.index, columns=expression.columns)


def micro_analysis(accession_id, control_samples, treated_samples, n_genes=600, expression_quantile=0.3):
    """Run the microarray pipeline for one GEO series and return its up/down gene lists.

    Steps: download the series with GEOparse, keep the probes whose median
    expression is at or above ``expression_quantile``, drop probes with missing
    values, quantile-normalize, map probes to gene symbols using
    ``WORKDIR + '/probe2gene.txt'``, then rank genes with ``chdir``.

    Parameters
    ----------
    accession_id : str
        GEO series accession passed to ``GEOparse.get_GEO``.
    control_samples, treated_samples : iterable of str
        Sample names (columns of the series' pivoted VALUE table).
    n_genes : int, default 600
        Number of top genes (by absolute coefficient) split into up and down.
    expression_quantile : float, default 0.3
        Quantile of the per-probe medians used as the expression cut-off.

    Returns
    -------
    (up_list, dn_list) : tuple of lists of gene symbols
        Genes among the top ``n_genes`` with positive / negative coefficient.

    Raises
    ------
    ValueError
        If either sample group is empty.

    Relies on the notebook-level names ``merge``, ``chdir`` and ``WORKDIR``.
    """
    #Creating a dictionary of assigned control and treated samples
    control_samples = {i: 'control' for i in control_samples}
    treated_samples = {i: 'treated' for i in treated_samples}
    if not control_samples or not treated_samples:
        raise ValueError('micro_analysis needs at least one control and one treated sample')
    all_samples = merge(control_samples, treated_samples)
    list_samples = list(all_samples.keys())

    #Parse the GEO data using the Accession ID
    gse = GEOparse.get_GEO(geo=accession_id, destdir="./")

    #Expression matrix (probes x samples), columns in the order of list_samples
    pivoted_samples = gse.pivot_samples('VALUE')[list_samples]

    #Filtering out unexpressed probes
    probe_medians = pivoted_samples.median(axis=1)
    expression_threshold = probe_medians.quantile(expression_quantile)
    expressed_probes = probe_medians[probe_medians >= expression_threshold].index.tolist()

    # Reuse the already-ordered matrix. The original pivoted a second time and
    # filtered with isin(), which keeps GEO's own column order and could therefore
    # disagree with the control-then-treated order assumed for the class labels.
    exprsdata = pivoted_samples.loc[expressed_probes]
    # Bug fix: the original dropna() result was discarded.
    exprsdata = exprsdata.dropna(axis=0)

    # Bug fix: the original computed the quantile-normalized matrix on the transposed
    # (samples x probes) frame and never assigned it, so no normalization took place.
    exprsdata = _quantile_normalize(exprsdata)

    #Replace probe IDs with gene symbols from the annotation file
    probe_to_symbol = _load_probe_annotation(WORKDIR + '/probe2gene.txt')
    exprsdata.index = exprsdata.index.astype(str)
    symbols = exprsdata.index.to_series().map(probe_to_symbol)
    data = exprsdata.set_index(pd.Index(symbols.to_numpy(), name='symbol'))
    #Drop rows that aren't associated with a particular gene symbol
    data = data[data.index.notna()].dropna()

    #Sample classes derived from the actual data columns: 1 = control, 2 = treated
    sample_class = np.array(
        [1 if all_samples[sample] == 'control' else 2 for sample in data.columns],
        dtype=np.int32,
    )

    #CD results; warnings are silenced only for the duration of this call
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        cd_res = chdir(data.values, sample_class, data.index, gamma=.5, sort=False, calculate_sig=False)

    cd_coefs = np.array([coefficient for coefficient, _gene in cd_res])
    srt_idx = np.abs(cd_coefs).argsort()[::-1]
    top_coefs = cd_coefs[srt_idx][:n_genes]
    top_genes = data.index[srt_idx][:n_genes]
    # dict() keeps one entry per gene symbol even when several probes map to it.
    up_genes = dict(zip(top_genes[top_coefs > 0], top_coefs[top_coefs > 0]))
    dn_genes = dict(zip(top_genes[top_coefs < 0], top_coefs[top_coefs < 0]))

    #Up genes and down genes
    return list(up_genes.keys()), list(dn_genes.keys())


In [None]:
#Load the study metadata table from the working directory and display it
metadata_path = WORKDIR + '/example_metadata.csv'
metadata = pd.read_csv(metadata_path)
metadata

In [None]:
#Run the analysis for every study in the metadata file and collect its up and down gene lists.
#Keys have the form "<accession> [<platform>, <cell type>, ...] up|dn".
METADATA_LABEL_COLUMNS = [
    'GEO Platform',
    'Cell Type',
    'Altered Condition',
    'Drug Name',
    'Name of the Perturbed Gene',
    'Gene Alteration',
]
up_data = {}
dn_data = {}
for _, study in metadata.iterrows():
    accession_id = study['GEO Accession Number']
    metadata_list = [str(study[column]) for column in METADATA_LABEL_COLUMNS]
    #Sample IDs are comma-separated; empty entries (e.g. from a trailing comma) are skipped
    control_samples = [c.strip() for c in study['Control Samples'].split(',') if c.strip()]
    treated_samples = [t.strip() for t in study['Treated Samples'].split(',') if t.strip()]
    up_genes, dn_genes = micro_analysis(accession_id, control_samples, treated_samples)
    #Same label prefix for both directions (the 'dn' key used to lack the separating space)
    label = accession_id + ' ' + str(metadata_list)
    up_data[label + ' up'] = up_genes
    dn_data[label + ' dn'] = dn_genes
    #One progress line per study instead of dumping both growing dictionaries every iteration
    print('%s: %d up genes, %d down genes' % (accession_id, len(up_genes), len(dn_genes)))

In [None]:
#Combine the up and down gene sets into a single dictionary
updn_data = merge(up_data, dn_data)
#Labels (the dictionary keys), in insertion order
updn_terms = list(updn_data.keys())
#Gene sets, in the same order as the labels
updnval = list(updn_data.values())

In [None]:
#Final version of the desired format of labels.
#Each term looks like "<accession> ['<platform>', '<cell type>', ...] up|dn"; the bracketed
#part is the str() of a Python list, so it is parsed back with ast.literal_eval rather than
#split on ', ' (which breaks when a metadata value itself contains a comma or an apostrophe).
final_terms = []
for term in updn_terms:
    list_start = term.index('[')
    list_end = term.rindex(']')
    accession = term[:list_start].strip()
    fields = [str(field) for field in ast.literal_eval(term[list_start:list_end + 1])]
    #The term may carry only six metadata fields (no effect entry); the original lookup of a
    #seventh field then raised IndexError. Missing fields fall back to 'nan', the placeholder
    #the other gene-set labels in this notebook use.
    fields += ['nan'] * (7 - len(fields))
    platform, cell_type, physical_alt, chemical_alt, genetic_alt, genetic_alt_type, effect = fields[:7]
    updown = term[list_end + 1:].strip()
    #Customize which categories of metadata to include on the labels
    final_terms.append(str([str(accession), str(effect), str(updown)]))

In [None]:
#Re-key the microarray gene sets by their shortened labels.
#If two labels are identical, the later gene set replaces the earlier one.
micro_data = {label: gene_set for label, gene_set in zip(final_terms, updnval)}

In [None]:
#Load genes from the autophagy Geneshot search and rank them by count*frac
geneshot = pd.read_csv('geneshot_genes.tsv', sep = '\t')
#Vectorized product of the two columns (replaces a row-by-row .iloc loop; same values)
geneshot['count*frac'] = (
    geneshot['Publication count']
    * geneshot['Fraction of publications from total gene publication']
)
geneshot = geneshot.sort_values(by=['count*frac'], ascending = False)
#Keep the 300 top-ranked genes
geneshot_list = list(geneshot['Gene'])[:300]

#Geneshot prediction tables: (label, file). Each file contributes its whole 'Gene' column.
GENESHOT_PREDICTION_FILES = [
    ('Geneshot AutoRif Predictions', 'autorif_predictions.tsv'),
    ('Geneshot GeneRif Predictions', 'generif_predictions.tsv'),
    ('Geneshot Enrichr Predictions', 'enrichr_predictions.tsv'),
    ('Geneshot Tagger Predictions', 'tagger_predictions.tsv'),
    ('Geneshot ARCHS4 Predictions', 'archs4_predictions.tsv'),
]

#All geneshot gene sets, keyed by str([source, 'nan', 'nan']) like the other sources
all_geneshot = {str(['Geneshot Autophagy Search', 'nan', 'nan']): geneshot_list}
for geneshot_label, geneshot_file in GENESHOT_PREDICTION_FILES:
    geneshot_predictions = pd.read_csv(geneshot_file, sep = '\t')
    all_geneshot[str([geneshot_label, 'nan', 'nan'])] = list(geneshot_predictions['Gene'])


In [None]:
#Load 3 sets of DEGs from the most relevant BioJupies studies
def _updown_gene_sets(table, study_id, study_effect):
    """Split a table with 'Gene' and 'Up/Down' columns into labelled up and dn gene lists.

    Rows whose 'Up/Down' value is exactly 'up' go to the up set; every other
    row goes to the dn set. Keys are str([study_id, study_effect, 'up'|'dn']).
    """
    is_up = table['Up/Down'] == 'up'
    return {
        str([study_id, study_effect, 'up']): list(table[is_up]['Gene']),
        str([study_id, study_effect, 'dn']): list(table[~is_up]['Gene']),
    }

#(file under WORKDIR, GEO accession, effect)
BIOJUPIES_STUDIES = [
    ('/biojupies_genes1.tsv', 'GSE89672', 'Activate'),
    ('/biojupies_genes2.tsv', 'GSE100888', 'Inhibit'),
    ('/biojupies_genes3.tsv', 'GSE72091', 'Activate'),
]

#All biojupies gene sets: up then dn for each study, in the order listed above
biojupies = {}
for biojupies_file, biojupies_id, biojupies_effect in BIOJUPIES_STUDIES:
    biojupies_table = pd.read_csv(WORKDIR + biojupies_file, sep = '\t')
    biojupies.update(_updown_gene_sets(biojupies_table, biojupies_id, biojupies_effect))

In [None]:
#Combine the biojupies and geneshot gene sets (geneshot wins on a shared key)
bio_geneshot = {**biojupies, **all_geneshot}
#Add the microarray gene sets on top of those
biogsmicro_data = {**bio_geneshot, **micro_data}

In [None]:
#Fetch autophagy-related gene sets from Harmonizome's JSON API
def _harmonizome_gene_symbols(url):
    """Download one Harmonizome gene-set record and return the symbol of every associated gene.

    The response is JSON and is decoded directly with the json module. Every entry of
    'associations' is used; the former pairwise-merge loop started at index 1 and so
    silently dropped the first gene of each set.
    """
    with urllib.request.urlopen(url, timeout=60) as response:
        payload = json.loads(response.read().decode('utf-8'))
    return [association['gene']['symbol'] for association in payload['associations']]

#GO Biological Process annotation for autophagy
url1 = 'https://amp.pharm.mssm.edu/Harmonizome/api/1.0/gene_set/autophagy/GO+Biological+Process+Annotations'
gene_list1 = _harmonizome_gene_symbols(url1)
gene_dict1 = {str(['Gene Ontology', 'nan', 'nan']): gene_list1}

#KEGG pathway: regulation of autophagy
url2 = 'https://amp.pharm.mssm.edu/Harmonizome/api/1.0/gene_set/regulation+of+autophagy/KEGG+Pathways'
gene_list2 = _harmonizome_gene_symbols(url2)
gene_dict2 = {str(['KEGG Pathways', 'nan', 'nan']): gene_list2}

#Wikipathways: senescence and autophagy
url3 = 'https://amp.pharm.mssm.edu/Harmonizome/api/1.0/gene_set/Senescence+and+Autophagy%28Homo+sapiens%29/Wikipathways+Pathways'
gene_list3 = _harmonizome_gene_symbols(url3)
gene_dict3 = {str(['Wikipathways', 'nan', 'nan']): gene_list3}

#Merge Harmonizome dictionaries
gene_dict12 = merge(gene_dict1, gene_dict2)
lit_dict = merge(gene_dict12, gene_dict3)

In [None]:
#Load the CREEDS-generated up/down gene table from GSE41018
rapamycin = pd.read_csv(WORKDIR + '/creeds_rapamycin.tsv', sep='\t')
#Rows marked 'up' form the up set; every other row goes to the down set
rapamycin_is_up = rapamycin['Up/Down'] == 'up'
rapamycin_up = rapamycin[rapamycin_is_up]
rapamycin_dn = rapamycin[~rapamycin_is_up]

#Gene lists and their single-entry dictionaries
rapamycin_up_list = list(rapamycin_up['Gene'])
rapamycin_dn_list = list(rapamycin_dn['Gene'])
rapamycin_dict_up = {str(['CREEDS Rapamycin Search', 'nan', 'up']): rapamycin_up_list}
rapamycin_dict_dn = {str(['CREEDS Rapamycin Search', 'nan', 'dn']): rapamycin_dn_list}
#Both directions in one dictionary, up first
rapamycin_dict = {**rapamycin_dict_up, **rapamycin_dict_dn}

In [None]:
#Add the Harmonizome gene sets to the biojupies, microarray and geneshot ones
almost_all_data = {**biogsmicro_data, **lit_dict}

#Add the rapamycin gene sets to obtain the complete collection
all_data = {**almost_all_data, **rapamycin_dict}

#Labels and gene sets of the complete collection, in matching order
data_terms = list(all_data.keys())
data_vals = list(all_data.values())

In [None]:
#Preliminary dataframe: one row per gene set, with the label moved into an 'index' column
all_datadf = pd.DataFrame.from_dict(all_data, orient = 'index').reset_index()

In [None]:
#Make new labels for gmt file.
#Every key of all_data is the str() of a list [ID, effect, up/down], so it is parsed back
#with ast.literal_eval; splitting on ', ' and stripping quotes mangled values that
#themselves contain a comma or an apostrophe.
gmt_terms = []
for term in data_terms:
    term_fields = ast.literal_eval(term)
    accession = term_fields[0]
    effect = term_fields[1]
    updown = term_fields[2]
    #Customize which categories of metadata to include on the labels
    gmt_terms.append([str(accession), str(effect), str(updown)])

In [None]:
#Build a dataframe of the label metadata, to be joined with the preliminary data df
gmt_columns = ["ID", "Effect", "Up/Down"]
gmt_df = pd.DataFrame(data=gmt_terms, columns=gmt_columns)

In [None]:
#Place the metadata columns next to the gene columns, discard the raw label column
#and use the ID as the index
main_df = (
    pd.concat([gmt_df, all_datadf], axis=1)
    .drop(columns='index')
    .set_index('ID')
)

In [None]:
#Write the combined table to a tab-separated file in the working directory.
#NOTE(review): to_csv also writes a header row and pads short gene sets with empty
#fields - confirm the consumer of this .gmt file expects that layout.
all_autophagy_file = WORKDIR + '/autophagy_data.gmt'
main_df.to_csv(path_or_buf=all_autophagy_file, sep = '\t')