In [None]:
#Import required functions
import ast
import json
import os
import time
import warnings

import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import chi2
from scipy.stats.mstats import zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import quantile_transform
#Time sleep to prevent crashes
# NOTE(review): this pauses for one second; which crash it avoids is not evident from this notebook - confirm it is still needed
time.sleep(1)
#Change this to your working directory
# The two relative chdir calls run in sequence: first into ../Data, then from there into the
# sibling ../Scripts. The kernel therefore ends up in Scripts, and the first call raises if
# ../Data does not exist. Relative paths used in later cells are resolved from Scripts.
os.chdir('../Data')
os.chdir('../Scripts')
# Star import: presumably supplies micro_analysis and merge used in later cells, and presumably
# relies on the working directory being Scripts to locate the module - TODO confirm
from microarray_analysis import *

In [None]:
#Read metadata file
# Each row is treated as one study by the analysis loop in a later cell; the path is relative
# to the current working directory.
metadata = pd.read_csv('../Data/example_metadata.csv')
# Bare expression so the notebook renders the table for a visual check
metadata

## In the for loop below, change the metadata column labels to match the column names in your CSV

In [None]:
#Run analysis over studies in the metadata file, printing the up and down genes of each study
def _parse_sample_ids(raw_samples):
    """Split a comma-separated string of sample IDs into a list.

    Whitespace around each ID is removed and empty entries (for example
    from a trailing comma) are dropped.
    """
    return [sample.strip() for sample in raw_samples.split(',') if sample.strip()]


# Metadata columns that make up the label, in the order the label-parsing cell reads them back:
# platform, cell type, physical alteration, drug, perturbed gene, gene alteration type
LABEL_COLUMNS = ('GEO Platform', 'Cell Type', 'Altered Condition', 'Drug Name',
                 'Name of the Perturbed Gene', 'Gene Alteration')

up_data = {}
dn_data = {}
for _, study in metadata.iterrows():
    accession_id = study['GEO Accession Number']
    control_samples = _parse_sample_ids(study['Control Samples'])
    treated_samples = _parse_sample_ids(study['Treated Samples'])
    # str() so missing values (NaN) become the text 'nan' instead of breaking the label
    metadata_list = [str(study[column]) for column in LABEL_COLUMNS]
    # micro_analysis returns a pair: (up-regulated genes, down-regulated genes)
    up_genes, dn_genes = micro_analysis(accession_id, control_samples, treated_samples)
    # Same "<accession> [<metadata>] <direction>" layout for both directions
    label = accession_id + ' ' + str(metadata_list)
    up_data[label + ' up'] = up_genes
    dn_data[label + ' dn'] = dn_genes
    # Print only the current study; printing the accumulated dicts on every
    # iteration repeats all earlier results and floods the cell output
    print(label + ' up', up_genes)
    print(label + ' dn', dn_genes)

In [None]:
#Define Jaccard Index function from Megan's code
def jaccardIndex(listA,listB):
    """Return the Jaccard index |A & B| / |A | B| of two collections.

    Both inputs are converted to sets first, so duplicated entries in
    either input do not distort the result.

    Parameters:
        listA, listB: iterables of hashable items (e.g. gene symbols).

    Returns:
        A float between 0.0 and 1.0. When both inputs are empty the union
        is empty and 0.0 is returned instead of dividing by zero.
    """
    setA, setB = set(listA), set(listB)
    union_size = len(setA | setB)
    if union_size == 0:
        # Two empty gene sets: define the similarity as 0 rather than crash
        return 0.0
    return len(setA & setB) / union_size

In [None]:
#Merge up and down data
# NOTE(review): merge is not defined in this notebook; presumably it comes from the
# microarray_analysis star import and combines the two dicts into one - confirm.
# Later cells call .items() on the result and index it by label.
all_data = merge(up_data, dn_data)
# Bare expression so the notebook displays the merged result
all_data

In [None]:
#Collect the labels (keys) of the up/down gene lists, in their stored order
all_terms = list(all_data.keys())

In [None]:
#Build the square matrix of Jaccard indices: one row and one column per label,
#each entry comparing the row's gene set with the column's gene set
matrix = [
    [jaccardIndex(all_data[row_term], all_data[col_term]) for col_term in all_terms]
    for row_term in all_terms
]

In [None]:
# Final version of the desired format of labels
final_terms = []
for term in all_terms:
    # A term looks like "<accession> ['<platform>', '<cell line>', ...] up".
    # Cut at the first '[' and the last ']' so the bracketed part can be parsed
    # as the Python list it was printed from. This keeps metadata values that
    # contain commas, brackets or quotes intact, unlike splitting on ', '.
    accession, _, remainder = term.partition('[')
    fields_repr, _, updown = remainder.rpartition(']')
    fields = ast.literal_eval('[' + fields_repr + ']')
    (platform, cell_line, physical_alt, chemical_alt,
     genetic_alt, genetic_alt_type) = fields[:6]
    accession = accession.strip()
    updown = updown.strip()
    #Customize which categories of metadata to include on the labels
    final_terms.append([
        'Accession: ' + str(accession),
        'Cell Line: ' + str(cell_line),
        'Up/Down: ' + str(updown),
        'Platform: ' + str(platform),
        'Physical Alteration: ' + str(physical_alt),
        'Chemical Alteration: ' + str(chemical_alt),
        # Space keeps the gene name and the alteration type readable as two words
        'Genetic Alteration: ' + str(genetic_alt) + ' ' + str(genetic_alt_type),
    ])
    

In [None]:
#Display the parsed labels (one list of category strings per term) for a visual check
final_terms

In [None]:
#Make a tab-delimited file
#Number of categories of metadata used, e.g. Accession ID, Cell Line, or Up/Down
#(derived from the labels so it stays in sync when the label categories are customized)
categories = len(final_terms[0]) if final_terms else 0
# `with` closes the file even if a write fails part-way through
with open('../jaccard_clustergrammer_autophagy.tsv','w') as clustergrammer_output:
    #Shift columns to make room for row labels and write columns:
    #one header line per category, holding that category for every term
    for cat in range(categories):
        clustergrammer_output.write('\t'*categories + '\t'.join([x[cat] for x in final_terms]) + '\n')
    #Write rows and input jaccard indices: the row's labels, then its similarity scores
    for labels, scores in zip(final_terms, matrix):
        clustergrammer_output.write('\t'.join(list(labels) + [str(x) for x in scores]) + '\n')