In [1]:
import json
import functools
import os
import re
import numpy as np
import pandas
from datetime import datetime
from glob import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils._joblib import Parallel, delayed, effective_n_jobs

In [2]:
# Map each cancer name (file-name prefix before the first "_") to the path of
# its feature matrix, skipping any path that mentions "5FOLD".
fileMap = {
    os.path.basename(path).split("_")[0]: path
    for path in glob("data/v8-feature-matrices/*.tsv")
    if "5FOLD" not in path
}

In [3]:
# Map each cancer name (file-name prefix before the first "_") to the path of
# its cross-validation matrix; only paths that mention "5FOLD" are kept.
crossMap = {
    os.path.basename(path).split("_")[0]: path
    for path in glob("data/v8-cv-matrices/*.tsv")
    if "5FOLD" in path
}

In [None]:
# Nested lookup: featureSelection[TCGA project][feature-set ID] -> feature list.
featureSelection = {}

df = pandas.read_csv("data/featuresets_v8.tsv", sep="\t")
# Both columns hold JSON-encoded values in the TSV; decode them in place.
for json_col in ("TCGA_Projects", "Features"):
    df[json_col] = df[json_col].apply(json.loads)

# Explode to one row per (feature set, project): every feature set is repeated
# once for each project it lists.
projects_per_set = df.TCGA_Projects.str.len()
df = pandas.DataFrame({'Feature_Set_ID': np.repeat(df.Feature_Set_ID.values, projects_per_set),
                       'TCGA_Project': np.concatenate(df.TCGA_Projects.values),
                       'Features': np.repeat(df.Features.values, projects_per_set)})

# The first occurrence of a (project, feature set) pair wins; later duplicates
# are ignored.
for _, row in df.iterrows():
    per_cancer = featureSelection.setdefault(row['TCGA_Project'], {})
    per_cancer.setdefault(row['Feature_Set_ID'], row['Features'])

In [19]:
def dropna(v):
    """Return the items of `v` as a list, omitting those that are not equal to themselves (e.g. NaN)."""
    # NaN is the usual value for which `x == x` is False.
    return [item for item in v if item == item]

In [27]:
def runRandomForest(cancer, fset, seed = 42, n_estimators = 500):
    """Fit and evaluate a random forest on every predefined cross-validation split.

    The feature matrix for `cancer` is read from `fileMap[cancer]` and restricted
    to the columns listed in `featureSelection[cancer][fset]` (NaN entries are
    dropped first). For each repeat/fold column of the matrix at
    `crossMap[cancer]`, rows flagged 0 are used for training and rows flagged 1
    for testing; predictions are recorded for both.

    Parameters
    ----------
    cancer : str
        Key into the module-level `fileMap`, `crossMap` and `featureSelection`.
    fset : str
        Feature-set ID, a key of `featureSelection[cancer]`.
    seed : int
        `random_state` passed to the classifier.
    n_estimators : int
        Number of trees passed to the classifier (also recorded in the model ID).

    Returns
    -------
    pandas.DataFrame
        One row per sample per split, with columns `Sample_ID`, `Repeat`,
        `Fold`, `Test` (1 = held-out, 0 = training), `Label` and one column
        named after the model ID holding the predictions. Labels and
        predictions are formatted as "<cancer>:<value>". An empty DataFrame is
        returned when feature selection fails, selects no columns, or no split
        could be fitted.
    """
    model_id = "%s|%s|%s|c" % ("%s:RandomForest(n_estimators=%s, random_state=%s)" % (cancer, n_estimators, seed),
                               fset,
                               datetime.now(tz=None).isoformat())
    print("Running %s" % (model_id))

    matrix = pandas.read_csv(fileMap[cancer], sep="\t", index_col=0)

    # Best effort: an unknown feature set or missing columns yields an empty
    # result instead of aborting the whole batch of jobs.
    try:
        X = matrix[dropna(featureSelection[cancer][fset])]
        y = matrix.Labels
    except Exception as e:
        print("Exception:", e)
        return pandas.DataFrame()

    if len(X.columns) == 0:
        return pandas.DataFrame()

    crossf = pandas.read_csv(crossMap[cancer], delimiter="\t", index_col=0)
    # The first data column is "Labels"; every remaining column is one split.
    repeat_folds = list(crossf.columns)[1:]

    def _prediction_frame(features, labels, predicted, repeat, fold, is_test):
        # Build the long-format prediction table for one side of one split.
        return pandas.DataFrame({"Sample_ID": features.index,
                                 "Repeat": repeat,
                                 "Fold": fold,
                                 "Test": is_test,
                                 "Label": ["%s:%s" % (cancer, label) for label in labels],
                                 model_id: ["%s:%s" % (cancer, label) for label in predicted]})

    frames = []
    for ct in repeat_folds:
        test_mask = crossf[ct] == 1
        train_mask = crossf[ct] == 0
        X_test, y_test = X[test_mask], y[test_mask]
        X_train, y_train = X[train_mask], y[train_mask]

        # Honour the caller's n_estimators so the fitted model matches model_id.
        clf = RandomForestClassifier(n_estimators=n_estimators, random_state = seed)
        try:
            clf.fit(X_train, y_train)
        except Exception as e:
            # Skip splits that cannot be fitted; the other splits still run.
            print("Exception:", e)
            continue
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)

        # Column names look like "<c><repeat>:<c><fold>" (e.g. "R1:F2"): strip
        # the one-character prefix of each part and parse the integer.
        parts = ct.split(":")
        repeat = int(parts[0][1:])
        fold = int(parts[1][1:])

        frames.append(_prediction_frame(X_test, y_test, test_pred, repeat, fold, 1))
        frames.append(_prediction_frame(X_train, y_train, train_pred, repeat, fold, 0))

    if not frames:
        return pandas.DataFrame()
    # Single concat instead of repeated DataFrame.append (removed in pandas 2.0).
    return pandas.concat(frames)

In [28]:
# cancer -> list of prediction DataFrames (one per feature set). Kept in its
# own cell so the job loop can be re-run and skip cancers already present here.
results = dict()

In [None]:
# Run one random-forest job per feature set, one cancer at a time. Cancers that
# already have an entry in `results` are skipped, so this cell can be re-run
# after an interruption without repeating finished work.
sFunc = delayed(runRandomForest)
parallel = Parallel(n_jobs=10)
for cancer in featureSelection.keys():
    if cancer in results:
        continue
    jobs = [sFunc(cancer, fset) for fset in featureSelection[cancer].keys()]
    print("running jobs for: %s" % cancer)
    try:
        outputs = parallel(jobs)
    except Exception as e:
        # Retry once with a fresh worker pool. The original fallback called a
        # misspelled `Prallel`, which turned every failure into a NameError.
        print("Exception:", e)
        outputs = Parallel(n_jobs=10)(jobs)
    results[cancer] = outputs

running jobs for: LIHCCHOL
running jobs for: CESC
running jobs for: LGGGBM


In [25]:
# writing prediction output files
# For each cancer, merge the per-feature-set prediction tables on their shared
# columns (pandas.merge default) and write one TSV.
os.makedirs("./outputs", exist_ok=True)  # to_csv does not create directories
for c in results.keys():
    frames = [preds for preds in results[c] if preds.shape[0] > 0]
    if not frames:
        # reduce() without an initializer raises TypeError on an empty list;
        # there is nothing to write for this cancer anyway.
        print("no predictions for: %s" % c)
        continue
    cpred = functools.reduce(pandas.merge, frames)
    cpred.to_csv("./outputs/%s_randomforest_v8_struck.tsv" % c, index = False, sep = "\t")