# random_forest_ancestor_lab.ipynb
Predict the ancestor lab from sequence and metadata.

This code is RAM-intensive. Start an AWS instance with sufficient resources (we recommend r5.metal or m5.24xlarge for the shortest runtime, since the code uses all available CPU cores) running the Ubuntu Server 18.04 LTS AMI (for example, ami-0f65671a86f061fcd).

In [3]:
import pandas as pd
import numpy as np
import sklearn.preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import time
from sklearn.externals import joblib
%pylab inline

import pickle

Populating the interactive namespace from numpy and matplotlib


In [1]:
# Directory (relative to the working directory) that holds the pickled
# train / validation / test splits read by the loading cell.
path = '../../../data/tts'

In [2]:
# Load the pickled feature frames (train, test, val -- in that order).
# NOTE(review): pd.read_pickle executes arbitrary code on load; only point
# `path` at trusted files.
X_train_raw, X_test_raw, X_val_raw = [
    pd.read_pickle(os.path.join(path, pickle_name))
    for pickle_name in (
        'x_train_SO_fixed.pkl',
        'x_test_SO_fixed.pkl',
        'x_val_SO_fixed.pkl',
    )
]

# Load the label vectors. Test labels are intentionally not loaded here.
#y_test = pd.read_pickle(os.path.join(path, 'y_test_SO_fixed.pkl'))
y_train, y_val = [
    pd.read_pickle(os.path.join(path, pickle_name))
    for pickle_name in ('y_train_SO_fixed.pkl', 'y_val_SO_fixed.pkl')
]

NameError: name 'pd' is not defined

In [4]:
# Build the "sequence + metadata" design matrices for each split.

# Metadata = every column except 'sequence'. np.setdiff1d returns the column
# names sorted and de-duplicated, so the column order is deterministic
# (assumes all three frames carry the same columns -- TODO confirm).
trainmeta = X_train_raw.loc[:,np.setdiff1d(X_train_raw.columns,['sequence'])].values
valmeta = X_val_raw.loc[:,np.setdiff1d(X_val_raw.columns,['sequence'])].values
testmeta = X_test_raw.loc[:,np.setdiff1d(X_test_raw.columns,['sequence'])].values

# Character-level TF-IDF over 1- to 4-grams of the raw sequence.
ngram_start, ngram_end = 1, 4
tfidf_14 = TfidfVectorizer(analyzer='char', ngram_range=(ngram_start, ngram_end))

# Fit the vocabulary / IDF weights on the training split only ...
trainseq_tfidf_14 = tfidf_14.fit_transform(X_train_raw.sequence)

# ... and reuse the fitted vectorizer for validation and test.
valseq_tfidf_14 = tfidf_14.transform(X_val_raw.sequence)
testseq_tfidf_14 = tfidf_14.transform(X_test_raw.sequence)

# Densify with .toarray() (plain ndarray) instead of .todense() (np.matrix):
# concatenating an np.matrix yields an np.matrix, which recent scikit-learn
# releases refuse as estimator input.
X_train_14 = np.concatenate([trainseq_tfidf_14.toarray(), trainmeta], axis=1)
X_val_14 = np.concatenate([valseq_tfidf_14.toarray(), valmeta], axis=1)
X_test_14 = np.concatenate([testseq_tfidf_14.toarray(), testmeta], axis=1)

In [5]:
# Sequence-only design matrices: the TF-IDF features without metadata.
# .toarray() returns a plain ndarray; .todense() would return an np.matrix,
# which recent scikit-learn releases refuse as estimator input.
X_train_seqonly_14 = trainseq_tfidf_14.toarray()
X_val_seqonly_14 = valseq_tfidf_14.toarray()
X_test_seqonly_14 = testseq_tfidf_14.toarray()

In [21]:
# Feature configurations to compare. Each entry maps a configuration name to
# its train / val / test matrices plus the shared train / val labels.
# No 'test_y' entry is included (test labels are not loaded).
configurations = {
    config_name: {
        'train_X': train_matrix,
        'train_y': y_train,
        'val_X': val_matrix,
        'val_y': y_val,
        'test_X': test_matrix,
        #'test_y': y_test
    }
    for config_name, (train_matrix, val_matrix, test_matrix) in (
        ('seq_meta', (X_train_14, X_val_14, X_test_14)),
        ('seqonly', (X_train_seqonly_14, X_val_seqonly_14, X_test_seqonly_14)),
    )
}

In [39]:
# Hyper-parameters passed to RandomForestClassifier for every configuration.
reasonable_params = {
    'class_weight': 'balanced',  # reweight classes by inverse frequency
    'max_features': 0.5,         # fraction of features considered per split
    'n_estimators': 1000,
    'n_jobs': -1,                # use every available CPU core
    # Fixed seed so the fitted forests and the saved predictions are
    # reproducible from one run of the notebook to the next.
    'random_state': 0,
}

In [34]:
# Random-forest models keyed by configuration name; populated by the
# training loop.
models_simple = {}

In [36]:
# Fit one random forest per feature configuration and print, for each:
# the configuration name, the training accuracy, then the validation accuracy.
for key, val in configurations.items():

    # Register the estimator under its configuration name before fitting.
    forest = RandomForestClassifier(**reasonable_params)
    models_simple[key] = forest

    print(key)
    forest.fit(val['train_X'], val['train_y'])

    train_accuracy = forest.score(val['train_X'], val['train_y'])
    print(train_accuracy)

    val_accuracy = forest.score(val['val_X'], val['val_y'])
    print(val_accuracy)

seq_meta
1.0
0.8830811554332875
seqonly
1.0
0.8262265016047684


In [41]:
# Persist each fitted forest together with its class-probability predictions
# on the val / test / train splits and the class ordering of those columns.
for key in configurations.keys():
    fitted = models_simple[key]
    splits = configurations[key]

    # joblib.dump returns the list of written filenames, not a model, so the
    # return value is deliberately not bound to a model-like name.
    joblib.dump(fitted, f"model_{key}_second_order_fixed.joblibpkl")

    np.save(f"./predictions_val_second_order_fixed_{key}_simplemodel.npy", fitted.predict_proba(splits['val_X']))

    np.save(f"./predictions_TEST_second_order_fixed_{key}_simplemodel.npy", fitted.predict_proba(splits['test_X']))

    np.save(f"./predictions_train_second_order_fixed_{key}_simplemodel.npy", fitted.predict_proba(splits['train_X']))

    # Column i of every saved probability array corresponds to classes_[i].
    np.save(f"./classes_for_predictions_second_order_fixed_{key}_simplemodel.npy", fitted.classes_)

# Lasso sanity check

In [25]:
# Baseline: L1-penalised logistic regression on the same feature
# configurations. Prints the configuration name, then train and validation
# accuracy.
for key in configurations.keys():
    val = configurations[key]
    print(key)
    # The solver is pinned explicitly: the L1 penalty is not supported by
    # 'lbfgs', which became the default solver in scikit-learn 0.22, so
    # relying on the default raises there. 'liblinear' matches the default
    # used by older releases.
    model = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced')
    model.fit(val['train_X'], val['train_y'])
    print(model.score(val['train_X'], val['train_y']))
    print(model.score(val['val_X'], val['val_y']))

seq_meta




0.6775007647598654
0.6153140761118753
seqonly




0.29053227286632
0.2640990371389271




In [32]:
# Share of training rows belonging to the most frequent label, i.e. the
# accuracy of always predicting the majority class.
pd.Series(y_train).value_counts().max() / len(y_train)

0.18331293973692261