# random_forest_nation_of_origin.ipynb
Predict the nation-of-origin from sequence and metadata.

This code is RAM-intensive. Start an AWS instance with sufficient resources — we recommend r5.metal or m5.24xlarge for the shortest runtime, since the code takes advantage of all available CPU cores — running the Ubuntu Server 18.04 LTS AMI (for example, ami-0f65671a86f061fcd).

In [8]:
import pandas as pd
import numpy as np
import sklearn.preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import time
from sklearn.externals import joblib

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [9]:
# Directory (relative to this notebook) holding the pickled train/val/test splits.
path = "../../../data/tts"

In [10]:
# Feature tables for each split, loaded in the order train, test, val.
# NOTE: pd.read_pickle unpickles arbitrary Python objects -- only point `path`
# at files you trust.
X_train_raw, X_test_raw, X_val_raw = (
    pd.read_pickle(os.path.join(path, f'x_{split}_country.pkl'))
    for split in ('train', 'test', 'val')
)

# Labels are loaded for the train and validation splits only; test labels are
# not read in this notebook.
y_train, y_val = (
    pd.read_pickle(os.path.join(path, f'y_{split}_country.pkl'))
    for split in ('train', 'val')
)

In [11]:
# Sanity check: display the (rows, columns) shape of the raw training table.
X_train_raw.shape

(57444, 40)

In [12]:
# Build the model inputs for every split: character n-gram TF-IDF features of
# the `sequence` column, followed column-wise by all remaining (metadata)
# columns.


def _metadata_values(frame):
    """Return the values of every column of `frame` except 'sequence'.

    np.setdiff1d returns the remaining column names sorted, so the metadata
    column order is the same for every split that has the same column set.
    """
    return frame.loc[:, np.setdiff1d(frame.columns, ['sequence'])].values


def _join_features(seq_tfidf, meta):
    """Concatenate the densified TF-IDF matrix and the metadata column-wise.

    `.toarray()` is used instead of `.todense()`: the latter returns an
    `np.matrix`, which NumPy no longer recommends and which recent
    scikit-learn releases refuse as estimator input.  The values are the same;
    only the container type changes to a plain ndarray.
    """
    return np.concatenate([seq_tfidf.toarray(), meta], axis=1)


trainmeta = _metadata_values(X_train_raw)
valmeta = _metadata_values(X_val_raw)
testmeta = _metadata_values(X_test_raw)

# Character 1- to 4-grams of the sequence string.
ngram_start, ngram_end = 1, 4
tfidf_14 = TfidfVectorizer(analyzer='char', ngram_range=(ngram_start, ngram_end))

# Fit the vocabulary and IDF weights on the training split only, then reuse
# them for validation and test so no information leaks from those splits.
trainseq_tfidf_14 = tfidf_14.fit_transform(X_train_raw.sequence)

valseq_tfidf_14 = tfidf_14.transform(X_val_raw.sequence)
testseq_tfidf_14 = tfidf_14.transform(X_test_raw.sequence)

X_train_14 = _join_features(trainseq_tfidf_14, trainmeta)
X_val_14 = _join_features(valseq_tfidf_14, valmeta)
X_test_14 = _join_features(testseq_tfidf_14, testmeta)

In [None]:
# Keyword arguments shared by both RandomForestClassifier models below.
reasonable_params = {
    'class_weight': 'balanced',  # weight classes inversely to their frequency
    'max_features': 0.5,         # fraction of features considered per split
    'n_estimators': 1000,
    'n_jobs': -1,                # use all available CPU cores
    # Fixed seed so that re-running the notebook rebuilds the same forests;
    # without it every run trains a different model and the recorded scores
    # cannot be reproduced.
    'random_state': 0,
}

# US vs Non-US

In [102]:
# Binary target: True where the label equals 33.0 -- presumably the encoded
# label for the US, per the section heading (TODO confirm against the label
# encoding).
is_us_train = (y_train == 33.0)

model_selforno = RandomForestClassifier(**reasonable_params)

# `fit` returns the estimator, which is displayed as the cell output.
model_selforno.fit(X_train_14, is_us_train)

1.0


NameError: name 'n' is not defined

In [104]:
# Mean accuracy of the binary (label == 33.0) model, printed for the training
# split first and the validation split second.
for features, labels in ((X_train_14, y_train), (X_val_14, y_val)):
    print(model_selforno.score(features, (labels == 33.0)))

1.0
0.8738973246565438


In [109]:
# Persist the fitted binary model; joblib.dump returns the list of files it
# wrote, which is displayed as the cell output.
joblib.dump(model_selforno, "model_selforno.joblibpkl")

['model_selforno.joblibpkl']

In [110]:
# Save the binary model's class probabilities for each split (val, TEST,
# train, in that order), one .npy file per split in the working directory.
for split_tag, split_features in (
    ("val", X_val_14),
    ("TEST", X_test_14),
    ("train", X_train_14),
):
    np.save(
        f"./predictions_{split_tag}_countries_seq_meta_selforno.npy",
        model_selforno.predict_proba(split_features),
    )

# Save the class labels too: they give the column order of the saved
# probability arrays.
np.save("./classes_for_predictions_countries_seq_selforno.npy", model_selforno.classes_)

# Outside US by country

In [None]:
# Second model: multi-class classifier trained only on the rows whose label is
# not 33.0 (presumably the US label, per the section heading -- TODO confirm).
#
# One boolean mask selects the same rows from the features and the labels.
# Boolean indexing is positional for NumPy arrays and pandas objects alike,
# whereas the integer positions returned by np.where could be interpreted as
# index labels if y_train is a pandas Series, silently misaligning X and y.
# NOTE(review): y_train's concrete type is not visible in this notebook.
not_us_train = np.asarray(y_train != 33.0)

model_nous = RandomForestClassifier(**reasonable_params)

# `fit` returns the estimator, which is displayed as the cell output.
model_nous.fit(X_train_14[not_us_train], y_train[not_us_train])

In [87]:
# Mean accuracy of the non-33.0 model on the training split, then on the
# validation split, restricted in both cases to rows whose label is not 33.0.
#
# One boolean mask per split replaces the repeated np.where calls: it is
# computed once and selects rows by position for both the feature matrix and
# the labels, whatever container the labels are stored in.
not_us_train = np.asarray(y_train != 33.0)
not_us_val = np.asarray(y_val != 33.0)

print(model_nous.score(X_train_14[not_us_train], y_train[not_us_train]))
print(model_nous.score(X_val_14[not_us_val], y_val[not_us_val]))

1.0
0.7280909521553766


In [108]:
# Persist the fitted non-33.0 model; joblib.dump returns the list of files it
# wrote, which is displayed as the cell output.
joblib.dump(model_nous, "model_nous.joblibpkl")

['model_nous.joblibpkl']

In [106]:
# Save the non-33.0 model's class probabilities for every row of each split
# (val, TEST, train, in that order) -- the full splits, not only the rows the
# model was trained or scored on.
for split_tag, split_features in (
    ("val", X_val_14),
    ("TEST", X_test_14),
    ("train", X_train_14),
):
    np.save(
        f"./predictions_{split_tag}_countries_seq_meta_nous.npy",
        model_nous.predict_proba(split_features),
    )

# Save the class labels too: they give the column order of the saved
# probability arrays.
np.save("./classes_for_predictions_countries_seq_nous.npy", model_nous.classes_)