# Random Forest Classifier and Logistic Regression

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score

In [6]:
# Load the training frame; later cells read its `joint_embedding` and `label` columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
train_pickle_path = "train_df_w_embeddings.pickle"
df = pd.read_pickle(train_pickle_path)



In [7]:
# Apply np.array element-wise so every entry of `joint_embedding` is a NumPy array.
# NOTE(review): the next cell reads `arr[0]`, so each entry is presumably a
# (1, dim) nested sequence — confirm against the code that produced the pickle.
df["joint_embedding"] = df["joint_embedding"].apply(np.array)

In [8]:
# Map each row position (0, 1, 2, ...) to the first row of that sample's
# embedding array, converted to a plain Python list.
data_dict = {
    row_pos: list(embedding[0])
    for row_pos, embedding in enumerate(df["joint_embedding"].tolist())
}


In [9]:
# The dict keys become columns first; transposing turns them into the row
# index, giving one row per sample and one column per embedding dimension.
labeled_embeddings = pd.DataFrame(data_dict).transpose()

In [10]:
# Attach labels by position, not by index. Row i of `labeled_embeddings` was
# built from the i-th row of `df` (via enumerate), so its index is 0..n-1.
# Assigning the Series directly would align on `df`'s own index, which yields
# NaN or mismatched labels whenever that index is not the default 0..n-1.
labeled_embeddings["label"] = df["label"].to_numpy()

In [11]:
# Show the assembled frame (embedding columns plus `label`); pandas truncates the rendering.
labeled_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,591,592,593,594,595,596,597,598,599,label
0,-0.005404,-0.026299,0.001143,-0.001936,-0.024126,0.000055,-0.015453,-0.041293,-0.035775,0.026950,...,0.032924,0.031764,-0.052913,0.017386,0.001153,-0.003261,0.102025,0.014090,-0.009801,unrelated
1,0.022619,0.034481,-0.014462,-0.031792,0.005484,-0.014029,0.019104,-0.028800,0.039970,-0.048728,...,-0.036017,0.042316,-0.058789,0.036853,-0.010046,0.014047,0.013446,0.016711,0.024217,unrelated
2,0.022619,0.034481,-0.014462,-0.031792,0.005484,-0.014029,0.019104,-0.028800,0.039970,-0.048728,...,0.042132,-0.018738,0.045009,-0.074184,-0.059535,-0.012626,0.063538,-0.042629,0.018659,unrelated
3,0.022619,0.034481,-0.014462,-0.031792,0.005484,-0.014029,0.019104,-0.028800,0.039970,-0.048728,...,-0.022351,-0.068774,0.045776,0.050659,-0.081537,0.001855,-0.066980,0.035670,-0.066461,unrelated
4,0.022619,0.034481,-0.014462,-0.031792,0.005484,-0.014029,0.019104,-0.028800,0.039970,-0.048728,...,0.006565,-0.003706,-0.025230,0.075677,-0.011336,0.089016,-0.052402,0.013970,-0.003769,unrelated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256437,0.034863,-0.059326,0.049707,-0.070744,-0.001120,-0.058087,-0.142871,-0.022375,-0.008919,-0.006049,...,0.048136,0.051450,0.006088,0.026855,-0.026688,0.014771,0.022054,-0.092753,0.035319,unrelated
256438,0.034863,-0.059326,0.049707,-0.070744,-0.001120,-0.058087,-0.142871,-0.022375,-0.008919,-0.006049,...,-0.069458,-0.052176,0.118120,-0.010951,0.067435,-0.031948,0.025286,-0.006662,0.009626,unrelated
256439,0.034863,-0.059326,0.049707,-0.070744,-0.001120,-0.058087,-0.142871,-0.022375,-0.008919,-0.006049,...,0.113579,-0.003357,-0.014145,-0.014359,-0.003601,-0.006561,-0.026197,0.038269,-0.029663,unrelated
256440,0.034863,-0.059326,0.049707,-0.070744,-0.001120,-0.058087,-0.142871,-0.022375,-0.008919,-0.006049,...,-0.127466,-0.124268,-0.016895,0.047437,-0.040942,-0.031863,0.031812,-0.050000,0.003687,unrelated


In [12]:
# Persist the assembled frame. This snapshot is written before the dropna()
# cell below, so it still contains any rows with missing values.
labeled_pickle_path = "labeled_embeddings.pickle"
labeled_embeddings.to_pickle(labeled_pickle_path)

In [14]:
# Discard every row that has a missing value in any column (features or label).
labeled_embeddings = labeled_embeddings.dropna()

In [16]:
def evaluate(clf, labeled_embeddings, test_size=0.33, random_state=42,
             label_col="label", stratify=False):
    """Fit ``clf`` on a train split and print hold-out metrics.

    The frame is split into features (every column except ``label_col``) and
    target (``label_col``), divided with ``train_test_split``, and ``clf`` is
    fitted on the training part. The classification report and the accuracy
    on the test part are printed.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    labeled_embeddings : pandas.DataFrame
        Feature columns plus the target column ``label_col``.
    test_size : float or int, default 0.33
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    label_col : str, default "label"
        Name of the target column.
    stratify : bool, default False
        When True, the split preserves the class proportions of the target.
        The default keeps the original, unstratified split.

    Returns
    -------
    object
        The same ``clf`` instance, after fitting.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``labeled_embeddings``.
    """
    if label_col not in labeled_embeddings.columns:
        raise KeyError(
            "labeled_embeddings has no {!r} column".format(label_col)
        )

    features = labeled_embeddings.drop(label_col, axis=1)
    target = labeled_embeddings[label_col]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target if stratify else None,
    )

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Per-class precision/recall/F1 first, then the overall accuracy.
    print(classification_report(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    return clf

In [None]:
# Intentionally empty: `y_test` and `results` exist only inside evaluate(), so
# printing accuracy from this cell raised NameError on a fresh kernel.
# evaluate() already prints the accuracy for every classifier it fits.


In [None]:
# Random forest baseline.
# random_state pins the bootstrap samples and feature sub-sampling so the
# printed metrics are repeatable across kernel restarts.
# n_jobs=-1 builds the trees on all available cores; with a fixed seed it
# does not change the fitted model.
RF_1 = RandomForestClassifier(max_depth=300, random_state=42, n_jobs=-1)
RF_1 = evaluate(RF_1, labeled_embeddings)

In [None]:
# Logistic regression baseline with default hyper-parameters; evaluate()
# returns the estimator it was given, fitted, so LR ends up as the trained model.
LR = evaluate(LogisticRegression(), labeled_embeddings)