# Random Forest (Random Forest)

## Import Libraries and Data Loading

In [76]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, precision_score
from sklearn.model_selection import  GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

train = pd.read_feather('Final_train_dataset.feather')
validation = pd.read_feather('Final_validation_dataset.feather')
test = pd.read_feather('Final_test_dataset.feather')

## Split Data into X(Feature) and Y(Class)

In [77]:
X_train = train.drop(['is_duplicate'],  axis=1)
y_train = train['is_duplicate']
X_val = validation.drop(['is_duplicate'],  axis=1)
y_val = validation['is_duplicate']
X_test = test.drop(['is_duplicate'],  axis=1)
y_test = test['is_duplicate']

## Drop Unecessary Columns (Non Important Features) 

In [78]:
impt_feat = ['freq_q2', 'freq_q1+q2', 'freq_q1', 'jaccard_dist', 'Levenshtein',
       'diff_tfidf_L2', 'diff_tfidf_L1', 'common_ratio', 'levenshtein',
       'fuzz_qratio', 'dist_canberra', 'length_diff', 'lc_substring',
       'lc_subsequence', 'freq_q1-q2', 'same_ending', 'wmdistance',
       'dist_cosine', 'dist_cityblock', 'dist_euclidean', 'dist_minkowski',
       'q1_vec_0', 'q1_vec_1', 'q1_vec_2', 'q1_vec_3', 'q1_vec_4', 'q2_vec_0',
       'q2_vec_1', 'q2_vec_2', 'q2_vec_3', 'q2_vec_4', 'diff_tfidf_L2_norm', 'diff_tfidf_L1_norm',
       'q2_word_to_vec', 'total_length']

X_train = X_train[impt_feat]
X_val = X_val[impt_feat]
X_test = X_test[impt_feat]

## Scale Data

In [79]:
sc = StandardScaler()
sc.fit(X_train)
scale = X_train.columns.tolist()

X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()
X_train_scaled[scale] = StandardScaler().fit_transform(X_train[scale])
X_val_scaled[scale] = StandardScaler().fit_transform(X_val_scaled[scale])
X_test_scaled[scale] = StandardScaler().fit_transform(X_test_scaled[scale])

## 1. Building Base Random Forest

In [5]:
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_scaled, y_train)

In [7]:
def evaluate(model, x_train, y_train, x_val, y_val, x_test, y_test):
    preds_train = model.predict(x_train)
    preds_prob_train = model.predict_proba(x_train)
    preds_val = model.predict(x_val)
    preds_prob_val = model.predict_proba(x_val)
    preds_test = model.predict(x_test)
    preds_prob_test = model.predict_proba(x_test)
    print("The train log loss is:", log_loss(y_train, preds_prob_train))
    print("The train precision is:", precision_score(y_train, preds_train))
    print("The validation log loss is:", log_loss(y_val, preds_prob_val))
    print("The validation precision is:", precision_score(y_val, preds_val))
    print("The test log loss is:", log_loss(y_test, preds_prob_test))
    print("The test precision is:", precision_score(y_test, preds_test))
    preds_train, preds_prob_train, preds_val, preds_prob_val, preds_test, preds_prob_test
    
evaluate(rf, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)

The train log loss is: 0.09985429587815921
The train precision is: 1.0
The validation log loss is: 0.3644720851956154
The validation precision is: 0.793345137966652
The test log loss is: 0.36893466349485843
The test precision is: 0.7825551044083526


We can tell from the result that the Random Forest is overfit, thus we need to
1. Reduce tree depth
2. Reduce the number of variables sampled at each split
3. Peform Cross Validation

## 2. Prevent Overfitting by Reducing tree depth, Reduce the number of variables sampled at each split & Peform Cross Validation

## 3. Also, Increase n_estimator to increase the predictive power

In [80]:
tree_para = {
    'criterion':['gini', 'entropy'],
    #Reduce Tree depth by setting a max value [Default = No Max Depth]
    'max_depth':[15, 30, 50], 
    'min_samples_leaf':[3, 5, 10],
    #Reduce number of variable sampled [Default = 323429]
    'max_samples' : [15000, 25000, 32000]
}
#Add Cross Validation 
rf_exp = GridSearchCV(RandomForestClassifier(n_estimators= 300, random_state=1),  tree_para, cv = 3)
rf_exp.fit(X_train_scaled, y_train)

## 3. Get Best Parameter

In [81]:
rf_exp.best_params_

{'criterion': 'entropy',
 'max_depth': 30,
 'max_samples': 30000,
 'min_samples_leaf': 5}

## 3. Building Final Random Forest Model

In [85]:
rf = RandomForestClassifier(n_estimators= 300, random_state=100, criterion= 'gini', 
                      max_depth = 30, max_samples = 30000, min_samples_leaf = 5)
rf.fit(X_train_scaled, y_train)

In [100]:
preds_train, preds_prob_train, preds_val, preds_prob_val, preds_test, preds_prob_test = evaluate(rf, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)

The train log loss is: 0.34135318022029026
The train precision is: 0.8209495226609466
The validation log loss is: 0.347860538641686
The validation precision is: 0.7852130881158136
The test log loss is: 0.351236542586315
The test precision is: 0.7777293211222561


## We Managed to reduce the validation log loss from 0.364 to 0.347

## Output predicted probabilities for final model 

In [101]:
train_probs_0 = [x[0] for x in preds_prob_train]
test_probs_0 = [x[0] for x in preds_prob_test]

pd.DataFrame(train_probs_0).to_csv('predictions_full_RF_train.csv.csv', index = False)    
pd.DataFrame(test_probs_0).to_csv('predictions_full_RF_test.csv', index = False)     