In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

from imblearn.over_sampling import SMOTE

In [2]:
# Load Hugo's pickled Random Forest pipeline.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle if its origin is trusted.
with open('../hugo/Random_Forest_Model.pkl', 'rb') as rf_pkl:
    # The context manager closes the file handle once the pipeline is loaded.
    rf_pipeline = pickle.load(rf_pkl)

# Extract the RF model from the Pipeline object.
# Each entry of `steps` is a (name, estimator) pair; this takes the estimator
# of the second step -- presumably the random forest, verify against the
# notebook that built the pipeline.
rf_model = rf_pipeline.steps[1][1]

In [3]:
# Count, across every tree of the forest, which feature index is used by the
# first three nodes of the tree.
rf_trees = rf_model.estimators_
rf_top_features = {}
for tree in rf_trees:
    # `tree_.feature` is indexed by node id, node 0 being the root.
    # Slicing (rather than indexing 0..2) keeps very small trees with fewer
    # than three nodes from raising IndexError.
    # NOTE(review): node ids 1 and 2 are not guaranteed to be the root's two
    # children -- confirm this is the intended notion of "top feature".
    for top_feat in tree.tree_.feature[:3]:
        # Leaf nodes carry a negative placeholder instead of a feature index;
        # they do not split on anything, so they are not counted.
        if top_feat < 0:
            continue
        # Store plain Python ints as keys so the printed dict stays readable.
        feat_index = int(top_feat)
        rf_top_features[feat_index] = rf_top_features.get(feat_index, 0) + 1

print("Random Forest Top Features", rf_top_features)
print(len(rf_top_features.keys()), "top features in random forest")

Random Forest Top Features {269: 3, 761: 3, 1: 11, 109: 5, 193: 1, 88: 12, 65: 1, 271: 5, 272: 1, 194: 5, 90: 8, 844: 2, 192: 1, 348: 2, 5: 11, 115: 4, 198: 9, 21: 2, 112: 5, 273: 6, 46: 1, 64: 2, 68: 4, 94: 2, 27: 2, 215: 5, 254: 6, 235: 4, 887: 1, 45: 5, 737: 1, 277: 3, 256: 6, 49: 2, 92: 9, 977: 1, 881: 1, 120: 13, 71: 3, 692: 1, 110: 2, 943: 1, 898: 2, 734: 2, 149: 1, 105: 4, 689: 5, 762: 2, 267: 3, 234: 4, 690: 2, 153: 5, 157: 1, 276: 3, 174: 1, 24: 6, 95: 1, 25: 4, 154: 1, 282: 6, 824: 7, 806: 1, 29: 1, 274: 2, 275: 2, 283: 1, 3: 2, 238: 2, 47: 1, 231: 2, 134: 1, 960: 1, 195: 1, 11: 1, 236: 1, 108: 2, 784: 1, 769: 1, 70: 1, 833: 2, 787: 2, 742: 1, 107: 2, 175: 6, 191: 1, 91: 2, 886: 1, 119: 2, 683: 1, 113: 2, 132: 1, 2: 1, 114: 1, 72: 1, 156: 1, 73: 2, 834: 1, 852: 1, 786: 2, 237: 1, 111: 1, 131: 1, 116: 1, 903: 2, 783: 1, 48: 1, 173: 1, 214: 1, 741: 1}
109 top features in random forest


In [4]:
# Rank the (feature index, count) pairs from most to least frequently used.
# The sort is stable, so ties keep the order in which they were first counted.
top_feats_sorted = list(rf_top_features.items())
top_feats_sorted.sort(key=lambda pair: pair[1], reverse=True)
print(top_feats_sorted)

[(120, 13), (88, 12), (1, 11), (5, 11), (198, 9), (92, 9), (90, 8), (824, 7), (273, 6), (254, 6), (256, 6), (24, 6), (282, 6), (175, 6), (109, 5), (271, 5), (194, 5), (112, 5), (215, 5), (45, 5), (689, 5), (153, 5), (115, 4), (68, 4), (235, 4), (105, 4), (234, 4), (25, 4), (269, 3), (761, 3), (277, 3), (71, 3), (267, 3), (276, 3), (844, 2), (348, 2), (21, 2), (64, 2), (94, 2), (27, 2), (49, 2), (110, 2), (898, 2), (734, 2), (762, 2), (690, 2), (274, 2), (275, 2), (3, 2), (238, 2), (231, 2), (108, 2), (833, 2), (787, 2), (107, 2), (91, 2), (119, 2), (113, 2), (73, 2), (786, 2), (903, 2), (193, 1), (65, 1), (272, 1), (192, 1), (46, 1), (887, 1), (737, 1), (977, 1), (881, 1), (692, 1), (943, 1), (149, 1), (157, 1), (174, 1), (95, 1), (154, 1), (806, 1), (29, 1), (283, 1), (47, 1), (134, 1), (960, 1), (195, 1), (11, 1), (236, 1), (784, 1), (769, 1), (70, 1), (742, 1), (191, 1), (886, 1), (683, 1), (132, 1), (2, 1), (114, 1), (72, 1), (156, 1), (834, 1), (852, 1), (237, 1), (111, 1), (131, 

In [5]:
# Re-create the dataframe that Hugo trained the model on, then redo the
# 80/20 train/test split, stratified on the `paid` column with a fixed seed.
df = pd.read_csv("../hugo/cleaned_2014", low_memory=False)
data_train, data_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["paid"],
    random_state=1,
)
def split_data(df, cols):
    """Split a frame into predictor columns and the `paid` target.

    Parameters
    ----------
    df : DataFrame that contains a `paid` column.
    cols : column labels to leave out of the predictor frame.

    Returns
    -------
    tuple
        `(x, y)` where `x` is `df` without `cols` and `y` is `df`'s `paid`
        column. `df` itself is not modified.
    """
    features = df.drop(columns=cols)
    target = df["paid"]
    return features, target

# Columns removed from the predictor frame before training.
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'term',
]
x_train_initial, y_train_initial = split_data(
    df=data_train,
    cols=cols_to_drop_training,
)

In [6]:
# Fit a degree-2 polynomial expansion on the training predictors only to
# recover the names of the expanded columns.
# NOTE(review): mapping the forest's feature indices onto these names assumes
# the loaded pipeline expanded the same columns, in the same order, with the
# same PolynomialFeatures settings -- confirm against the pipeline's own step.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly.fit(x_train_initial)

# `get_feature_names` was deprecated and later removed from scikit-learn in
# favour of `get_feature_names_out`; use whichever this installation offers.
if hasattr(poly, "get_feature_names_out"):
    # The newer method returns an array; convert so the result is a list
    # either way.
    all_training_features = list(
        poly.get_feature_names_out(input_features=x_train_initial.columns)
    )
else:
    all_training_features = list(
        poly.get_feature_names(input_features=x_train_initial.columns)
    )

In [7]:
print("Training Features: {0}".format(len(all_training_features)))

# Show the names of the ten most frequently used features.
# Slicing (instead of indexing 0..9) avoids an IndexError when fewer than ten
# distinct features were counted. Negative ids are dropped first: they are
# not real feature indices and would otherwise silently index from the end
# of the name list.
ranked_feature_ids = [feat for feat, _count in top_feats_sorted if feat >= 0]
for feat_num in ranked_feature_ids[:10]:
    print("Feature {0}: {1}".format(feat_num, all_training_features[feat_num]))


Training Features: 989
Feature 120: int_rate purpose_major_purchase
Feature 88: int_rate dti
Feature 1: int_rate
Feature 5: emp_length_1 year
Feature 198: dti purpose_debt_consolidation
Feature 92: int_rate emp_length_2 years
Feature 90: int_rate emp_length_1 year
Feature 824: grade_C purpose_credit_card
Feature 273: emp_length_1 year purpose_car
Feature 254: emp_length_1 year emp_length_6 years
