# Random Forest - Harris County 2017 Foreclosure Sale

<h2> Looping through GridSearchCV, cross-validation and PCA for parameter tuning, model selection and feature selection
</h2>

- Step: 1) Data loading and cleaning
- Step: 2) Formatting for machine learning 
- Step: 3) Cross validation and modeling 
- Step: 4) Test/evaluate 
- Step: 5) Repeat Steps 2-4

In [228]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [229]:
# Read the foreclosure-sale CSV into a DataFrame
path1 = "C:/Users/aath/Dropbox/MAEN/Thankful/Data/fls/FLS_Hist2017_clean.csv"
df = pd.read_csv(path1)

In [230]:
# Count the missing values in every column
null_counts = df.isnull().sum()
null_counts

rec_num                  0
keymap                1022
sold3rd                  0
tax_id                 439
org_loan_amt           361
mon_org_loan_date      277
year_org_loan_date     279
sale_date                0
est_loan_bal           857
mortgagee               28
bedr_num              1286
prop_val               506
Term                  1005
Trustee                  1
sq_ft                  614
time_sold             3591
trustee_ref_num       4762
open_bid              4359
final_bid             3605
loan_type               43
dtype: int64

In [231]:
# Drop the few rows that are missing Trustee, mortgagee or loan_type
df = df.dropna(subset=['Trustee', 'mortgagee', 'loan_type'])

# rec_num is an arbitrary index and should be removed.
# sale_date holds the actual sale dates in 2017; it is a time series which we
# will ignore here.
# `axis` is passed by keyword: the positional form drop(label, 1) was removed
# in pandas 2.0.
df = df.drop(['rec_num', 'sale_date'], axis=1)

In [232]:
# Remove every remaining column that still contains at least one null value
df = df.dropna(axis=1, how='any')

In [233]:
# Separate the target column (sold3rd) from the predictor columns.
# `axis` is passed by keyword: the positional form drop(label, 1) was removed
# in pandas 2.0.
features_df = df.drop('sold3rd', axis=1)
labels_df = df['sold3rd']

In [234]:
# Print each object-typed feature followed by its number of distinct values,
# to see how many categories an encoding would have to create
categorical = features_df.select_dtypes(include=['object'])
for feature_name in categorical.columns:
    print(feature_name)
    print(categorical[feature_name].nunique())

mortgagee
854
Trustee
300
loan_type
10


In [235]:
# Set up feature transforming functions

def transform_feature( features_df, column_name ):
    """Replace the values of one column with integer codes, in place.

    Every distinct value in ``features_df[column_name]`` is given an integer
    code in order of first appearance (0, 1, 2, ...), so the same data always
    produces the same codes.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Frame holding the column to encode. It is modified in place.
    column_name : str
        Name of the column whose values are replaced by their codes.

    Returns
    -------
    pandas.DataFrame
        The same ``features_df`` object that was passed in.
    """
    values = features_df[column_name].tolist()

    # Number the values in first-seen order. Iterating a set instead would
    # give string values a different order (and therefore different codes) on
    # every interpreter run, making results impossible to reproduce.
    transformer_dict = {}
    for value in values:
        if value not in transformer_dict:
            transformer_dict[value] = len(transformer_dict)

    features_df[column_name] = [transformer_dict[value] for value in values]
    return features_df


# transformation: integer-encode each categorical column

names_of_columns_to_transform = ["mortgagee", "Trustee","loan_type"]
for column in names_of_columns_to_transform:
    # Rebind the returned frame to features_df (it was previously bound to a
    # stray, never-read name and only worked through in-place mutation).
    features_df = transform_feature(features_df, column )

print(features_df.columns.values)

['mortgagee' 'Trustee' 'loan_type']


In [236]:
# Number of samples. Measured on labels_df because `y` is not assigned until
# the next cell, so len(y) fails when the notebook is run top to bottom.
len(labels_df)

5043

In [237]:
# Convert features to a NumPy array and labels to a plain list for the models.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and exists in every pandas version.
X = features_df.values
y = labels_df.tolist()

In [238]:
import sklearn.linear_model
import sklearn.model_selection  # replaces sklearn.cross_validation (removed in scikit-learn 0.20)
import sklearn.tree
import sklearn.ensemble

# Baseline comparison: cross-validate three classifiers with their default
# hyper-parameters on the same X / y.
baseline_models = [
    ("Logistic Regression", sklearn.linear_model.LogisticRegression()),
    ("Decision Tree", sklearn.tree.DecisionTreeClassifier()),
    ("Random Forest", sklearn.ensemble.RandomForestClassifier()),
]

for model_name, clf in baseline_models:
    # cv=3 is pinned explicitly: it was the default of the legacy
    # cross_val_score, while the model_selection default later became 5 folds.
    score = sklearn.model_selection.cross_val_score( clf, X, y, cv=3 )
    print("{} Scores {}".format(model_name, score))

Logistic Regression Scores [ 0.86206897  0.8625818   0.8625    ]
Decision Tree Scores [ 0.80796671  0.80844735  0.78869048]
Random Forest Scores [ 0.81747919  0.80844735  0.81309524]


In [239]:
import sklearn.preprocessing

def hot_encoder(df, column_name):
    """Append one-hot indicator columns for ``column_name`` to ``df``.

    The column's values are passed through
    ``sklearn.preprocessing.OneHotEncoder`` and every column of the resulting
    dense array is stored on ``df`` under the name ``<column_name>_<i>``.
    ``df`` is modified in place and also returned; the source column itself
    is left as it was.
    """
    values = df[column_name].tolist()
    # the encoder expects a 2-D input: one row per sample, one feature column
    values_2d = np.reshape( values, (len(values), 1) )

    encoder = sklearn.preprocessing.OneHotEncoder()
    encoded = encoder.fit( values_2d ).transform( values_2d ).toarray()

    # one new dataframe column per encoded category
    for position in range( len(encoded[0]) ):
        df[column_name+"_"+str(position)] = encoded[:,position]
    return df

In [240]:
# Show which feature columns the frame currently holds
feature_names = features_df.columns.values
print(feature_names)

['mortgagee' 'Trustee' 'loan_type']


In [241]:
import sklearn.model_selection  # replaces sklearn.cross_validation (removed in scikit-learn 0.20)

# Rebuild the model inputs from the current feature frame.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and exists in every pandas version.
# NOTE(review): the column listing printed just before this cell still shows
# only the three integer-encoded columns, so hot_encoder does not appear to
# have been applied before this run -- confirm that is intended.
X = features_df.values
y = labels_df.tolist()

baseline_models = [
    ("Logistic Regression", sklearn.linear_model.LogisticRegression()),
    ("Decision Tree", sklearn.tree.DecisionTreeClassifier()),
    ("Random Forest", sklearn.ensemble.RandomForestClassifier()),
]

for model_name, clf in baseline_models:
    # cv=3 is pinned explicitly: it was the default of the legacy
    # cross_val_score, while the model_selection default later became 5 folds.
    score = sklearn.model_selection.cross_val_score( clf, X, y, cv=3 )
    print("{} Scores {}".format(model_name, score))

Logistic Regression Scores [ 0.86206897  0.8625818   0.8625    ]
Decision Tree Scores [ 0.80915577  0.80963712  0.78511905]
Random Forest Scores [ 0.81331748  0.82331945  0.83452381]


Above we tested three different classifiers, and all of them perform very similarly to one another. However, all of these algorithms have adjustable parameters, and we only used the default values.

Let's focus on the Random Forest model and see if we can fine-tune its parameters to get better results. We can either adjust them manually or use the GridSearchCV tool, which exhaustively searches over the specified parameter values and reports the best combination. However, we should run this search using our best features, and for that we can use the SelectKBest tool, which helps rank features based on the lowest p-values.

You may realize that there are many moving parts in this workflow. Yet another tool that helps combine them is scikit-learn's pipeline module: a Pipeline chains the transformation step (SelectKBest) with the estimation step (RandomForestClassifier) into one coherent workflow.

In [245]:
import sklearn.pipeline
import sklearn.feature_selection  # used below; was not imported explicitly before
import sklearn.metrics            # used below; was not imported explicitly before
import sklearn.model_selection    # replaces sklearn.cross_validation (removed in scikit-learn 0.20)

# Two-step workflow: univariate feature selection followed by a random forest.
# k='all' keeps every feature, so at this point the selector is a pass-through.
select = sklearn.feature_selection.SelectKBest(k='all')
clf = sklearn.ensemble.RandomForestClassifier()

steps = [('feature_selection', select),
         ('random_forest', clf)]

pipeline = sklearn.pipeline.Pipeline(steps)

# Hold out a third of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.33, random_state=42)

# fit pipeline on X_train and y_train
pipeline.fit( X_train, y_train )

# call pipeline.predict() on X_test data to make a set of test predictions
y_prediction = pipeline.predict( X_test )

# score the test predictions with sklearn.metrics.classification_report()
report = sklearn.metrics.classification_report( y_test, y_prediction )

# and print the report
print(report)

             precision    recall  f1-score   support

          0       0.86      0.96      0.91      1431
          1       0.22      0.06      0.09       234

avg / total       0.77      0.84      0.80      1665



In [283]:
import sklearn.metrics          # used below; was not imported explicitly before
import sklearn.model_selection  # GridSearchCV lives here; sklearn.grid_search was removed in scikit-learn 0.20

k_range = list(range(1, 4))        # Number of features selected
n_range = list(range(1, 21))       # The number of trees in the forest
split_range = list(range(2, 5))    # The minimum number of samples required to split a node

# Keys follow the "<pipeline step name>__<parameter name>" convention so the
# search can reach into the individual pipeline steps.
parameters = {
    'feature_selection__k': k_range,
    'random_forest__n_estimators': n_range,
    'random_forest__min_samples_split': split_range,
}

# cv=3 is pinned explicitly: it was the default of the legacy GridSearchCV,
# while the model_selection default later became 5 folds.
# NOTE(review): no `scoring` is given, so the classifier's default score is
# optimised. The recorded report below this cell shows 0.00 precision/recall
# for class 1 -- consider a class-aware scorer for this imbalanced target.
cv = sklearn.model_selection.GridSearchCV(pipeline, param_grid=parameters, cv=3)

print(pipeline.named_steps)

# Search the grid on the training split only, then predict the held-out split
# with the best parameter combination found.
cv.fit(X_train, y_train)

y_predictions = cv.predict(X_test)

report = sklearn.metrics.classification_report( y_test, y_predictions )

# and print the report
print(report)

{'feature_selection': SelectKBest(k='all', score_func=<function f_classif at 0x000000000CB9A048>), 'random_forest': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)}
             precision    recall  f1-score   support

          0       0.86      1.00      0.92      1431
          1       0.00      0.00      0.00       234

avg / total       0.74      0.86      0.79      1665



  'precision', 'predicted', average, warn_for)
