
# Classification with Random Forrest

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from utils import *

In [2]:
preliminary_variables = ["disposition", "fine_amount","discount_amount", "judgment_amount", "compliance"]

In [3]:
df = pd.read_csv("train.csv",index_col="ticket_id", encoding = "ISO-8859-1",low_memory=False) 
df1= df[ preliminary_variables ].copy()
df1.dropna(inplace=True)

In [4]:
df_geoloc = df_geog_info(verbose=True)
df1 = df1.join(df_geoloc,how="left")
df1.dropna(inplace=True)

rows in df with addresses and ticketsID: 311307
rows after joining by address with latlon df: 266319
rows with merged info (lat,lon, address) after Nas dropped: 266316


In [5]:
#scalers_lat_lon = scale_lat_lon(df1)
#df1.drop(columns=["lat","lon"],inplace=True)

In [6]:
df1 = generate_street_zipcode(df1)
df1 = encode_categorical(df1)

Variable zip_code:
- 17666 unique values
- max. binary representation: 0b100010100000010
- binary-encoded generated features: 16

Variable street:
- 1696 unique values
- max. binary representation: 0b11010100000
- binary-encoded generated features: 12

Variable address:
- 64932 unique values
- max. binary representation: 0b1111110110100100
- binary-encoded generated features: 17



In [7]:
target = 'compliance'
features = df1.columns.drop(target)
X_train, X_test, y_train, y_test = train_test_split(df1[features], df1[target], random_state=0,train_size=0.85)



## Dataset with final features:

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117399 entries, 107479 to 116671
Data columns (total 54 columns):
fine_amount                     117399 non-null float64
discount_amount                 117399 non-null float64
judgment_amount                 117399 non-null float64
lat                             117399 non-null float64
lon                             117399 non-null float64
address_0                       117399 non-null int64
address_1                       117399 non-null int64
address_2                       117399 non-null int64
address_3                       117399 non-null int64
address_4                       117399 non-null int64
address_5                       117399 non-null int64
address_6                       117399 non-null int64
address_7                       117399 non-null int64
address_8                       117399 non-null int64
address_9                       117399 non-null int64
address_10                      117399 non-null int64
address_11

# Model development

In [9]:
#dictionary of class weights
class_weight = {}
for i in y_train.unique():
    class_weight[i] = np.ceil(100* np.sum(y_train==i) /len(y_train))
class_weight

{0.0: 94.0, 1.0: 7.0}

### A first try ....

In [11]:
%%time
#bootstrap true ensures that each subsample has the same size as the trainning set allowing replacement 
rf_clf = RandomForestClassifier(n_estimators=5, bootstrap=True, max_depth=3,min_samples_split=10,min_samples_leaf=5,
                      n_jobs=-1,random_state=0,class_weight=class_weight)
rf_clf.fit(X_train,y_train)
#y_train_pred = rf_clf.predict(X_train)y_test_pred = rf_clf.predict(X_test)
print("Train AUC score: {}".format(roc_auc_score(rf_clf.predict(X_train),y_train)))
print("Test AUC score: {}".format(roc_auc_score(rf_clf.predict(X_test),y_test)))

Train AUC score: 0.9662049199904466
Test AUC score: 0.9650331061814316
Wall time: 470 ms


In [12]:
print("Feature Importances:")
feature_importances= dict(zip(rf_clf.feature_importances_,X_train.columns.to_numpy()))
feature_importances[0.0] = "All the rest not shown"
#feature_importances= dict(zip(X_train.columns.to_numpy(),rf_clf.feature_importances_))
feature_importances

Feature Importances:


{0.0: 'All the rest not shown',
 2.7105446622880483e-05: 'lat',
 4.747123810110245e-05: 'zip_code_2',
 6.687983602707283e-05: 'address_1',
 8.896874356588162e-05: 'zip_code_11',
 0.0002201403500001104: 'street_5',
 0.0006091420748572001: 'street_1',
 0.0010044686410029988: 'judgment_amount',
 0.0010469154556846543: 'address_8',
 0.0027649649185774302: 'address_3',
 0.003498762082919376: 'zip_code_10',
 0.011409804192552862: 'lon',
 0.0701118824527955: 'responsible_by_determination',
 0.14937730153806852: 'fine_amount',
 0.1630805340162775: 'responsible_by_deter',
 0.2266669271081013: 'responsible_by_default',
 0.36997873190484565: 'discount_amount'}

In [14]:
confusion_mat(y_pred=rf_clf.predict(X_test),y_true=y_test)

Unnamed: 0,Predicted 0,Predicted 1
True 0,19244,0
True 1,1447,27


## Cross-validation

In [15]:
important_features = [ feature_importances[i] for i in feature_importances if i>1e-3 ]
selected_features  = ['fine_amount', 'discount_amount', 'judgment_amount', 'lat', 'lon',
                     'responsible_by_deter', 'responsible_by_admission','responsible_by_default',
                     'responsible_by_determination']

parameters = {"n_estimators":[3,5,7,9,12,15], "max_depth":[2,3,5,7,9] , "min_samples_split":[10,50,100] , "min_samples_leaf": [1,10,50] }
model = RandomForestClassifier(n_jobs=-1,random_state=0,class_weight=class_weight,bootstrap=True)

### with AUC score

In [17]:
grid_clf = train_and_cross_validate(model,X_train[important_features],y_train,parameters,scoring="roc_auc")
print("best  score: {}".format(grid_clf.best_score_))
confusion_mat(y_true=y_test,y_pred=grid_clf.predict(X_test[important_features]))

--- 96.32846307754517 seconds ---
best  score: 0.7942398982901729


Unnamed: 0,Predicted 0,Predicted 1
True 0,19237,7
True 1,1264,210


### with F1 score

In [18]:
grid_clf = train_and_cross_validate(model,X_train[selected_features],y_train,parameters,scoring="f1")
print("best score: {}".format(grid_clf.best_score_))
confusion_mat(y_true=y_test,y_pred=grid_clf.predict(X_test[selected_features]))

--- 112.7369544506073 seconds ---
best score: 0.25624194913023246


Unnamed: 0,Predicted 0,Predicted 1
True 0,19237,7
True 1,1291,183


### with Recall

In [19]:
grid_clf = train_and_cross_validate(model,X_train[selected_features],y_train,parameters,scoring="recall")
print("best score: {}".format(grid_clf.best_score_))
confusion_mat(y_true=y_test,y_pred=grid_clf.predict(X_test[selected_features]))

--- 112.24975204467773 seconds ---
best score: 0.1482624448476719


Unnamed: 0,Predicted 0,Predicted 1
True 0,19237,7
True 1,1291,183


In [22]:
plot_cross_validation_results(parameters,grid_clf)

<IPython.core.display.Javascript object>



## Feature Importance

In [23]:
#sns.set(style='darkgrid',context='talk',palette='Blues_r',font_scale=0.8)
coefficients =  grid_clf.best_estimator_.feature_importances_
sorted_coef_index = coefficients.argsort()
feature_names = np.array(X_train[selected_features].columns)
feature_names = feature_names[ sorted_coef_index ].astype('str')
n = len(feature_names)
plt.figure(figsize=(9,0.38*n))
feat = feature_names[:]
coef = coefficients[sorted_coef_index][:]
ax = sns.barplot(y=feat,x=coef,orient='h',color='Blue')
ax.axes.set_title("Feature Importance (Coefficient)")
ax.tick_params(labelsize=9,which='major',pad=-2)
plt.tight_layout()
#plt.savefig("./report/feature_importance.png",format='png')

<IPython.core.display.Javascript object>