In [1]:
import gc
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

import utils

%matplotlib inline

In [2]:
# Read the preprocessed train and test splits (in that order) into `df` and `df_test`.
df, df_test = map(
    pd.read_csv,
    ('data/processed_train.csv', 'data/processed_test.csv'),
)

In [3]:
# Every column of the training frame whose name contains the substring 'calc'.
CALC_COLUMNS = [name for name in df.columns if 'calc' in name]

# Drop those columns from both frames in place so train and test keep the same feature set.
for frame in (df, df_test):
    frame.drop(columns=CALC_COLUMNS, inplace=True)

# Cell output: number of columns left in the training frame.
len(df.columns)

39

In [4]:
# Columns whose name ends in 'bin' or 'cat' are treated as categorical ...
CATEGORICAL_COLUMNS = [col for col in df.columns if col.endswith(('bin', 'cat'))]
# ... and every remaining column is treated as numeric.
NUMERIC_COLUMNS = [col for col in df.columns if col not in CATEGORICAL_COLUMNS]

# 'id' and 'target' are taken out of the numeric list so they are not used as features.
# list.remove raises ValueError if either name is missing from the frame.
for non_feature in ('id', 'target'):
    NUMERIC_COLUMNS.remove(non_feature)

In [5]:
# Run the project helper `utils.cast_save_memmory` on the train and test frames, passing the
# numeric and categorical column lists built above.
# NOTE(review): the helper is defined outside this notebook. Its name suggests it casts columns
# to smaller dtypes to save memory, and since no return value is captured it presumably
# modifies the frames in place — confirm in utils.py.
utils.cast_save_memmory(df, NUMERIC_COLUMNS, CATEGORICAL_COLUMNS)
utils.cast_save_memmory(df_test, NUMERIC_COLUMNS, CATEGORICAL_COLUMNS)

In [6]:
# Base random forest for the search: 100 trees, balanced class weights, fixed seed.
# Only max_depth and min_samples_leaf are varied by the grid below.
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
    verbose=0,
)

# Candidate values to try (3 x 3 combinations). Despite the variable's name, this dict is
# passed to GridSearchCV as its exhaustive parameter grid.
param_distributions = {
    'max_depth': [6, 9, 12],
    'min_samples_leaf': [100, 200, 300],
}

# Score each combination by ROC AUC; the cross-validation splitter is left at GridSearchCV's default.
grid_clf = GridSearchCV(
    estimator=clf,
    param_grid=param_distributions,
    scoring='roc_auc',
    verbose=0,
)
grid_clf.fit(df[NUMERIC_COLUMNS + CATEGORICAL_COLUMNS], df['target'])

GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced',
                                              n_jobs=-1, random_state=0),
             param_grid={'max_depth': [6, 9, 12],
                         'min_samples_leaf': [100, 200, 300]},
             scoring='roc_auc')

In [7]:
# Display the estimator the grid search refit with its best-scoring parameter combination.
grid_clf.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=12,
                       min_samples_leaf=300, n_jobs=-1, random_state=0)

In [8]:
# Final model: same configuration as the searched forest but with 200 trees, using the
# max_depth / min_samples_leaf combination selected by the grid search.
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=grid_clf.best_params_['max_depth'],
    min_samples_leaf=grid_clf.best_params_['min_samples_leaf'],
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
    verbose=0,
)

# Train on the full training frame, passing plain arrays (`.values`) rather than DataFrames.
clf.fit(
    df[NUMERIC_COLUMNS + CATEGORICAL_COLUMNS].values,
    df['target'].values,
)

# Per-class probabilities for every test row, one column per class.
y_pred = clf.predict_proba(
    df_test[NUMERIC_COLUMNS + CATEGORICAL_COLUMNS].values
)

In [10]:
# Load the sample submission file to use as the template for the output file.
sol = pd.read_csv('data/sample_submission.csv')

In [11]:
# Overwrite the submission's second column with column 1 of the predict_proba output.
# NOTE(review): the assignment is purely positional, so it assumes the rows of
# sample_submission.csv are in the same order as df_test — verify the ids line up.
# Column 1 is presumably P(target == 1), i.e. clf.classes_ == [0, 1] — confirm.
sol.iloc[:, 1] = y_pred[:, 1]

In [12]:
# Write the submission file without the DataFrame index.
# to_csv does not create missing parent directories, so make sure 'solutions/' exists first;
# otherwise a fresh checkout fails here only after all the training above has finished.
os.makedirs('solutions', exist_ok=True)
sol.to_csv('solutions/rf_baseline.csv', index=False)

In [15]:
# Table pairing each feature name (in the column order used for training) with the fitted
# forest's importance for it. Built from a Series indexed by feature name so that a length
# mismatch between names and importances still raises instead of being truncated.
pd.Series(
    clf.feature_importances_,
    index=pd.Index(NUMERIC_COLUMNS + CATEGORICAL_COLUMNS, name='features'),
    name='importance',
).reset_index()

Unnamed: 0,features,importance
0,ps_ind_01,0.02387
1,ps_ind_03,0.047211
2,ps_ind_14,7.3e-05
3,ps_ind_15,0.045233
4,ps_reg_01,0.036528
5,ps_reg_02,0.051924
6,ps_reg_03,0.096228
7,ps_car_11,0.010519
8,ps_car_12,0.03146
9,ps_car_13,0.149384
