In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tnrange, tqdm_notebook
import gc

In [2]:
# Use seaborn's 'talk' plotting context for every figure in this notebook.
sns.set_context(context='talk')

# Read the data

In [3]:
# Feature tables are indexed by their 'id' column; the target file has no
# header row, so its two columns are named explicitly.
dfXtrain = pd.read_csv(
    'preprocessed_csv/train_more.csv',
    index_col='id',
)
dfXtest = pd.read_csv(
    'preprocessed_csv/test_more.csv',
    index_col='id',
)
dfYtrain = pd.read_csv(
    'preprocessed_csv/y_train_more.csv',
    header=None,
    names=['INDEX', 'P_TARGET_FLAG'],
)

In [4]:
# Convert both feature frames to plain NumPy arrays in one pass.
x_train, x_test = (np.array(frame) for frame in (dfXtrain, dfXtest))

# Target vector taken from the 'P_TARGET_FLAG' column of the label frame.
y_train = np.array(dfYtrain.loc[:, 'P_TARGET_FLAG'])

# Save routines

In [5]:
# Output template: one row per *training* id (despite the "Ytest" in the name),
# with a zero-filled prediction column that save_to_file() overwrites.
dfYtest_stacking = pd.DataFrame({
    'INDEX': dfXtrain.index,
    'P_TARGET_FLAG': np.zeros(len(x_train)),
})
dfYtest_stacking.head()

Unnamed: 0,INDEX,P_TARGET_FLAG
0,1,0.0
1,2,0.0
2,4,0.0
3,5,0.0
4,6,0.0


In [6]:
def save_to_file(y, file_name):
    """Write predictions ``y`` to ``stacking/<file_name>`` as a CSV file.

    The predictions are stored in the ``P_TARGET_FLAG`` column of the
    module-level ``dfYtest_stacking`` frame (which is therefore modified as a
    side effect), and the whole frame is written without its row index.

    Parameters
    ----------
    y : array-like
        Values assigned to the ``P_TARGET_FLAG`` column; pandas requires them
        to be compatible with the length of ``dfYtest_stacking``.
    file_name : str
        Name of the CSV file to create inside the ``stacking`` directory.
    """
    import os

    path = 'stacking/{}'.format(file_name)

    # Create the target directory up front: to_csv() does not create missing
    # directories, and failing here would throw away a long training run.
    target_dir = os.path.dirname(path)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    dfYtest_stacking['P_TARGET_FLAG'] = y
    dfYtest_stacking.to_csv(path, index=False)

# Train RF

In [7]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same helper
# lives in `sklearn.model_selection`. Fall back to the legacy module only for
# old installations that predate `model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [8]:
# Recorded elimination schedule (49 entries). The loops further down read
# out_feature[features_count] and pass it to np.delete as the position of the
# column to remove at that step.
# NOTE(review): presumably produced by an earlier feature-elimination run -
# the origin of these numbers is not visible in this notebook.
out_feature = np.array(
    [
        0, 0, 0, 2, 0, 4, 4, 6, 8, 0, 6,
        7, 12, 6, 1, 13, 15, 7, 8, 16, 15, 19,
        2, 18, 12, 17, 14, 10, 1, 24, 18, 25, 22,
        15, 34, 9, 30, 13, 32, 5, 34, 17, 19, 34,
        35, 7, 33, 14, 46,
    ],
    dtype=float,
)
out_feature

array([  0.,   0.,   0.,   2.,   0.,   4.,   4.,   6.,   8.,   0.,   6.,
         7.,  12.,   6.,   1.,  13.,  15.,   7.,   8.,  16.,  15.,  19.,
         2.,  18.,  12.,  17.,  14.,  10.,   1.,  24.,  18.,  25.,  22.,
        15.,  34.,   9.,  30.,  13.,  32.,   5.,  34.,  17.,  19.,  34.,
        35.,   7.,  33.,  14.,  46.])

In [9]:
# Lowest out_feature index that the elimination loops below replay: they
# iterate features_count from x_test.shape[1] - 1 down to last_out inclusive.
last_out = 35

In [10]:
# Number of schedule entries at index last_out and above.
len(out_feature) - last_out

14

In [11]:
%%time

# Replay the recorded elimination schedule on the column *names* so the
# dropped features can be listed afterwards.
columns = np.array(dfXtest.columns.tolist())

for features_count in tqdm_notebook(range(x_test.shape[1] - 1, last_out - 1, -1), desc='features count'):
    # out_feature holds floats, but np.delete needs an integer position:
    # newer NumPy releases reject non-integer indices instead of casting them.
    out = int(out_feature[features_count])
    columns = np.delete(columns, out)


CPU times: user 35.7 ms, sys: 3.74 ms, total: 39.4 ms
Wall time: 173 ms


In [12]:
# Names present in the original test frame but no longer in `columns`.
set(dfXtest.columns).difference(columns)

{'age_nan',
 'car_age',
 'car_type_Sports Car',
 'clm_freq',
 'education_<High School',
 'education_Bachelors',
 'education_Masters',
 'education_PhD',
 'income_ano',
 'income_nan',
 'job_Other',
 'job_z_Blue Collar',
 'yoj',
 'yoj_ano'}

In [13]:
%%time

# Apply the same elimination schedule to the train and test matrices; the
# originals (x_train / x_test) are left untouched because np.delete returns
# new arrays.
X = x_train
Xtest = x_test

for features_count in tqdm_notebook(range(x_test.shape[1] - 1, last_out - 1, -1), desc='features count'):
    # out_feature holds floats, but np.delete needs an integer position:
    # newer NumPy releases reject non-integer indices instead of casting them.
    out = int(out_feature[features_count])
    X = np.delete(X, out, axis=1)
    Xtest = np.delete(Xtest, out, axis=1)



CPU times: user 93.5 ms, sys: 2.85 ms, total: 96.3 ms
Wall time: 93.9 ms


In [16]:
%%time

# Train 20 random forests that differ only in their seed and accumulate, for
# every training row, the predicted probability of the class in column 1.
#
# The probabilities are taken from the out-of-bag estimates rather than from
# predict_proba(X): scoring the very rows a forest was fitted on yields
# in-sample predictions that leak the labels, which makes them unsuitable as
# stacking features. OOB estimates only use trees that did not see the row.
kwargs = {'n_estimators': 5000, 'criterion': 'entropy', 'n_jobs': -1, 'oob_score': True}

proba = np.zeros(X.shape[0])

for random_state in tqdm_notebook(range(1, 21), desc='random_state'):
    rfc = RFC(random_state=random_state, **kwargs)
    rfc.fit(X, y_train)
    # Shape (n_samples, n_classes); columns follow rfc.classes_, exactly as
    # predict_proba's would. A row would only be NaN if it was never left out
    # of a bootstrap sample, which is not expected with 5000 trees.
    proba += rfc.oob_decision_function_[:, 1]
    # Free the forest before fitting the next one to limit peak memory.
    del rfc
    gc.collect()


CPU times: user 53min 41s, sys: 23.9 s, total: 54min 5s
Wall time: 16min 49s


In [17]:
# Turn the accumulated sum into a mean over the 20 seeds used above (in place).
# Re-running this cell would divide a second time.
proba[:] = proba / 20

In [18]:
# Persist the averaged predictions.
save_to_file(
    y=proba,
    file_name='more_35_best_100_000_cycle_stacking.csv',
)

In [19]:
# Run a garbage collection pass; the cell output is the value gc.collect() returns.
gc.collect()

9