In [1]:
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
#import matplotlib.pyplot as plt
#from functools import reduce
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
#from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score,log_loss,confusion_matrix, plot_confusion_matrix,plot_roc_curve,f1_score

### Data

In [2]:
INPUT_PATH = "../../kkbox-churn-prediction-challenge/50_under_sample/"
FILE_NAME = 'train_all.csv'

In [3]:
df_train = pd.read_csv(INPUT_PATH + FILE_NAME).sort_values('msno')

In [4]:
df_train.shape

(53483, 66)

In [5]:
cat_attribs=['city','gender','registered_via','payment_method_id_most_common','payment_plan_days_most_common',
             #'plan_list_price_most_common',
             #'actual_amount_paid_most_common',
             'is_auto_renew_change_or_not','is_auto_renew_most_common',
             'is_cancel_change_or_not', 'discount_positive_binary_has', 
             'free_binary_has','potential_churn_binary_has']
num_attribs = ['register_days','date_count',
               'num_25_sum','num_50_mean','num_75_sum','num_75_std',
               'num_985_sum','num_985_std','num_100_sum', 'num_100_std','num_unq_mean','total_secs_sum', 
               'num_25_mean', 'num_25_std','num_50_sum','num_50_std', 
               'num_75_mean', 'num_985_mean', 
               'num_100_mean', 'num_unq_sum', 'num_unq_std',
               'total_secs_mean', 'total_secs_std',
               'payment_method_id_nunique','payment_plan_days_nunique',
               'plan_list_price_nunique','plan_list_price_mean','plan_list_price_std', # correlation
               'plan_list_price_most_common',# this originally is used as categorical
               'actual_amount_paid_most_common', # this originally is used as categorical
               'actual_amount_paid_nunique','actual_amount_paid_mean', 'actual_amount_paid_std',
               'is_auto_renew_pcet_of_zero','is_cancel_mean','discount_amount_find_positive_pct',
               'discount_amount_mean', 'free_mean', 
               'discount_amount_sum',
               'length_mean', 'length_sum','length_std', # correlation
               'amtperday_mean','amtperday_std', 
               'd_transaction_mean', 'd_transaction_std',
               'd_expire_date_mean', 'd_expire_date_std','membership_days', 
               'listen_member_pct','register_member_sub', # correlation
               'member_register_pct','bd']

In [6]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

In [7]:
cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_attribs)),
    ('onehot',OneHotEncoder(drop="first",categories="auto"))])
num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_attribs)),
    #('std_scaler',StandardScaler())
])
full_pipeline = FeatureUnion(transformer_list=[
    ("cat_pipeline",cat_pipeline),
    ("num_pipeline",num_pipeline)
])

In [8]:
X_train=df_train.iloc[:,2:66]
X_train=X_train.fillna(0)

In [9]:
X_prepared = full_pipeline.fit_transform(X_train)

In [10]:
y_train=df_train.iloc[:,1]

### Model

In [12]:
rf = RandomForestClassifier(max_depth=2, random_state=0)

In [13]:
rf.fit(X_prepared, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [26]:
f1_score(y_train,clf.predict(X_prepared))

0.6420770657325366

### Tune

In [20]:
parameters = {'n_estimators':[100,500,1000], 'max_features':['auto'],
              'max_depth':[2,10],'min_samples_split':[2, 5, 10],
              'min_samples_leaf':[1, 2, 4],'bootstrap':[True]}
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters)

In [22]:
rf1 = RandomForestClassifier(n_estimators=1000,max_depth=2,min_samples_split=2,min_samples_leaf=5)

In [23]:
rf1.fit(X_prepared, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
f1_score(y_train,rf1.predict(X_prepared))

0.6492301485914072

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 50)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [12]:
rfc = RandomForestClassifier()

In [15]:
rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=2, random_state=42)#, n_jobs = -1)

In [16]:
rf_random.fit(X_prepared, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] n_estimators=934, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=934, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False, total= 6.8min
[CV] n_estimators=934, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.8min remaining:    0.0s


[CV]  n_estimators=934, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False, total= 6.0min
[CV] n_estimators=363, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=True 


KeyboardInterrupt: 

In [None]:
rf_random.best_params_

### Validation

In [25]:
FILE_NAME = 'val_all.csv'
df_val = pd.read_csv(INPUT_PATH + FILE_NAME).sort_values('msno')

X_val = df_val.iloc[:,2:66]
X_val=X_val.fillna(0)

y_val = df_val.iloc[:,1]

X_val_prepared = full_pipeline.transform(X_val)

In [26]:
f1_score(y_val,rf1.predict(X_val_prepared))

0.296967309964553