In [1]:
#importing data
import pandas as pd
import numpy as np
np.random.seed(1)
df_train = pd.read_csv('Train.csv')
df_test = pd.read_csv('Test.csv')

In [2]:
#creating test and train sets
X=df_train.drop('IsUnderRisk', axis = 1)
y=df_train['IsUnderRisk']

In [3]:
#importing packages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [4]:
#train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = .3, random_state = 42)

#Initializing Classifiers
xgb=XGBClassifier()
rf=RandomForestClassifier()

#param_grid for each classifier
params_class = {xgb : {'eta' : np.linspace(0.1, 1, 10), 
             'max_depth' : range(3, 11)},  
                rf : {'n_estimators' : range(10, 150, 10),
                     'criterion' : ['gini', 'entropy'],
                     'max_depth' : range(2, 11),
                     }
             }
scorer=make_scorer(log_loss)
classifiers = [xgb, rf]
for c in classifiers:
    gcv=GridSearchCV(c, param_grid=params_class[c], scoring=scorer, cv=10, refit = True)
    print("For classifier {}".format(c))
    gcv.fit(X_train, y_train)
    predictions = gcv.predict(X_test)
    print(gcv.score(X_test, y_test))
    print("Best Parameters for {} is {}".format(type(c), gcv.best_params_))
    print('\n\n')

For classifier XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)




5.2974123033404865
Best Parameters for <class 'xgboost.sklearn.XGBClassifier'> is {'eta': 0.1, 'max_depth': 10}



For classifier RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)




4.873569675355496
Best Parameters for <class 'sklearn.ensemble.forest.RandomForestClassifier'> is {'criterion': 'gini', 'max_depth': 2, 'n_estimators': 10}



For classifier LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
6.7807118833639
Best Parameters for <class 'sklearn.linear_model.logistic.LogisticRegression'> is {}







In [7]:
rf=RandomForestClassifier(max_depth = 2, n_estimators=10)
rf.fit(X_train, y_train)
pred=rf.predict(X_test)
accuracy_score(pred, y_test)
submission = rf.predict_proba(df_test)

In [15]:
pd.DataFrame(submission).to_excel('sub1.xlsx', index = False)

In [13]:
pred

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1], dtype=int64)