# Give Me Some Credit: fit a model on trainfull and predict on test with XGBoost (scikit-learn-style API)

## Load prepared datasets

In [1]:
from pandas import read_csv
# Load both prepared tables, using the first CSV column as the row index.
# The generator is consumed left to right, so test is read before trainfull.
test, trainfull = (
    read_csv(csv_path, index_col=0)
    for csv_path in ('../handson-ml2/test.csv', 'trainfull.csv')
)

Most ML libraries in Python expect inputs and outputs as separate arrays, e.g. `X_trainfull` (features) and `y_trainfull` (labels) for the training data.

Let's start with outputs:

In [3]:
# Name of the label column; the same name is dropped from the inputs in later cells.
target_column = 'SeriousDlqin2yrs'
# Extract the labels as a plain numpy array.
y_trainfull = trainfull.loc[:, target_column].values
print(y_trainfull)

[1 0 0 ... 0 0 0]


Inputs:

In [4]:
from pprint import pprint
# Inputs are every column except the label, converted to a numpy matrix.
# NOTE(review): in the printed matrix the first column runs 1 .. 150000, which
# looks like a row id rather than a predictor; confirm it should be fed to the model.
X_trainfull = trainfull.drop(columns=target_column).values
pprint(X_trainfull)

array([[1.00000000e+00, 7.66126609e-01, 4.50000000e+01, ...,
        7.00000000e+00, 7.32319702e+03, 1.79680298e+03],
       [2.00000000e+00, 9.57151019e-01, 4.00000000e+01, ...,
        4.00000000e+00, 3.16878123e+02, 2.28312188e+03],
       [3.00000000e+00, 6.58180140e-01, 3.80000000e+01, ...,
        2.00000000e+00, 2.58914887e+02, 2.78308511e+03],
       ...,
       [1.49998000e+05, 2.46043918e-01, 5.80000000e+01, ...,
        1.70000000e+01, 0.00000000e+00, 0.00000000e+00],
       [1.49999000e+05, 0.00000000e+00, 3.00000000e+01, ...,
        4.00000000e+00, 0.00000000e+00, 5.71600000e+03],
       [1.50000000e+05, 8.50282951e-01, 6.40000000e+01, ...,
        6.00000000e+00, 2.03875009e+03, 6.11924991e+03]])


Likewise for test data:

In [5]:
# Same input/output split as for the training table.
# y_test is not used again in the cells shown in this notebook.
y_test = test[target_column].values
X_test = test.drop(columns=target_column).values

In [6]:
from xgboost import XGBClassifier
# Classifier built with no arguments, i.e. with the default hyperparameters of
# the installed xgboost version (they are echoed in the output of the fit cell).
model = XGBClassifier()

In [7]:
# Train on the full training table; the fitted estimator is the cell's displayed value.
model.fit(X=X_trainfull, y=y_trainfull)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

## Apply model to test set

In [12]:
# predict_proba returns one column per class; keep column index 1.
# Presumably that is the probability of label 1 (labels look like 0/1) - verify against model.classes_.
class_probabilities = model.predict_proba(X_test)
y_test_proba = class_probabilities[:, 1]
print(y_test_proba)

[0.06173703 0.0376909  0.01267932 ... 0.00513491 0.08947054 0.04383596]


In [14]:
# Show the test input matrix (numpy prints a truncated view).
# The displayed rows include nan entries in the trailing columns.
X_test

array([[1.00000000e+00, 8.85519080e-01, 4.30000000e+01, ...,
        4.00000000e+00, 1.01182249e+03, 4.68817751e+03],
       [2.00000000e+00, 4.63295269e-01, 5.70000000e+01, ...,
        1.10000000e+01, 4.81947276e+03, 4.32152724e+03],
       [3.00000000e+00, 4.32750360e-02, 5.90000000e+01, ...,
        1.10000000e+01, 3.49531235e+03, 1.58768765e+03],
       ...,
       [1.01501000e+05, 8.15963730e-02, 7.00000000e+01, ...,
        3.00000000e+00,            nan,            nan],
       [1.01502000e+05, 3.35456547e-01, 5.60000000e+01, ...,
        6.00000000e+00,            nan,            nan],
       [1.01503000e+05, 4.41841663e-01, 2.90000000e+01, ...,
        1.20000000e+01, 1.17680108e+03, 4.73919892e+03]])

In [30]:
# First column of the test inputs, cast to integers.
# The :1 slice keeps it two-dimensional, i.e. a single-column matrix.
Id = X_test[:, :1].astype(int)
print(Id)

[[     1]
 [     2]
 [     3]
 ...
 [101501]
 [101502]
 [101503]]


## Submission to Kaggle

Combine the ids and the predicted probabilities into a DataFrame:

In [49]:
import pandas as pd
import numpy as np
# Build the submission table column by column so each column keeps its own
# dtype. Stacking ids and probabilities into one matrix would force the ids
# through float and require casting them back to int afterwards.
df = pd.DataFrame({
    # Id may be a single-column 2-D array; flatten it to one value per row.
    'Id': np.asarray(Id, dtype=int).ravel(),
    # Explicit float (float64) so the written CSV matches the stacked-matrix version.
    'Probability': np.asarray(y_test_proba, dtype=float),
})
df

Unnamed: 0,Id,Probability
0,1,0.061737
1,2,0.037691
2,3,0.012679
3,4,0.070756
4,5,0.078350
5,6,0.026118
6,7,0.051590
7,8,0.038601
8,9,0.005113
9,10,0.559284


In [51]:
# Write the submission table to the working directory.
# index=False leaves the DataFrame's row index out of the CSV.
df.to_csv("submitkaggle.csv", index=False)

In [52]:
# Read back the file that was just written (and is submitted below), not a
# copy in another directory, so this check reflects the actual submission.
submit = read_csv("submitkaggle.csv")
assert list(submit.columns) == ['Id', 'Probability'], "unexpected submission columns"

In [53]:
# Path of the CSV that is passed to the Kaggle API submit call.
submission_file = "submitkaggle.csv"

In [55]:
import kaggle
# Upload the submission file to the competition.
# NOTE: this performs a real upload every time the cell is run.
kaggle.api.competition_submit(
    file_name=submission_file,
    message="XGBClassifier",
    competition="GiveMeSomeCredit",
)

100%|██████████| 2.55M/2.55M [00:04<00:00, 639kB/s] 


Successfully submitted to Give Me Some Credit