In [1]:
import numpy as np
seed=123
np.random.seed(seed)

import pandas as pd
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
pd.set_option('display.float_format', lambda x: '%.3f' % x)

from math import floor

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split

In [6]:
EXPERIMENT_TAG = 'xgboost'

In [2]:
df = pd.read_csv(r'..\data\train.csv', low_memory=False)

In [3]:
feature_vector = [x for x in df.columns if 'target' not in x and 'ID_code' not in x]

In [4]:
X = df[feature_vector].values
y = df['target'].values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=seed)

In [7]:
model = XGBClassifier(n_estimators=1000)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [8]:
pred = model.predict(X_test)

In [9]:
accuracy_score(pred, y_test)

0.9218666666666666

In [10]:
roc_auc_score(pred, y_test)

0.8581243284812017

In [11]:
test_data = pd.read_csv(r'..\data\test.csv', low_memory = False)

In [12]:
text_X = test_data[feature_vector].values

In [13]:
test_data['target'] = model.predict(text_X)

In [14]:
test_data[['ID_code', 'target']].to_csv(EXPERIMENT_TAG + '_submission.csv', index=False)

In [15]:
test_data.shape

(200000, 202)