In [52]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [6]:
# Read the zipped CSV and discard the 'id' column in one expression, so that
# re-running this cell always rebuilds `df` from the file.
df = pd.read_csv('topics_df.zip', compression='zip').drop('id', axis=1)

In [16]:
# Class labels for the binarised target.
labels = [0, 1]

# Features: every column except 'score'.
X = df.drop('score', axis=1)

# Target: scores in [0, 250] map to label 0, scores in (250, 100000] map to
# label 1. pd.cut leaves values outside the outer bin edges as NaN.
Y = pd.cut(df['score'], bins=[0, 250, 100000], labels=labels, include_lowest=True)

In [17]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [18]:
# Gradient-boosted tree classifier for the binary target.
xgbmodel = XGBClassifier(
    learning_rate=0.1,            # shrinkage applied to each boosting step
    n_estimators=1000,            # number of boosted trees
    max_depth=5,                  # maximum depth of each tree
    min_child_weight=1,
    subsample=0.7,                # fraction of rows sampled per tree
    colsample_bytree=0.7,         # fraction of columns sampled per tree
    objective='binary:logistic',  # binary classification, probability output
    nthread=3,                    # number of worker threads
    seed=42,                      # fixed seed for reproducible sampling
)

In [23]:
# Train the classifier on the training split; the fitted estimator's repr is
# shown as the cell output.
xgbmodel.fit(x_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=3,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7)

In [38]:
labels = [0, 1]

# Hard class predictions on the training split.
train_predictions = xgbmodel.predict(x_train)

# Second column of predict_proba for each test row (presumably the
# probability of label 1, given the 0/1 target).
test_probs = xgbmodel.predict_proba(x_test)[:, 1]

# Threshold the probabilities: values in [0, 0.057] become label 0 and
# values in (0.057, 1] become label 1.
y_pred = pd.cut(test_probs, bins=[0, 0.057, 1], labels=labels, include_lowest=True)
    

In [48]:
# Quick check of how many class labels are defined.
len(labels)

2

In [53]:
# Per-class precision/recall/F1 on the test split for the thresholded predictions.
print(
    classification_report(
        y_true=y_test,
        y_pred=np.array(y_pred),
        labels=labels,
    )
)
# Confusion matrix with rows and columns ordered as given by `labels`.
print(
    confusion_matrix(
        y_true=y_test,
        y_pred=y_pred,
        labels=labels,
    )
)

             precision    recall  f1-score   support

          0       0.96      0.78      0.86     41896
          1       0.09      0.40      0.15      2418

avg / total       0.91      0.76      0.82     44314

[[32607  9289]
 [ 1443   975]]


In [54]:
def fitXGB(xgbmodel, x_train, x_test, y_train, y_test, cutoff = 0.056, print_report = True):
    """Fit ``xgbmodel`` and evaluate its thresholded predictions on the test split.

    The model is fitted in place on ``x_train``/``y_train``. The second
    column of ``predict_proba(x_test)`` is then converted to hard labels:
    1 where the probability is strictly greater than ``cutoff``, else 0.

    Parameters
    ----------
    xgbmodel : object
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Test features passed to ``predict_proba`` and the true labels the
        predictions are compared against.
    cutoff : float, default 0.056
        Probability threshold in the closed interval [0, 1].
    print_report : bool, default True
        When true, print the classification report and confusion matrix.
        When false, return the accuracy instead of printing.

    Returns
    -------
    float or None
        ``accuracy_score(y_test, y_pred)`` when ``print_report`` is false;
        ``None`` when the report is printed.

    Raises
    ------
    ValueError
        If ``cutoff`` is not within [0, 1].
    """
    # Validate before the (potentially slow) fit so a bad cutoff fails fast.
    if not 0 <= cutoff <= 1:
        raise ValueError("cutoff must be within [0, 1], got {!r}".format(cutoff))

    xgbmodel.fit(x_train, y_train)

    # Cast to float64 so the comparison with `cutoff` is done in double
    # precision regardless of the dtype the model returns.
    test_probs = np.asarray(xgbmodel.predict_proba(x_test)[:, 1], dtype=np.float64)
    labels = [0, 1]

    # A plain comparison instead of binning with edges [0, cutoff, 1]: the
    # labels are the same, but it also works when cutoff is exactly 0 or 1,
    # where the bin edges would not be unique.
    y_pred = (test_probs > cutoff).astype(int)

    if print_report:
        print(classification_report(y_test, y_pred, labels=labels))
        print(confusion_matrix(y_test, y_pred, labels=labels))
    else:
        return accuracy_score(y_test, y_pred)

In [55]:
# Refit the model and print the test-set report using fitXGB's default cutoff.
fitXGB(xgbmodel, x_train, x_test, y_train, y_test)

             precision    recall  f1-score   support

          0       0.96      0.77      0.85     41896
          1       0.09      0.41      0.15      2418

avg / total       0.91      0.75      0.82     44314

[[32299  9597]
 [ 1422   996]]
