In [1]:
import sklearn.tree
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation, metrics
from timeit import default_timer as timer

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Notebook-wide default figure size as (width, height): wide and short.
rcParams['figure.figsize'] = (12, 4)



In [2]:
# Read the feature table and the label table, and keep a merged copy (joined on
# the columns the two CSVs share) for the cells that select columns by name.
features_pd = pd.read_csv('ring_features_1ex.csv.bz2', compression='bz2')
targets_pd = pd.read_csv('ring_targets_1ex.csv')
full_pd = features_pd.merge(targets_pd)

# Plain NumPy views of the same data with the first column of each frame dropped.
# `.values` is used instead of DataFrame.as_matrix(), which newer pandas no
# longer provides; it returns the same array on old and new versions.
features = features_pd.values[:, 1:]
targets = targets_pd.values[:, 1:].flatten()
print(features.shape, targets.shape)

# Later cells index `features`, `targets` and `full_pd` with one shared boolean
# mask, so all three must describe the same number of rows.
assert features.shape[0] == targets.shape[0], "features and targets have different row counts"
assert len(full_pd) == targets.shape[0], "merge changed the number of rows"

FileNotFoundError: [Errno 2] No such file or directory: 'ring_features_1ex.csv.bz2'

In [None]:
# Random split: each row goes to the training set with probability 0.75 and to
# the validation set otherwise. A dedicated seeded generator keeps the split
# (and therefore every score computed below) identical across kernel restarts.
split_rng = np.random.RandomState(41083)
tmask = split_rng.choice([False, True], len(targets), p=[0.25, 0.75])
vmask = np.invert(tmask)
train_X = features[tmask]
train_Y = targets[tmask]
test_X = features[vmask]
test_Y = targets[vmask]

# The mean of the label array is the fraction of positive ("valuable") rows.
print("Training set: {} items, {:.1f}% valuable".format(train_X.shape[0], 100 * train_Y.mean()))
print("Validation set: {} items, {:.1f}% valuable".format(test_X.shape[0], 100 * test_Y.mean()))

# Column names used when working with the merged DataFrame instead of the arrays.
target = 'valuable'
predictors = [x for x in full_pd.columns if x not in [target, 'Unnamed: 0']]


In [None]:
# Grid-search `colsample_bytree` with 5-fold CV (scored by ROC AUC) while every
# other XGBoost hyper-parameter stays fixed at the values in `ind_params`.
cv_params = {'colsample_bytree': [0.6,0.7,0.8,0.9]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 100, 'max_depth':9, 'min_child_weight':6, 'gamma':0.15,
              'seed':41083, 'subsample': 0.8, 'colsample_bytree':0.8, 'objective': 'binary:logistic',
              'scale_pos_weight':1}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), param_grid=cv_params, cv=5, n_jobs=4, iid=False, 
                             scoring='roc_auc')

# Time the search so readers know what a re-run costs.
start = timer()
optimized_GBM.fit(train_X, train_Y)
deltaTime = timer() - start

# Split the elapsed seconds into whole hours, whole minutes and leftover
# seconds; h and m are integers so the message reads "0h 3m 12.3s" rather than
# "0.0h 3.0m 12.3456789s".
m, s = divmod(deltaTime, 60)
h, m = divmod(int(m), 60)
print("Training done after {}h {}m {:.1f}s".format(h, m, s))

In [None]:
# Show the per-candidate CV scores, the winning parameters and the best score.
(
    optimized_GBM.grid_scores_,
    optimized_GBM.best_params_,
    optimized_GBM.best_score_,
)

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and report how it performs on that same data.

    The label column is taken from the notebook-level variable ``target``.

    Parameters
    ----------
    alg : estimator
        Object exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict`` and ``predict_proba`` (e.g. ``xgb.XGBClassifier``).
        It is modified in place: ``n_estimators`` may be changed and the
        estimator is fitted.
    dtrain : pandas.DataFrame
        Rows to train on; must contain the ``predictors`` columns and ``target``.
    predictors : list
        Names of the feature columns.
    useTrainCV : bool
        When true, run ``xgb.cv`` first and set ``n_estimators`` to the number
        of rows in its result before fitting.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        Prints accuracy and AUC on the training rows and draws a bar chart of
        the booster's feature scores.
    """
    train_features = dtrain[predictors]
    train_labels = dtrain[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train_features.values, label=train_labels.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        # One row per boosting round that was kept, so the row count is the
        # number of trees to train in the final fit.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(train_features, train_labels, eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(train_features)
    dtrain_predprob = alg.predict_proba(train_features)[:,1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(train_labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(train_labels, dtrain_predprob))

    # Newer xgboost exposes the trained Booster through get_booster(); there
    # `booster` is a plain parameter and calling it fails. Fall back to the
    # older booster() accessor only when get_booster() is not available.
    get_booster = getattr(alg, 'get_booster', None)
    booster = get_booster() if callable(get_booster) else alg.booster()

    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

# Training rows of the merged frame (same mask as the array-based split).
dtrain = full_pd[tmask]

# Start from the fixed hyper-parameters and overlay the grid-search winners.
params = dict(ind_params)
params.update(optimized_GBM.best_params_)

model = xgb.XGBClassifier(**params)
modelfit(alg=model, dtrain=dtrain, predictors=predictors)

In [None]:
# Evaluate the fitted model on the held-out validation rows.
validation_X = full_pd[vmask][predictors]

# Hard 0/1 labels drive the confusion counts; the precision-recall curve and
# average precision need continuous scores (probability of the positive class),
# otherwise the "curve" collapses to the single operating point of predict().
predictions = model.predict(validation_X)
scores = model.predict_proba(validation_X)[:, 1]
# NOTE(review): `ones` is not used in this cell; kept in case another cell reads it.
ones = np.asarray([1]*predictions.shape[0])
precision, recall, _ = metrics.precision_recall_curve(test_Y, scores)
average_precision = metrics.average_precision_score(test_Y, scores)

# Confusion-matrix counts, vectorised. Anything that is not a clean 0/1 pair
# ends up in `invalid_stuff`.
actual = np.asarray(test_Y)
predicted = np.asarray(predictions)
true_positives = int(np.sum((actual == 1) & (predicted == 1)))
true_negatives = int(np.sum((actual == 0) & (predicted == 0)))
false_positives = int(np.sum((actual == 0) & (predicted == 1)))
false_negatives = int(np.sum((actual == 1) & (predicted == 0)))
invalid_stuff = len(predicted) - (true_positives + true_negatives + false_positives + false_negatives)


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; NaN when ``whole`` is 0."""
    return 100 * part / whole if whole else float('nan')


print("Valuable Items Wasted: {} ({:.1f}%)".format(false_negatives, _percent(false_negatives, false_negatives + true_positives)))
print("Worthless Items Kept: {} ({:.1f}%)".format(false_positives, _percent(false_positives, false_positives + true_negatives)))

print("Precision:", precision)
print("Recall:", recall)

plt.clf()
plt.plot(recall, precision, lw=2, color='navy', label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('AUC={:0.2f}'.format(average_precision))
plt.legend(loc='lower left')
plt.show()

In [None]:
def modefit(alg, dtrain, dtest, predictors, use_train_cv=True, cv_folds=5, early_stopping_rounds=50):
    """Unfinished CV-then-fit helper; nothing in the visible notebook calls it.

    When ``use_train_cv`` is true, runs ``xgb.cv`` on ``dtrain`` with the
    estimator's parameters and sets ``alg``'s ``n_estimators`` to the number of
    rows in the CV result. It then calls ``alg.fit`` with ``dtrain`` only.

    NOTE(review): ``dtest`` and ``predictors`` are accepted but never used.
    NOTE(review): ``dtrain`` is handed straight to ``xgb.cv`` and also to
    ``alg.fit`` without any labels, so the final call looks incomplete --
    confirm the intended inputs before using this function.
    NOTE(review): ``show_progress`` may not be accepted by every xgboost
    release (other code in this notebook calls ``xgb.cv`` without it) -- verify.
    """
    if use_train_cv:
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(xgb_param, dtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        # Number of rows in the CV result becomes the number of trees to fit.
        alg.set_params(n_estimators=cvresult.shape[0])
        
    # Only one positional argument is passed here (no label argument).
    alg.fit(dtrain, )

In [None]:
# Decision-tree baseline trained on the same array split.
# Note: this rebinds `model`, so the name no longer refers to whatever it held before.
model = sklearn.tree.DecisionTreeClassifier(criterion='gini')
model.fit(train_X, train_Y)

# Only the last expression of a cell is displayed, so a bare score() call in
# the middle of the cell would be computed and thrown away. Print the training
# accuracy explicitly, plus the validation accuracy for comparison.
print("Decision tree accuracy (train): {:.4f}".format(model.score(train_X, train_Y)))
print("Decision tree accuracy (validation): {:.4f}".format(model.score(test_X, test_Y)))

prediction = model.predict(test_X)


In [None]:
# Per-feature multipliers read from a headerless two-column CSV (index, value).
# read_csv(header=None, index_col=0) + first column reproduces what
# Series.from_csv(...).as_matrix() returned, and works on pandas versions that
# no longer ship those two methods.
# NOTE(review): presumably these are the per-feature maxima used to undo a
# normalisation -- confirm against whatever produced ring_max.csv.
scale = pd.read_csv('ring_max.csv', header=None, index_col=0).iloc[:, 0].values

# `test_X` comes from `features`, which is features_pd with its first column
# dropped, so array column j corresponds to features_pd.columns[j + 1]. Looking
# names up in this shifted index keeps each printed name next to its own value.
feature_names = features_pd.columns[1:]

# For the first validation rows, list the non-zero (rescaled) features of every
# row that was predicted positive.
for i,p in enumerate(prediction):
    if p == 1:
        f = test_X[i] * scale
        for j,v in enumerate(f):
            if v > 0:
                print(feature_names[j], v)
        print('------------------------------')
    # The check runs after the row is handled, so rows 0..11 are inspected.
    if i > 10:
        break