In [27]:
# Import packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from patsy import dmatrices

In [28]:
dta = pd.read_csv('../data/weekData.csv')

In [29]:
dta.describe()

Unnamed: 0.1,Unnamed: 0,household_key,QUANTITY,BASE_SPEND_AMT,NET_SPEND_AMT,LOY_CARD_DISC,COUPON_DISC,GET_EGGS,WEEK,NEXT_EGGS,X,BASKET_ID,PRODUCT_ID
count,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0,27076.0
mean,13538.5,504.015512,13.229908,38.945442,33.333592,-5.479414,-0.132436,0.217868,14.5,0.133033,27661.825233,35680140000.0,1196953.0
std,7816.312281,289.027343,21.843034,65.189523,55.231095,10.51018,0.971411,0.412805,8.077896,0.37428,45630.758174,1990048000.0,1682116.0
min,1.0,1.0,0.0,0.0,0.0,-455.26,-48.88,0.0,1.0,0.0,1.0,34577530000.0,32916.0
25%,6769.75,253.0,0.0,0.0,0.0,-6.97,0.0,0.0,7.75,0.0,3019.0,34642840000.0,831390.0
50%,13538.5,506.0,2.0,5.08,4.575,0.0,0.0,0.0,14.5,0.0,9927.0,34837410000.0,856716.0
75%,20307.25,756.0,19.0,54.51,46.905,0.0,0.0,0.0,21.25,0.0,28176.0,35386820000.0,929433.0
max,27076.0,1000.0,286.0,1551.11,1046.97,0.0,0.0,1.0,28.0,4.0,278461.0,42097840000.0,15741670.0


In [30]:
dta = dta.drop('Unnamed: 0', 1)

In [31]:
# Create X and y matrices
# X is the input feature matrix
# y is the column vector of the parameter we want to predict

y, X = dmatrices('NEXT_EGGS ~ QUANTITY + BASE_SPEND_AMT + NET_SPEND_AMT + LOY_CARD_DISC + COUPON_DISC + GET_EGGS', dta, return_type="dataframe")

In [32]:
# Percentage of data with eggs

print(y['NEXT_EGGS'].value_counts())
freq = y.sum(axis=0)[0] / y.shape[0]
print(freq)

0.0    23783
1.0     3007
2.0      266
3.0       17
4.0        3
Name: NEXT_EGGS, dtype: int64
0.133032944305


In [33]:
# Cross Validation folds with 20/80 split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [34]:
# Baseline predictions 
# Guessing 0 for everything

baselinepredict = [0 for x in range(0, y_test.shape[0])]

In [35]:
# Briar score on the xvaldiation

metrics.brier_score_loss(y_test, baselinepredict)

0.0009231905465288035

In [36]:
# Logistic regression model

model = LogisticRegression()
model.fit(X_train, y_train)
logisticpredict = model.predict(X_test)

  y = column_or_1d(y, warn=True)


In [37]:
# How many 1's and 0's of the prediction

print(np.count_nonzero(logisticpredict==1))
print(np.count_nonzero(logisticpredict==0))

800
4616


In [38]:
# Briar score on the xvalidation

metrics.brier_score_loss(y_test, logisticpredict)

0.1475258493353028

In [39]:
# Random forest model

model2 = RandomForestClassifier()
model2.fit(X_train, y_train)
forestpredict = model2.predict(X_test)



In [40]:
# How many 1's and 0's of the prediction

print(np.count_nonzero(forestpredict==1))
print(np.count_nonzero(forestpredict==0))

570
4839


In [41]:
# Briar score on the xvalidation

metrics.brier_score_loss(y_test, forestpredict)

ValueError: y_prob contains values greater than 1.

In [42]:
import xgboost as xgb

In [43]:
# XGB Boost model

gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train, y_train)
xgbpredict = gbm.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [44]:
# How many 1's and 0's of the prediction

print(np.count_nonzero(xgbpredict==1))
print(np.count_nonzero(xgbpredict==0))

505
4911


In [45]:
metrics.brier_score_loss(y_test, xgbpredict)

0.093057607090103397

In [47]:
df = pd.read_csv('../data/finalWeek.csv')


In [56]:
yans, Xans = dmatrices('NEXT_EGGS ~ QUANTITY + BASE_SPEND_AMT + NET_SPEND_AMT + LOY_CARD_DISC + COUPON_DISC + GET_EGGS', df, return_type="dataframe")

In [57]:
anspredict = gbm.predict(Xans)

In [58]:
print(len(Xans))

967


In [53]:
print(np.count_nonzero(anspredict==1))
print(np.count_nonzero(anspredict==0))

2386
24689


In [26]:
np.savetxt("../predictions.csv", xgbpredict, delimiter=",")