In [1]:
#To make predictions I will use a gradient boosting method.
#For more info see: https://en.wikipedia.org/wiki/Gradient_boosting

import pandas as pd
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

#Locations of the training and test CSV files.
#NOTE(review): these are absolute, machine-specific paths; adjust them before running elsewhere.
loc = r"C:\Users\me\Documents\datasets\black_train_for_eda.csv"
loc2 = r"C:\Users\me\Documents\datasets\black_test_for_eda.csv"

#Read both files into DataFrames (training file first, then the test file).
train, test = pd.read_csv(loc), pd.read_csv(loc2)

In [2]:
#Shuffle train set.
#I'm taking a 60k sample of it for computational efficiency for this demonstration.
#If you have a powerful enough computer feel free to use all of the training data.
#The shuffle is seeded so that a fresh run of the notebook always selects the same 60k rows;
#without a fixed random_state every run would train and validate on different data.

train = train.sample(frac=1, random_state=42).reset_index(drop=True)
train = train[:60000]

In [3]:
#The identifier columns are removed from both the training and the test set.
#Before they are dropped from the test set, they are saved in `submit`.

id_columns = ['User_ID','Product_ID']

train = train.drop(id_columns, axis=1)
submit = pd.DataFrame(test, columns=id_columns)
test = test.drop(id_columns, axis=1)

In [4]:
#Hold out rows 50,000-59,999 as the validation set and keep the first 50,000 rows for training,
#then separate the target column (Purchase) from the features of each set.

val = train.iloc[50000:60000]
train = train.iloc[:50000]

X_train, y_train = train.drop('Purchase', axis=1), train['Purchase']
X_val, y_val = val.drop('Purchase', axis=1), val['Purchase']

In [None]:
#Create a function to find optimal hyperparameters for our model. The reason we have to
#use a function is due to the way Python uses parallelization on Windows.
#https://www.kaggle.com/c/malware-classification/forums/t/12802/
#anyone-getting-parallelizing-error-for-scikit-learn-based-models-in-python/66187#post66187

#Note: numerous different values were used in the param_grid to home in on the best parameter
#combinations. The param grid below is what I ended up with after running the model several
#times (50ish!)

def model(X_train, X_val, y_train, y_val, param_grid=None):
    """Grid-search GradientBoostingRegressor hyperparameters and report the validation score.

    Parameters
    ----------
    X_train, y_train
        Training features and target used to fit the grid search.
    X_val, y_val
        Held-out features and target used to score the fitted search.
    param_grid : dict, optional
        Mapping of hyperparameter name to a list of candidate values. When omitted,
        the grid that the earlier tuning runs settled on is used, so existing
        four-argument calls behave as before.

    Returns
    -------
    dict or None
        The best parameter combination found by the search, or None when this code
        is not running as ``__main__`` (the search is skipped in that case).
    """
    #The search only runs from the main module. This guard is the workaround for the way
    #Python parallelises on Windows, which matters because n_jobs=-1 is used below.
    if __name__ != '__main__':
        return None

    if param_grid is None:
        param_grid = {'learning_rate': [0.047],
                      'max_depth': [9],
                      'min_samples_leaf': [18],
                      'max_features': [0.88],
                      'n_estimators': [100, 300, 500]
                      }

    estimator = GridSearchCV(estimator=GradientBoostingRegressor(),
                             param_grid=param_grid,
                             n_jobs=-1)

    estimator.fit(X_train, y_train)

    #No `scoring` is passed to GridSearchCV, so score() falls back to the regressor's own
    #default score, which is R^2. The label says so instead of calling it "accuracy".
    validation_score = estimator.score(X_val, y_val)
    print('Validation R^2: ', validation_score)

    return estimator.best_params_

In [None]:
#Fit model with best parameters then make predictions on the test set.
#random_state is fixed because max_features < 1 makes every split consider a random subset of
#the features; without a seed the fitted model and its predictions change from run to run.
#NOTE(review): this assignment rebinds the name `model`, which until now referred to the
#hyperparameter-search function defined earlier in the notebook.

model = GradientBoostingRegressor(learning_rate=0.047,
                          max_depth=9,
                          min_samples_leaf=18,
                          max_features=0.88,
                          n_estimators=300,
                          random_state=42)

model.fit(X_train, y_train)

preds = model.predict(test)

In [None]:
#Create dataframe for submission to the data mining competition.
#`submit` already holds the test set's User_ID and Product_ID columns (they were saved before
#being dropped from `test`), so the predictions are attached to those rows here. The bare names
#User_ID and Product_ID were never defined, so referring to them raised a NameError.
#Selecting the two identifier columns first keeps this cell safe to re-run and fixes the column
#order as User_ID, Product_ID, Purchase without needing a separate reindex step.

submit = submit[['User_ID', 'Product_ID']].assign(Purchase=preds)

In [None]:
#This model is the best that I have come up with so far. It scored 4223.02891 on the
#leaderboard, beating the baseline model (4982.31994).

In [None]:
#Summary statistics of the predicted Purchase values.
submit['Purchase'].describe()

In [None]:
#The min value here is -1956.66, clearly negative values are errors. We can't have negative
#purchase values!

In [None]:
#Histogram of the predicted Purchase values, using 30 bins.
submit['Purchase'].hist(bins=30)

In [None]:
#Here we can see that there are a fair amount of impossible predictions.
#Let's set the negative predictions to different values and see if our score improves.

In [None]:
#After testing various different numbers the best turned out to be 11,000.
#Every negative prediction is replaced with that value. A single .loc assignment is used so the
#change is written to `submit` itself rather than to a temporary copy (chained indexing such as
#submit.Purchase[mask] = value is not guaranteed to modify the original frame).
#NOTE(review): the previous condition was `== 9000`, which matches no negative prediction;
#`< 0` follows the stated intent of replacing the negative values - confirm.
submit.loc[submit['Purchase'] < 0, 'Purchase'] = 11000

#Best score: 4052.9933497