In [1]:
import sklearn
import numpy as np
import pandas as pd
import collections

### Import Train and Test data

In [2]:
train_file = "data/even_train.csv"
test_file = "data/TestSet.csv"

In [3]:
train_df = pd.read_csv(train_file, header = 0)
test_df = pd.read_csv(test_file, header = 0)

In [4]:
features = ["Price","PricePercent","StartingBidPercent","SellerClosePercent","Category","StartingBid","AvgPrice","EndDay","HitCount","AuctionAvgHitCount","AuctionHitCountAvgRatio","BestOffer","AuctionCount","AuctionSaleCount","AuctionMedianPrice"]

### Parse out train data

In [6]:
import math
dayDict = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}

In [7]:
X = train_df[features]
y = train_df["QuantitySold"]

### Build model

#### Random Forest

In [10]:
from sklearn import ensemble

In [11]:
rf = ensemble.RandomForestClassifier(n_estimators=10)

In [12]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

#### KNN

In [13]:
from sklearn import neighbors

In [14]:
knn = neighbors.KNeighborsClassifier(5)

In [15]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

#### Decision Trees

In [16]:
from sklearn import tree

In [17]:
dt = tree.DecisionTreeClassifier()

In [18]:
dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

#### Gaussian Naive Bayes

In [19]:
from sklearn import naive_bayes

In [21]:
gnb = naive_bayes.GaussianNB()

In [22]:
gnb.fit(X, y)

GaussianNB(priors=None)

#### ADA Boost

In [23]:
ada = ensemble.AdaBoostClassifier()

In [24]:
ada.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

#### MLP

In [25]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X)  
X_scaler = scaler.transform(X)  

In [26]:
from sklearn import neural_network

In [27]:
mlp = neural_network.MLPClassifier(solver='lbfgs')

In [28]:
mlp.fit(X_scaler, y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

#### Gradient Boosting Classifiers

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)

In [49]:
gbc.fit(X_scaler, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

#### Voting Classifier

In [52]:
from sklearn.ensemble import VotingClassifier

In [53]:
vc = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('ada', ada), ('mlp', mlp), ('gbc', gbc)], voting='hard')

In [55]:
vc.fit(X_scaler, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...ors=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)

In [58]:
vc_soft = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('ada', ada), ('mlp', mlp), ('gbc', gbc)], voting='soft')

In [59]:
vc_soft.fit(X_scaler, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...ors=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None)

### Parse Test Data

In [29]:
test_df["EndDay"] = test_df["EndDay"].map(lambda d: dayDict[d])

In [30]:
X_test = test_df[features]
y_test = test_df["QuantitySold"]

In [31]:
from sklearn import metrics

#### Random Forest


In [32]:
rf_predicted = rf.predict(X_test)

In [33]:
print(metrics.classification_report(rf_predicted, y_test))

             precision    recall  f1-score   support

          0       0.91      0.93      0.92     27539
          1       0.80      0.76      0.78      9921

avg / total       0.88      0.88      0.88     37460



#### KNN

In [34]:
knn_predicted = knn.predict(X_test)

In [35]:
print(metrics.classification_report(knn_predicted, y_test))

             precision    recall  f1-score   support

          0       0.74      0.91      0.82     22926
          1       0.78      0.50      0.61     14534

avg / total       0.76      0.75      0.74     37460



#### Decision Tree

In [36]:
dt_predicted = dt.predict(X_test)

In [37]:
print(metrics.classification_report(dt_predicted, y_test))

             precision    recall  f1-score   support

          0       0.84      0.92      0.88     25439
          1       0.80      0.62      0.70     12021

avg / total       0.83      0.83      0.82     37460



#### Gaussian Naive Bayes

In [38]:
gnb_predicted = gnb.predict(X_test)

In [39]:
print(metrics.classification_report(gnb_predicted, y_test))

             precision    recall  f1-score   support

          0       0.19      0.97      0.32      5420
          1       0.99      0.29      0.45     32040

avg / total       0.87      0.39      0.43     37460



#### ADA Boost


In [40]:
ada_predicted = ada.predict(X_test)

In [41]:
print(metrics.classification_report(ada_predicted, y_test))

             precision    recall  f1-score   support

          0       0.91      0.93      0.92     27387
          1       0.81      0.75      0.78     10073

avg / total       0.88      0.89      0.88     37460



#### MLP

In [42]:
X_test_scaler = scaler.transform(X_test)

In [43]:
mlp_predicted = mlp.predict(X_test_scaler)

In [44]:
print(metrics.classification_report(mlp_predicted, y_test))

             precision    recall  f1-score   support

          0       0.92      0.94      0.93     27666
          1       0.82      0.78      0.80      9794

avg / total       0.90      0.90      0.90     37460



#### Gradient Boosting Classifiers

In [50]:
gbc_predicted = gbc.predict(X_test_scaler)

In [51]:
print(metrics.classification_report(gbc_predicted, y_test))

             precision    recall  f1-score   support

          0       0.92      0.93      0.93     27610
          1       0.81      0.77      0.79      9850

avg / total       0.89      0.89      0.89     37460



#### Voting Classifier

In [56]:
vc_predicted = vc.predict(X_test_scaler)

In [57]:
print(metrics.classification_report(vc_predicted, y_test))

             precision    recall  f1-score   support

          0       0.92      0.94      0.93     27729
          1       0.81      0.78      0.80      9731

avg / total       0.90      0.90      0.90     37460



In [60]:
vc_soft_predicted = vc_soft.predict(X_test_scaler)

In [61]:
print(metrics.classification_report(vc_soft_predicted, y_test))

             precision    recall  f1-score   support

          0       0.91      0.94      0.92     27200
          1       0.82      0.75      0.79     10260

avg / total       0.89      0.89      0.89     37460

