In [88]:
import sklearn
import numpy as np
import pandas as pd
import collections

In [89]:
train_file = "data/TrainingSet.csv"
test_file = "data/TestSet.csv"

In [90]:
train_df = pd.read_csv(train_file, header = 0)
test_df = pd.read_csv(test_file, header = 0)

In [91]:
features = ["SellerClosePercent","Category","StartingBid","EndDay","AvgPrice","HitCount","AuctionAvgHitCount","SellerSaleAvgPriceRatio","SellerAvg","SellerItemAvg","AuctionHitCountAvgRatio","IsHOF","AuctionCount","AuctionSaleCount","SellerAuctionCount","SellerAuctionSaleCount","AuctionMedianPrice"]

### Parse out train data


In [92]:
import math
dayDict = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}
train_df["EndDay"] = train_df["EndDay"].map(lambda d: dayDict[d])

In [93]:
sold_train = train_df[train_df.QuantitySold == 1]

In [94]:
sold_train["Profit"] = sold_train.apply(lambda line: int(line["AvgPrice"] < line["Price"]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [95]:
sold_train["Profit"].head()

0    1
1    1
2    0
5    0
9    1
Name: Profit, dtype: int64

In [96]:
profit_train = sold_train[sold_train.Profit == 1]
loss_train = sold_train[sold_train.Profit == 0]

In [97]:
even_train_df = pd.concat([profit_train, loss_train[:len(profit_train)]])

In [98]:
X = even_train_df[features]
y = even_train_df["Profit"]

### Build model

#### Random Forest

In [99]:
from sklearn import ensemble

In [100]:
rf = ensemble.RandomForestClassifier(n_estimators=10)

In [101]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

#### KNN

In [102]:
from sklearn import neighbors

In [103]:
knn = neighbors.KNeighborsClassifier(5)

In [104]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

#### Decision Tree

In [105]:
from sklearn import tree

In [106]:
dt = tree.DecisionTreeClassifier()

In [107]:
dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

#### Gaussian Naive Bayes

In [108]:
from sklearn import naive_bayes

In [109]:
gnb = naive_bayes.GaussianNB()

In [110]:
gnb.fit(X, y)

GaussianNB(priors=None)

#### ADA Boost

In [111]:
ada = ensemble.AdaBoostClassifier()

In [112]:
ada.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

#### MLP

In [113]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X)  
X_scaler = scaler.transform(X)  

In [114]:
from sklearn import neural_network

In [115]:
mlp = neural_network.MLPClassifier(alpha=1)

In [116]:
mlp.fit(X, y)

MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

#### Gradient Boosting Classifiers

In [135]:
from sklearn.ensemble import GradientBoostingClassifier

In [136]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)

In [137]:
gbc.fit(X_scaler, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

#### Voting Classifier

In [140]:
from sklearn.ensemble import VotingClassifier

In [141]:
vc = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('ada', ada), ('gbc', gbc)], voting='hard')

In [142]:
vc.fit(X_scaler, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...ors=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)

In [145]:
vc_soft = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('ada', ada), ('gbc', gbc)], voting='soft')

In [146]:
vc_soft.fit(X_scaler, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...ors=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None)

In [149]:
vc_gaussian = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('ada', ada), ('gnb', gnb), ('gbc', gbc)], voting='hard')

In [150]:
vc_gaussian.fit(X_scaler, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...ors=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)

### Parse Test Data

In [117]:
test_df["EndDay"] = test_df["EndDay"].map(lambda d: dayDict[d])

In [118]:
test_sold_df = test_df[test_df.QuantitySold == 1]

In [119]:
test_sold_df["Profit"] = test_sold_df.apply(lambda line: int(line["AvgPrice"] < line["Price"]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [120]:
X_test = test_sold_df[features]
y_test = test_sold_df["Profit"]

In [121]:
from sklearn import metrics

#### Random Forest

In [122]:
rf_predicted = rf.predict(X_test)

In [123]:
print(metrics.classification_report(rf_predicted, y_test))

             precision    recall  f1-score   support

          0       0.82      0.93      0.87      6233
          1       0.81      0.59      0.69      3166

avg / total       0.82      0.82      0.81      9399



#### KNN

In [124]:
knn_predicted = knn.predict(X_test)

In [125]:
print(metrics.classification_report(knn_predicted, y_test))

             precision    recall  f1-score   support

          0       0.67      0.87      0.75      5433
          1       0.69      0.41      0.51      3966

avg / total       0.68      0.67      0.65      9399



#### Decision Tree

In [126]:
dt_predicted = dt.predict(X_test)

In [127]:
print(metrics.classification_report(dt_predicted, y_test))

             precision    recall  f1-score   support

          0       0.77      0.93      0.84      5835
          1       0.83      0.54      0.65      3564

avg / total       0.79      0.78      0.77      9399



#### Gaussian Naive Bayes

In [128]:
gnb_predicted = gnb.predict(X_test)

In [129]:
print(metrics.classification_report(gnb_predicted, y_test))

             precision    recall  f1-score   support

          0       0.89      0.80      0.84      7845
          1       0.33      0.49      0.39      1554

avg / total       0.80      0.75      0.77      9399



#### ADA Boost

In [130]:
ada_predicted = ada.predict(X_test)

In [131]:
print(metrics.classification_report(ada_predicted, y_test))

             precision    recall  f1-score   support

          0       0.77      0.91      0.83      5960
          1       0.77      0.52      0.62      3439

avg / total       0.77      0.77      0.75      9399



#### MLP

In [132]:
X_test_scaler = scaler.transform(X_test)

In [133]:
mlp_predicted = mlp.predict(X_test_scaler)

In [134]:
print(metrics.classification_report(mlp_predicted, y_test))

             precision    recall  f1-score   support

          0       0.86      0.83      0.84      7354
          1       0.45      0.51      0.48      2045

avg / total       0.77      0.76      0.76      9399



#### Gradient Boosting Classifiers

In [138]:
gbc_predicted = gbc.predict(X_test_scaler)

In [139]:
print(metrics.classification_report(gbc_predicted, y_test))

             precision    recall  f1-score   support

          0       0.77      0.91      0.84      5988
          1       0.77      0.52      0.62      3411

avg / total       0.77      0.77      0.76      9399



#### Voting Classifier

In [143]:
vc_predicted = vc.predict(X_test_scaler)

In [144]:
print(metrics.classification_report(vc_predicted, y_test))

             precision    recall  f1-score   support

          0       0.85      0.91      0.88      6592
          1       0.75      0.62      0.68      2807

avg / total       0.82      0.82      0.82      9399



In [147]:
vc_soft_predicted = vc_soft.predict(X_test_scaler)

In [148]:
print(metrics.classification_report(vc_soft_predicted, y_test))

             precision    recall  f1-score   support

          0       0.79      0.94      0.86      5933
          1       0.84      0.56      0.67      3466

avg / total       0.81      0.80      0.79      9399



In [151]:
vc_gaussian_predicted = vc_gaussian.predict(X_test_scaler)

In [152]:
print(metrics.classification_report(vc_gaussian_predicted, y_test))

             precision    recall  f1-score   support

          0       0.82      0.92      0.87      6367
          1       0.77      0.59      0.67      3032

avg / total       0.81      0.81      0.80      9399

