In [1]:
import sklearn
import numpy as np
import pandas as pd
import collections

In [2]:
train_file = "data/TrainingSet.csv"
test_file = "data/TestSet.csv"

In [3]:
train_df = pd.read_csv(train_file, header = 0)
test_df = pd.read_csv(test_file, header = 0)

In [4]:
# features = ["SellerClosePercent","Category","StartingBid","EndDay","HitCount","AuctionAvgHitCount","SellerSaleAvgPriceRatio","SellerAvg","SellerItemAvg","AuctionHitCountAvgRatio","IsHOF","AuctionCount","AuctionSaleCount","SellerAuctionCount","SellerAuctionSaleCount","AuctionMedianPrice"]
features = ["SellerClosePercent","Category","StartingBid","EndDay","AvgPrice","HitCount","AuctionAvgHitCount","SellerSaleAvgPriceRatio","SellerAvg","SellerItemAvg","AuctionHitCountAvgRatio","IsHOF","AuctionCount","AuctionSaleCount","SellerAuctionCount","SellerAuctionSaleCount","AuctionMedianPrice"]

### Parse out train data


In [5]:
import math
dayDict = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}
# dayDict = {"Monday": math.sin(0), "Tuesday": math.sin(1), "Wednesday": math.sin(2), "Thursday": math.sin(3), "Friday": math.sin(4), "Saturday": math.sin(5), "Sunday": math.sin(6)}
train_df["EndDay"] = train_df["EndDay"].map(lambda d: dayDict[d])

In [6]:
sold_train = train_df[train_df.QuantitySold == 1]

In [7]:
sold_train["Profit"] = sold_train.apply(lambda line: int(line["AvgPrice"] < line["Price"]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [8]:
sold_train["Profit"].head()

0    1
1    1
2    0
5    0
9    1
Name: Profit, dtype: int64

In [9]:
profit_train = sold_train[sold_train.Profit == 1]
loss_train = sold_train[sold_train.Profit == 0]

In [10]:
even_train_df = pd.concat([profit_train, loss_train[:len(profit_train)]])

In [11]:
X = even_train_df[features]
y = even_train_df["Profit"]

In [50]:
even_train_df.to_csv("even_train.csv")

### Build model

#### Random Forest

In [12]:
from sklearn import ensemble

In [13]:
rf = ensemble.RandomForestClassifier(n_estimators=10)

In [14]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

#### KNN

In [15]:
from sklearn import neighbors

In [16]:
knn = neighbors.KNeighborsClassifier(5)

In [17]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

#### Decision Tree

In [18]:
from sklearn import tree

In [19]:
dt = tree.DecisionTreeClassifier()

In [20]:
dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

#### Gaussian Naive Bayes

In [21]:
from sklearn import naive_bayes

In [22]:
gnb = naive_bayes.GaussianNB()

In [23]:
gnb.fit(X, y)

GaussianNB(priors=None)

#### ADA Boost

In [24]:
ada = ensemble.AdaBoostClassifier()

In [25]:
ada.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

#### MLP

In [26]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X)  
X_scaler = scaler.transform(X)  

In [27]:
from sklearn import neural_network

In [28]:
mlp = neural_network.MLPClassifier(alpha=1)

In [29]:
mlp.fit(X, y)

MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

### Parse Test Data

In [30]:
test_df["EndDay"] = test_df["EndDay"].map(lambda d: dayDict[d])

In [31]:
test_sold_df = test_df[test_df.QuantitySold == 1]

In [32]:
test_sold_df["Profit"] = test_sold_df.apply(lambda line: int(line["AvgPrice"] < line["Price"]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [33]:
X_test = test_sold_df[features]
y_test = test_sold_df["Profit"]

In [34]:
from sklearn import metrics

#### Random Forest

In [35]:
rf_predicted = rf.predict(X_test)

In [36]:
print(metrics.classification_report(rf_predicted, y_test))

             precision    recall  f1-score   support

          0       0.81      0.93      0.87      6185
          1       0.81      0.58      0.68      3214

avg / total       0.81      0.81      0.80      9399



#### KNN

In [37]:
knn_predicted = knn.predict(X_test)

In [38]:
print(metrics.classification_report(knn_predicted, y_test))

             precision    recall  f1-score   support

          0       0.67      0.87      0.75      5433
          1       0.69      0.41      0.51      3966

avg / total       0.68      0.67      0.65      9399



#### Decision Tree

In [39]:
dt_predicted = dt.predict(X_test)

In [40]:
print(metrics.classification_report(dt_predicted, y_test))

             precision    recall  f1-score   support

          0       0.77      0.93      0.84      5852
          1       0.83      0.54      0.65      3547

avg / total       0.79      0.78      0.77      9399



#### Gaussian Naive Bayes

In [41]:
gnb_predicted = gnb.predict(X_test)

In [42]:
print(metrics.classification_report(gnb_predicted, y_test))

             precision    recall  f1-score   support

          0       0.89      0.80      0.84      7845
          1       0.33      0.49      0.39      1554

avg / total       0.80      0.75      0.77      9399



#### ADA Boost

In [43]:
ada_predicted = ada.predict(X_test)

In [44]:
print(metrics.classification_report(ada_predicted, y_test))

             precision    recall  f1-score   support

          0       0.77      0.91      0.83      5960
          1       0.77      0.52      0.62      3439

avg / total       0.77      0.77      0.75      9399



#### MLP

In [45]:
X_test_scaler = scaler.transform(X_test)

In [46]:
mlp_predicted = mlp.predict(X_test_scaler)

In [47]:
print(metrics.classification_report(mlp_predicted, y_test))

             precision    recall  f1-score   support

          0       0.97      0.79      0.87      8748
          1       0.20      0.71      0.31       651

avg / total       0.92      0.78      0.83      9399

