In [1]:
import sklearn
import numpy as np
import pandas as pd
import collections

### Import Train and Test data

In [2]:
train_file = "data/TrainingSet.csv"
test_file = "data/TestSet.csv"

In [3]:
train_df = pd.read_csv(train_file, header = 0)
test_df = pd.read_csv(test_file, header = 0)

In [4]:
# hand picked features
# features = ["AuctionMedianPrice", "Price","PricePercent","StartingBidPercent","SellerClosePercent","StartingBid","AvgPrice","HitCount","AuctionAvgHitCount","ItemAuctionSellPercent","SellerSaleAvgPriceRatio","AuctionHitCountAvgRatio","BestOffer", "IsHOF","ItemListedCount","AuctionCount","AuctionSaleCount","SellerAuctionCount","SellerAuctionSaleCount"]
# features = ["AuctionMedianPrice", "Price", "AvgPrice", "ItemAuctionSellPercent", "StartingBidPercent", "StartingBid", "AuctionHitCountAvgRatio", "SellerSaleAvgPriceRatio", "IsHOF", "AuctionCount", "SellerAuctionSaleCount"]

# almost all features
features = ["Price","PricePercent","StartingBidPercent","SellerClosePercent","Category","StartingBid","AvgPrice","EndDay","HitCount","AuctionAvgHitCount","AuctionHitCountAvgRatio","BestOffer","AuctionCount","AuctionSaleCount","AuctionMedianPrice"]

# no endday
# features = ["Price","PricePercent","StartingBidPercent","SellerClosePercent","Category","StartingBid","AvgPrice","HitCount","AuctionAvgHitCount","AuctionHitCountAvgRatio","BestOffer","AuctionCount","AuctionSaleCount","AuctionMedianPrice"]

# took out a lot of features
# features = ["Price","StartingBid","AvgPrice","EndDay","HitCount","AuctionAvgHitCount","AuctionHitCountAvgRatio","BestOffer","AuctionCount","AuctionSaleCount","AuctionMedianPrice"]

In [5]:
len(test_df)
yes_test = test_df[test_df.QuantitySold == 1]
len(yes_test)

9399

### Parse out train data

In [6]:
import math
# dayDict = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}
dayDict = {"Monday": math.sin(0), "Tuesday": math.sin(1), "Wednesday": math.sin(2), "Thursday": math.sin(3), "Friday": math.sin(4), "Saturday": math.sin(5), "Sunday": math.sin(6)}
train_df["EndDay"] = train_df["EndDay"].map(lambda d: dayDict[d])

In [7]:
sold = train_df[train_df.QuantitySold == 1]
not_sold = train_df[train_df.QuantitySold == 0]

In [8]:
len(sold), len(not_sold)

(79755, 178833)

In [9]:
even_train_df = pd.concat([sold, not_sold[:len(sold)]])
even_train_df.head()

Unnamed: 0,EbayID,QuantitySold,Price,PricePercent,StartingBidPercent,SellerName,SellerClosePercent,Category,PersonID,StartingBid,...,AuctionHitCountAvgRatio,BestOffer,ReturnsAccepted,IsHOF,ItemListedCount,AuctionCount,AuctionSaleCount,SellerAuctionCount,SellerAuctionSaleCount,AuctionMedianPrice
0,160983189073,1,27.25,1.4787,0.0537,petesandi,0.925926,73396,9174,0.99,...,29,1.540541,0,1,1,583,291,54,50,12.26
1,390549601720,1,43.0,1.4422,0.335,graphn4fun,0.990566,27278,11046,9.99,...,55,0.404762,0,0,0,120,43,106,105,24.45
2,200901284676,1,11.0,0.5969,0.5426,hondo19461946,0.422131,73396,9174,10.0,...,21,0.324324,0,1,1,583,291,244,103,12.26
5,390551441929,1,33.01,0.8372,0.0251,firstclassautograph,0.664516,27276,22954,0.99,...,114,0.256098,0,1,0,47,23,155,103,41.0
9,121077510308,1,180.0,3.483,3.483,emmyjojo200011,0.333333,27278,20913,180.0,...,52,0.183333,0,0,1,1586,447,3,1,36.0


In [10]:
X = even_train_df[features]
y = even_train_df["QuantitySold"]

### Build model

#### SVM

In [None]:
from sklearn import svm

In [None]:
clf = svm.SVC(kernel="linear", C=0.025)

In [None]:
clf.fit(X, y)

#### Random Forest

In [14]:
from sklearn import ensemble

In [15]:
rf = ensemble.RandomForestClassifier(n_estimators=10)

In [16]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

#### KNN

In [11]:
from sklearn import neighbors

In [12]:
knn = neighbors.KNeighborsClassifier(5)

In [13]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

#### Decision Trees

In [17]:
from sklearn import tree

In [18]:
dt = tree.DecisionTreeClassifier()

In [19]:
dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

#### Gaussian Naive Bayes

In [20]:
from sklearn import naive_bayes

In [21]:
gnb = naive_bayes.GaussianNB()

In [22]:
gnb.fit(X, y)

GaussianNB(priors=None)

#### ADA Boost

In [23]:
ada = ensemble.AdaBoostClassifier()

In [24]:
ada.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

#### MLP

In [25]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X)  
X_scaler = scaler.transform(X)  

In [26]:
from sklearn import neural_network

In [27]:
mlp = neural_network.MLPClassifier(alpha=1)

In [28]:
mlp.fit(X, y)

MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

### Parse Test Data

In [29]:
test_df["EndDay"] = test_df["EndDay"].map(lambda d: dayDict[d])

In [30]:
X_test = test_df[features]
y_test = test_df["QuantitySold"]

In [31]:
from sklearn import metrics

#### SVM Results

#### Random Forest

In [32]:
rf_predicted = rf.predict(X_test)

In [33]:
print(metrics.classification_report(rf_predicted, y_test))

             precision    recall  f1-score   support

          0       0.91      0.93      0.92     27573
          1       0.80      0.76      0.78      9887

avg / total       0.88      0.88      0.88     37460



#### KNN

In [34]:
knn_predicted = knn.predict(X_test)

In [35]:
print(metrics.classification_report(knn_predicted, y_test))

             precision    recall  f1-score   support

          0       0.74      0.91      0.82     22843
          1       0.78      0.50      0.61     14617

avg / total       0.75      0.75      0.74     37460



#### Decision Tree

In [36]:
dt_predicted = dt.predict(X_test)

In [37]:
print(metrics.classification_report(dt_predicted, y_test))

             precision    recall  f1-score   support

          0       0.84      0.93      0.88     25462
          1       0.80      0.63      0.70     11998

avg / total       0.83      0.83      0.82     37460



#### Gaussian Naive Bayes

In [38]:
gnb_predicted = gnb.predict(X_test)

In [39]:
print(metrics.classification_report(gnb_predicted, y_test))

             precision    recall  f1-score   support

          0       0.19      0.97      0.32      5419
          1       0.99      0.29      0.45     32041

avg / total       0.87      0.39      0.43     37460



#### ADA Boost

In [40]:
ada_predicted = ada.predict(X_test)

In [41]:
print(metrics.classification_report(ada_predicted, y_test))

             precision    recall  f1-score   support

          0       0.91      0.93      0.92     27387
          1       0.81      0.75      0.78     10073

avg / total       0.88      0.89      0.88     37460



#### MLP

In [42]:
X_test_scaler = scaler.transform(X_test)

In [43]:
mlp_predicted = mlp.predict(X_test_scaler)

In [44]:
print(metrics.classification_report(mlp_predicted, y_test))

             precision    recall  f1-score   support

          0       0.99      0.84      0.91     33124
          1       0.44      0.96      0.61      4336

avg / total       0.93      0.86      0.88     37460

