In [8]:
import sklearn
import numpy as np
import pandas as pd
import collections

### Import Train and Test data

In [9]:
train_file = "data/TrainingSet.csv"
test_file = "data/TestSet.csv"

In [10]:
train_df = pd.read_csv(train_file, header = 0)
test_df = pd.read_csv(test_file, header = 0)

In [11]:
features = ["SellerClosePercent","Category","AvgPrice","EndDay","HitCount","AuctionAvgHitCount","SellerSaleAvgPriceRatio","SellerAvg","SellerItemAvg","AuctionHitCountAvgRatio","IsHOF","AuctionCount","AuctionSaleCount","SellerAuctionCount","SellerAuctionSaleCount","AuctionMedianPrice"]

### Parse out train data


In [12]:
import math
dayDict = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}
train_df["EndDay"] = train_df["EndDay"].map(lambda d: dayDict[d])

In [13]:
sold_df = train_df[train_df.QuantitySold == 1]

In [14]:
X = sold_df[features]
y = sold_df["StartingBid"]

### Build model


#### Linear Regression

In [15]:
from sklearn import linear_model

In [16]:
lr = linear_model.LinearRegression()

In [17]:
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Ridge

In [18]:
ridge = linear_model.Ridge (alpha = .5)

In [19]:
ridge.fit(X, y)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### RidgeCV

In [20]:
ridgeCV = linear_model.RidgeCV (alphas=[0.1, 1.0, 10.0])

In [21]:
ridgeCV.fit(X, y)

RidgeCV(alphas=[0.1, 1.0, 10.0], cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)

#### Decision Tree Regression

In [22]:
from sklearn import tree 

In [23]:
dtr = tree.DecisionTreeRegressor()

In [24]:
dtr.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

#### MLPRegressor

In [25]:
from sklearn import neural_network

In [26]:
mlpr = neural_network.MLPRegressor()

In [27]:
mlpr.fit(X, y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

#### KNN Regressor

In [28]:
from sklearn import neighbors

In [29]:
knnr = neighbors.KNeighborsRegressor()

In [30]:
knnr.fit(X, y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

#### GradientBoostingRegressor

In [31]:
from sklearn import ensemble

In [32]:
gbr = ensemble.GradientBoostingRegressor()

In [33]:
gbr.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

#### ADA Boost Regressor

In [34]:
ada = ensemble.AdaBoostRegressor()

In [35]:
ada.fit(X, y)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

#### Bagging Regressor

In [85]:
br = ensemble.BaggingRegressor(n_estimators=15)

In [86]:
br.fit(X, y)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=15, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

#### Random Forest Regressor

In [38]:
from sklearn.ensemble import RandomForestRegressor

In [73]:
rfr = RandomForestRegressor(n_estimators=20, max_features=10)

In [74]:
rfr.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Parse Test Data

In [42]:
test_df["EndDay"] = test_df["EndDay"].map(lambda d: dayDict[d])

In [43]:
sold_test_df = test_df[test_df.QuantitySold == 1]

In [44]:
X_test = sold_test_df[features]
y_test = sold_test_df["StartingBid"]

In [45]:
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

#### Linear Regression

In [46]:
lr_predicted = lr.predict(X_test)

In [47]:
print(metrics.r2_score(lr_predicted, y_test))

-0.767185306395


In [48]:
print(metrics.median_absolute_error(lr_predicted, y_test))

7.46712200185


In [49]:
print(metrics.mean_squared_error(lr_predicted, y_test))

374.559948541


In [50]:
print(metrics.mean_absolute_error(lr_predicted, y_test))

11.2277634175


#### Ridge

In [51]:
ridge_predicted = ridge.predict(X_test)

In [52]:
print(metrics.r2_score(ridge_predicted, y_test))

-0.767236933279


In [53]:
print(metrics.median_absolute_error(ridge_predicted, y_test))

7.46763000763


In [54]:
print(metrics.mean_squared_error(ridge_predicted, y_test))

374.559510372


#### RidgeCV

In [55]:
ridgeCV_predicted = ridgeCV.predict(X_test)

In [56]:
print(metrics.r2_score(ridgeCV_predicted, y_test))

-0.790130899859


In [57]:
print(metrics.median_absolute_error(ridgeCV_predicted, y_test))

7.49506326442


In [58]:
print(metrics.mean_squared_error(ridgeCV_predicted, y_test))

374.425011202


#### Decision Tree Regression

In [59]:
dtr_predicted = dtr.predict(X_test)

In [60]:
print(metrics.r2_score(dtr_predicted, y_test))

0.275081026888


#### MLP Regressor

In [61]:
mlpr_predicted = mlpr.predict(X_test)

In [62]:
print(metrics.r2_score(mlpr_predicted, y_test))

-1.38987182246


#### KNN Regressor

In [63]:
knnr_predicted = knnr.predict(X_test)

In [64]:
print(metrics.r2_score(knnr_predicted, y_test))

-0.0387642648773


#### GradientBoostingRegressor

In [65]:
gbr_predicted = gbr.predict(X_test)

In [66]:
print(metrics.r2_score(gbr_predicted, y_test))

0.267836115032


#### ADA Boost Regressor

In [67]:
ada_predicted = ada.predict(X_test)

In [68]:
print(metrics.r2_score(ada_predicted, y_test))

0.0762133541577


#### Bagging Regressor

In [87]:
br_predicted = br.predict(X_test)

In [88]:
print(metrics.r2_score(br_predicted, y_test))

0.460952854603


#### Random Forest Regressor

In [75]:
rfr_predicted = rfr.predict(X_test)

In [76]:
print(metrics.r2_score(rfr_predicted, y_test))

0.452975462091
