In [1]:
#Import libraries
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

In [49]:
#Load the dataset
TrainingSubset = pd.read_csv('TrainingSubset.csv')
TestSubset = pd.read_csv('TestSubset.csv')

In [50]:
#Prints the name of features in the dataset
TrainingSubset.keys()

Index(['EbayID', 'Price', 'PricePercent', 'StartingBidPercent', 'SellerName',
       'SellerClosePercent', 'Category', 'PersonID', 'StartingBid', 'AvgPrice',
       'EndDay', 'QuantitySold', 'HitCount', 'AuctionAvgHitCount',
       'Authenticated', 'ItemAuctionSellPercent', 'SellerSaleAvgPriceRatio',
       'SellerAvg', 'SellerItemAvg', 'AuctionHitCountAvgRatio', 'BestOffer',
       'ReturnsAccepted', 'IsHOF', 'BidCount', 'AuctionCount',
       'AuctionSaleCount', 'SellerAuctionCount', 'SellerAuctionSaleCount',
       'PriceBuckets', 'AuctionMedianPrice', 'IsInMedianRatio10Percent',
       'IsInMedianRatio20Percent', 'IsInMedianRatio25Percent'],
      dtype='object')

In [51]:
#Preprocessing Step 1:
#Check if Training and test set has any null values or not
print(TrainingSubset.isnull().values.any())
print(TestSubset.isnull().values.any())

False
False


In [52]:
#Apply correlation to data to check relation between different columns
correlation = TrainingSubset.corr()
correlation

Unnamed: 0,EbayID,Price,PricePercent,StartingBidPercent,SellerClosePercent,Category,PersonID,StartingBid,AvgPrice,QuantitySold,...,BidCount,AuctionCount,AuctionSaleCount,SellerAuctionCount,SellerAuctionSaleCount,PriceBuckets,AuctionMedianPrice,IsInMedianRatio10Percent,IsInMedianRatio20Percent,IsInMedianRatio25Percent
EbayID,1.0,-0.036338,-0.008785,-0.049561,0.059759,0.059237,-0.009886,-0.094197,-0.043759,,...,0.011477,-0.048974,-0.053145,0.21209,0.284548,-0.032968,-0.049247,0.045638,0.038959,0.032989
Price,-0.036338,1.0,0.357756,0.009331,0.176138,-0.293678,-0.003798,0.502391,0.54513,,...,0.555265,0.327298,0.38764,-0.108176,-0.051832,0.999384,0.738861,-0.095621,-0.050834,-0.03161
PricePercent,-0.008785,0.357756,1.0,0.548447,0.006574,0.030201,-0.011015,0.228175,-0.047523,,...,0.1774,0.009407,0.019033,-0.014609,-0.005667,0.354421,-0.023523,-0.086084,-0.109955,-0.115943
StartingBidPercent,-0.049561,0.009331,0.548447,1.0,-0.400588,0.092573,0.00112,0.473224,-0.150035,,...,-0.387568,-0.122299,-0.137124,0.032337,-0.106317,0.000567,-0.174997,0.029182,-0.014494,-0.027047
SellerClosePercent,0.059759,0.176138,0.006574,-0.400588,1.0,-0.125167,-0.018555,-0.220318,0.126039,,...,0.43525,0.115372,0.134065,-0.147609,0.182871,0.18307,0.166965,-0.061136,-0.019776,-0.008131
Category,0.059237,-0.293678,0.030201,0.092573,-0.125167,1.0,0.025451,-0.170102,-0.239251,,...,-0.239812,-0.250459,-0.2038,0.088443,0.051117,-0.289436,-0.321772,0.033264,-0.015125,-0.034944
PersonID,-0.009886,-0.003798,-0.011015,0.00112,-0.018555,0.025451,1.0,0.007768,0.001711,,...,-0.009943,-0.111589,-0.097379,-0.012452,-0.035363,-0.004331,0.001896,0.00674,0.001367,0.001436
StartingBid,-0.094197,0.502391,0.228175,0.473224,-0.220318,-0.170102,0.007768,1.0,0.270714,,...,-0.155127,0.150264,0.164567,-0.081156,-0.151812,0.49507,0.356913,-0.065021,-0.055186,-0.048291
AvgPrice,-0.043759,0.54513,-0.047523,-0.150035,0.126039,-0.239251,0.001711,0.270714,1.0,,...,0.343542,0.244002,0.290694,-0.089905,-0.063089,0.544804,0.72147,-0.056497,-0.0148,-0.000746
QuantitySold,,,,,,,,,,,...,,,,,,,,,,


In [53]:
#Training data drop QuantitySold, SellerName,EndDay  column
trainData = TrainingSubset.drop(['Price','SellerName','EndDay'],axis=1)
testData = TestSubset.drop(['Price','SellerName','EndDay'],axis=1)
#Target Data has QuantitySold column
targetTrainData = TrainingSubset['Price'] 
targetTestData = TestSubset['Price'] 

In [54]:
#Preprocessing Step 2
#Standardize and scale the data
from sklearn import preprocessing
trainDataScale = preprocessing.scale(trainData)
testDataScale = preprocessing.scale(testData)

  after removing the cwd from sys.path.
  """


In [55]:
#Preprocessing Step 3
#Apply lasso to see the important features
from sklearn import linear_model
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(trainDataScale,targetTrainData)
print(lasso.coef_)
print(trainData.keys())

[-7.63145752e-03  4.30096516e-02  1.56838553e-01 -6.16585974e-02
 -1.43837032e-01  0.00000000e+00  2.57748797e-01  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.97806972e-03  2.07251092e-02  5.56286524e-03  3.66406078e-02
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  1.21148921e-03
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -2.73656548e-02  4.52550020e+01  8.39341767e-02 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00]
Index(['EbayID', 'PricePercent', 'StartingBidPercent', 'SellerClosePercent',
       'Category', 'PersonID', 'StartingBid', 'AvgPrice', 'QuantitySold',
       'HitCount', 'AuctionAvgHitCount', 'Authenticated',
       'ItemAuctionSellPercent', 'SellerSaleAvgPriceRatio', 'SellerAvg',
       'SellerItemAvg', 'AuctionHitCountAvgRatio', 'BestOffer',
       'ReturnsAccepted', 'IsHOF', 'BidCount', 'AuctionCount',
       'AuctionSaleCount', 'SellerAuctionCount', 'SellerAuctionSaleCount',
       'PriceBuckets', 'Auc

In [56]:
#Choose the important features only i.e. 'SellerClosePercent','HitCount', 'BestOffer' and again standardize
trainData=TrainingSubset[['EbayID', 'PricePercent', 'StartingBidPercent', 'SellerClosePercent',
       'Category','StartingBid','ItemAuctionSellPercent', 'SellerSaleAvgPriceRatio', 'SellerAvg',
       'SellerItemAvg','IsHOF','SellerAuctionSaleCount',
       'PriceBuckets', 'AuctionMedianPrice']]
testData=TestSubset[['EbayID', 'PricePercent', 'StartingBidPercent', 'SellerClosePercent',
       'Category','StartingBid','ItemAuctionSellPercent', 'SellerSaleAvgPriceRatio', 'SellerAvg',
       'SellerItemAvg','IsHOF','SellerAuctionSaleCount',
       'PriceBuckets', 'AuctionMedianPrice']]
trainDataScale = preprocessing.scale(trainData)
testDataScale = preprocessing.scale(testData)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [57]:
#Do Regression using various models
print('Regression Techniques :')
#Regression Technique 1
print('Regression Technique 1 : ')
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
#Create linear regression object
LinearRegression = linear_model.LinearRegression()
#Train the model 
LinearRegression.fit(trainDataScale,targetTrainData)
#Predict on test data
PredictLinearRegression = LinearRegression.predict(testDataScale)
#LinearRegression coefficients
print('LinearRegression Coefficients: \n', LinearRegression.coef_)
#LinearRegression mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(targetTestData, PredictLinearRegression))
# PredictLinearRegressionExplained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(targetTestData,PredictLinearRegression))
#PredictLinearRegression

Regression Techniques :
Regression Technique 1 : 
LinearRegression Coefficients: 
 [-7.68192129e-02  1.03767765e-01  2.09450535e-01 -1.86675270e-01
 -2.30678899e-01  2.22106796e-01  1.02596747e-01  8.77917332e-02
 -1.89775115e-02  8.74316688e-02  8.60259465e-02 -5.91644485e-02
  4.52264401e+01  1.47305810e-01]
Mean squared error: 33.76
Variance score: 0.98
