In [1]:
import pandas as pd
import scipy.sparse
import scipy
import scipy.io
import numpy as np
from sklearn.decomposition import PCA
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
sns.set()

In [2]:
# Load the cleaned training data (tab-separated, column names on the first row).
train_clean = pd.read_csv('Data/train_clean.tsv', sep='\t', header=0)

# test.tsv has no price labels to score against, so the rows of train_clean
# after the first 1,000,000 are held out as a labelled evaluation set.
n_base_rows = 1000000
train_base = train_clean.iloc[:n_base_rows]
test_base = train_clean.iloc[n_base_rows:]

In [3]:
# Baseline prediction table: the mean price of each category_name_1 group in train_base.
mean_values = train_base.groupby('category_name_1')['price'].mean()

# Turn the Series into a two-column frame (category_name_1, prediction) so it
# can be merged onto other frames by category.
df_means = mean_values.reset_index().rename(columns={'price': 'prediction'})



In [4]:
# Disabled scratch work (nothing in this cell executes): reading the unlabelled
# Data/test.tsv and splitting its 'category_name' column on '/' into up to
# three columns named category_1, category_2 and category_3.
#testFile = pd.read_csv('Data/test.tsv', sep='\t', header=0)

#split by category_name
# Generic pattern the split below was adapted from:
#df.join(df['AB'].str.split('-', 1, expand=True).rename(columns={0:'A', 1:'B'}))
#testFile['category_name'].str.split('/',2, expand=True).rename(columns={0:'category_1', 1:'category_2', 2:'category_3'})

In [5]:
# Read the pruned term matrix from its Matrix Market file.
term_matrix_path = 'Data/train_term_matrix_pruned.mtx'
train_sparse = scipy.io.mmread(term_matrix_path)

# Regression target: the price column of the full cleaned training frame.
# NOTE(review): presumably the matrix rows are in the same order as train_clean - confirm.
y = train_clean['price']

In [6]:
# Subsample rows (with their matching prices) to reduce the size of the data.
# replace=False draws every row at most once. resample() bootstraps by default
# (replace=True), which duplicates rows; copies of the same row could then end
# up on both sides of the later train/test split and leak into the test score.
# Without replacement, numSamples must not exceed the number of available rows.
numSamples = 100000
train_sparse_sample, y_sample = resample(train_sparse, y,
                                         replace=False,
                                         n_samples=numSamples,
                                         random_state=777)

In [7]:
# Sanity check on the sample size. Despite the 'sparse shape' label, the value
# shown is the shape of the sampled target vector y_sample.
shape_label = 'sparse shape'
print(shape_label, y_sample.shape)

sparse shape (100000,)


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the sampled rows for evaluation; the fixed seed keeps the
# split identical between runs.
split_parts = train_test_split(
    train_sparse_sample,
    y_sample,
    test_size=0.33,
    random_state=95828,
)
XTrain, XTest, yTrain, yTest = split_parts

## Baseline Model - Use average of category_name_1 products

In [9]:
from sklearn.metrics import mean_squared_error

# Attach each held-out row's category mean as its 'prediction'.
# The left merge keeps every row of test_base.
test_base_pred = test_base.merge(df_means, how='left', on='category_name_1')

# Drop rows that cannot be scored: rows with no category_name_1, and rows whose
# category has no entry in df_means. The merge leaves 'prediction' as NaN for
# the latter, and mean_squared_error raises on NaN input.
test_base_pred = test_base_pred.dropna(subset=['category_name_1', 'prediction'])

mse_base = mean_squared_error(test_base_pred['price'], test_base_pred['prediction'])
print('Baseline model MSE = {:.2f}'.format(mse_base))


Baseline model MSE = 1450.86


## Random Forest Regression

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Collects [model name, rounded test MSE] pairs; the later model cells append to it.
model_mse = []

# Random forest with default hyper-parameters and a fixed seed.
rf = RandomForestRegressor(random_state=0)
rf.fit(XTrain, yTrain)
pred_rf = rf.predict(XTest)

mse_rf = mean_squared_error(yTest, pred_rf)
print('RandomForest Regressor: MSE = {:.2f}'.format(mse_rf))
rf_entry = ['RandomForest', round(mse_rf, 2)]
model_mse.append(rf_entry)

# TODO: add a grid search over max_depth. Sketch below; `treeReg` is not
# defined in the cells shown here and would need to be created first.
# params_maxDepth = {'max_depth':[2,5,10,15,20]}
# clf = GridSearchCV(treeReg, params_maxDepth)
# clf.fit(XTrain, yTrain)

In [None]:
# Predicted vs. true price for the random forest; only the upper limit of each
# axis is set (500), the lower limits are left to matplotlib.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(yTest, pred_rf, c="g")
ax.set_xlabel("True Price")
ax.set_ylabel("Prediction Price")
ax.set_xlim(right=500)
ax.set_ylim(top=500)
ax.set_title("RandomForest Regression: Prediction vs True Prices");

## AdaBoost Regression

In [None]:
%%time
from sklearn.ensemble import AdaBoostRegressor
# Imported here as well so this cell does not depend on another cell's import.
from sklearn.tree import DecisionTreeRegressor

# AdaBoost over decision-tree regressors with default settings and a fixed seed.
ada = AdaBoostRegressor(DecisionTreeRegressor(), random_state=100)
ada.fit(XTrain, yTrain)
pred_ada = ada.predict(XTest)

# Score on the held-out split and record the result next to the other models.
mse_ada = mean_squared_error(yTest, pred_ada)
print('AdaBoost Regression: MSE = {:.2f}'.format(mse_ada))
model_mse.append(['AdaBoost', round(mse_ada,2)])

In [None]:
# Predicted vs. true price for AdaBoost; only the upper limit of each axis is
# set (500), the lower limits are left to matplotlib.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(yTest, pred_ada, c="r")
ax.set_xlabel("True Price")
ax.set_ylabel("Prediction Price")
ax.set_xlim(right=500)
ax.set_ylim(top=500)
ax.set_title("AdaBoost Regression: Prediction vs True Prices");

## Ridge Regression

In [None]:
%%time
from sklearn.linear_model import Ridge

# Ridge regression with alpha fixed at 1.0 (grid search still to be added).
ridge = Ridge(alpha=1.0)
ridge.fit(XTrain, yTrain)
pred_ridge = ridge.predict(XTest)

# Score on the held-out split and record the result next to the other models.
mse_ridge = mean_squared_error(yTest, pred_ridge)
print('Ridge Regression: MSE = {:.2f}'.format(mse_ridge))
ridge_entry = ['Ridge', round(mse_ridge, 2)]
model_mse.append(ridge_entry)

In [None]:
# Predicted vs. true price for ridge regression; only the upper limit of each
# axis is set (500), the lower limits are left to matplotlib.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(yTest, pred_ridge, c="c")
ax.set_xlabel("True Price")
ax.set_ylabel("Prediction Price")
ax.set_xlim(right=500)
ax.set_ylim(top=500)
ax.set_title("Ridge Regression: Prediction vs True Prices");

## Lasso Regression

In [None]:
%%time
from sklearn.linear_model import Lasso

# Lasso regression with alpha fixed at 1.0 (grid search still to be added).
lasso = Lasso(alpha=1.0)
lasso.fit(XTrain, yTrain)
pred_lasso = lasso.predict(XTest)

# Score on the held-out split and record the result next to the other models.
mse_lasso = mean_squared_error(yTest, pred_lasso)
print('Lasso Regression: MSE = {:.2f}'.format(mse_lasso))
lasso_entry = ['Lasso', round(mse_lasso, 2)]
model_mse.append(lasso_entry)

In [None]:
# Predicted vs. true price for lasso regression; only the upper limit of each
# axis is set (500), the lower limits are left to matplotlib.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(yTest, pred_lasso, c="m")
ax.set_xlabel("True Price")
ax.set_ylabel("Prediction Price")
ax.set_xlim(right=500)
ax.set_ylim(top=500)
ax.set_title("Lasso Regression: Prediction vs True Prices");

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regression with scikit-learn's default settings.
# This cell previously re-ran the Lasso fit and appended a second 'Lasso' row
# to model_mse; it now fits the estimator it imports.
knn = KNeighborsRegressor()
knn.fit(XTrain, yTrain)
pred_knn = knn.predict(XTest)

# Score on the held-out split and record the result next to the other models.
mse_knn = mean_squared_error(yTest, pred_knn)
print('KNeighbors Regression: MSE = {:.2f}'.format(mse_knn))
model_mse.append(['KNeighbors', round(mse_knn,2)])


## Regression Models that take sparse matrices

BaggingRegressor
ExtraTreesRegressor
KNeighborsRegressor
SGDRegressor

ElasticNet
PassiveAggressiveRegressor
RANSACRegressor
NuSVR
LinearSVR
RadiusNeighborsRegressor
MultiOutputRegressor
MLPRegressor