In [92]:
# imports
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [93]:
# reading in the final grailed data
df = pd.read_csv('../Data/Modeling_Data/modeling_data.csv')

In [94]:
# dropping the columns that wont be used in modeling
df = df.drop(columns=['username', 'size_color_cond', 'link'])

In [95]:
df.head()

Unnamed: 0,sold_price,designer,category,description,sub_title,image_count,feedback_count,size,color,condition
0,110.0,Acne Studios,Shorts,Ryder Bermuda wool/mohair blended shorts from ...,Ryder Bermuda Wool Shorts,5,12.0,US 28 / EU 44,Black,New
1,155.0,Acne Studios,Casual Pants,"Size 46, Asia fitting, Waist- around 31-32 Len...",Jager Cord trousers,10,9.0,US 31,Brown,Gently used
2,635.0,Acne Studios,Heavy Coats,- Brand name: Acne Studios - Item name: SS15 D...,Acne Studios SS15 Dark Grey Charlie Jacket,4,97.0,US S / EU 44-46 / 1,Dark grey,Gently used
3,85.0,Acne Studios,Blazers,Great Condition Orange Blazer,Acne Studios Noel Blazer 40R,6,195.0,40R,Orange,Gently used
4,120.0,Acne Studios,Sweaters & Knitwear,Marked as a large fits more like a medium or m...,Black knit Bill o reverse,3,6.0,US M / EU 48-50 / 2,Black,Used


# Changing condition column to a numeric

1. New = 0
2. Gently used = 1
3. Used = 2
4. Worn = 3

In [96]:
cond_dict = {'New':0,'Gently used':1,'Used':2,'Worn':3}

In [97]:
df['condition'] = df['condition'].map(cond_dict)

In [98]:
df.dropna(inplace=True)

# Dummify Designer, Category, Size, and Color

In [99]:
df =pd.get_dummies(df, columns=['designer','category', 'size', 'color'], drop_first=True)

In [100]:
df.head(3)

Unnamed: 0,sold_price,description,sub_title,image_count,feedback_count,condition,designer_(B).Stroy,designer_.925 Silver Produced In Italy,designer_032c,designer_1,...,color_Tan,color_Teal,color_Tie dye,color_Tye dye,color_Washed black,color_Whie,color_White,color_White gold,color_Yellow,color_Yellow gold
0,110.0,Ryder Bermuda wool/mohair blended shorts from ...,Ryder Bermuda Wool Shorts,5,12.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,155.0,"Size 46, Asia fitting, Waist- around 31-32 Len...",Jager Cord trousers,10,9.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,635.0,- Brand name: Acne Studios - Item name: SS15 D...,Acne Studios SS15 Dark Grey Charlie Jacket,4,97.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,85.0,Great Condition Orange Blazer,Acne Studios Noel Blazer 40R,6,195.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,120.0,Marked as a large fits more like a medium or m...,Black knit Bill o reverse,3,6.0,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train test split

In [106]:
# seperating into indepedent and dependent variables
X = df.drop(columns=['sold_price', 'description', 'sub_title'])
y = df['sold_price']

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

exporting for webapp (prob remove

In [108]:
df.to_csv('X_data.csv', index=False)


In [109]:
X_train.columns

Index(['image_count', 'feedback_count', 'condition', 'designer_(B).Stroy',
       'designer_.925 Silver Produced In Italy', 'designer_032c', 'designer_1',
       'designer_10 Deep', 'designer_100 Thieves', 'designer_1017 ALYX 9SM',
       ...
       'color_Tan', 'color_Teal', 'color_Tie dye', 'color_Tye dye',
       'color_Washed black', 'color_Whie', 'color_White', 'color_White gold',
       'color_Yellow', 'color_Yellow gold'],
      dtype='object', length=1121)

# Decision Tree Model

In [110]:
# intiatiating and traing the model
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

DecisionTreeRegressor()

In [113]:
# cross val of MSE
scores = cross_val_score(dt, X, y, cv=5, scoring='neg_mean_squared_error')
scores.mean()

-14727.39755551584

In [111]:
# r2 training score
dt.score(X_train, y_train)

0.9955128497238886

In [112]:
# rs testing score
dt.score(X_test, y_test)

0.07834727875169678

# Random Forest Model

In [147]:
# intiatiating and traing the model
rf = RandomForestRegressor(n_estimators = 20, max_depth = 100)
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=100, n_estimators=20)

In [148]:
# cross val of MSE
scores = cross_val_score(rf, X, y, cv=5, scoring='neg_mean_squared_error')
scores.mean()

-10825.497567367976

In [149]:
# r2 training score
rf.score(X_train, y_train)

0.889445721589773

In [150]:
# r2 testing score
rf.score(X_test, y_test)

0.4200986710193725

# Adaboost model

In [131]:
# intiatiating and traing the model
ada = AdaBoostRegressor(n_estimators=50)
ada.fit(X_train, y_train)

AdaBoostRegressor()

In [132]:
# cross val of MSE
scores = cross_val_score(ada, X_test, y_test, cv=5, scoring='neg_mean_squared_error')
scores.mean()

-21679.46587290589

In [133]:
# r2 training score
ada.score(X_train, y_train)

-0.6764030205059925

In [134]:
# r2 testing score
ada.score(X_test, y_test)

-0.7309151867314589

# Gradient boost model

In [139]:
# intiatiating and traing the model
gb = GradientBoostingRegressor(n_estimators=10,max_depth=100)
gb.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=100, n_estimators=10)

In [140]:
# cross val of MSE
scores = cross_val_score(ada, X_test, y_test, cv=5, scoring='neg_mean_squared_error')
scores.mean()

-27879.025404805987

In [141]:
# r2 training score
gb.score(X_train, y_train)

0.8532445450131451

In [142]:
# r2 testing score
gb.score(X_test, y_test)

0.3231672567627303

# Exporting the models

In [151]:
with open('grailed_model_rf.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [153]:
with open('grailed_model_dt.pkl', 'wb') as file:
    pickle.dump(dt, file)

In [154]:
with open('grailed_model_ada.pkl', 'wb') as file:
    pickle.dump(ada, file)

In [155]:
with open('grailed_model_gb.pkl', 'wb') as file:
    pickle.dump(gb, file)

In [152]:
model_columns = list(X.columns)
with open('model_columns.pkl', 'wb') as file:
    pickle.dump(model_columns, file)