# Rent the Runway: Modeling 

### Import Libraries and Data

In [1]:
#Data cleaning
import numpy as np 
import pandas as pd

#Modeling 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn import svm

from imblearn.over_sampling import SMOTE

np.random.seed(42)

#Warnings - to keep the notebook clean 
import warnings 
warnings.filterwarnings('ignore')
pd.options.display.max_columns= 5000



In [2]:
#Import the cleaned dataframe from the firt notebook 
df = pd.read_csv('./rent_the_runway_cleaned_with_dummies.csv')
df.head()

Unnamed: 0,age,bust_size,fit,height,item_id,rating,review_date,review_summary,review_text,size,user_id,weight,body_type_apple,body_type_athletic,body_type_full_bust,body_type_hourglass,body_type_pear,body_type_petite,body_type_straight_narrow,rented_for_date,rented_for_everyday,rented_for_formal_affair,rented_for_other,rented_for_party,rented_for_vacation,rented_for_wedding,rented_for_work,category_blazer,category_blouse,category_bomber,category_cami,category_cape,category_cardigan,category_coat,category_crewneck,category_culottes,category_down,category_dress,category_duster,category_frock,category_gown,category_henley,category_hoodie,category_jacket,category_jogger,category_jumpsuit,category_kaftan,category_kimono,category_knit,category_leggings,category_maxi,category_midi,category_mini,category_overalls,category_pants,category_parka,category_peacoat,category_poncho,category_print,category_pullover,category_romper,category_sheath,category_shift,category_shirt,category_shirtdress,category_skirt,category_suit,category_sweater,category_sweatpants,category_sweatshirt,category_tank,category_tee,category_tight,category_top,category_trench,category_trousers,category_tunic,category_turtleneck,category_vest
0,25,34c,0,65.0,937638,8,2017-01-29,"Colorful, unique dress for a formal event",I'm glad I got the backup size of 10 because i...,20,402340,145,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,26,32b,1,66.0,604383,10,2016-10-27,Stylist Review,You can't help but feel powerful in this numbe...,24,380920,160,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,38,32a,2,63.0,1505709,10,2017-05-22,"Easy to Wear, not much fuss",This runs big but it is still nice. The mater...,4,234255,108,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,29,34a,1,61.0,127495,10,2014-11-05,Rented for my best friends bachelorette party....,The dress was perfect for fall/ winter weather...,4,763040,110,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,35,38d,1,65.0,1001785,10,2015-09-30,This dress was perfect for Diner en Blanc. It ...,"It was a tad bit snug around the arms, but I h...",32,698342,220,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Pre- Modeling

In [3]:
#Gather all numeric columns excluding fit 
numeric_cols = df.select_dtypes(include=['int64', 'float64']).drop(columns= 'fit')

In [4]:
#Define X & Y 
X = numeric_cols
y = df['fit']

In [5]:
#Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state =42, 
                                                    stratify = y)

In [6]:
# Check the baseline 
# We can see that we also have unbalanced classes 

y.value_counts(normalize = True )

1    0.734124
0    0.136681
2    0.129195
Name: fit, dtype: float64

In [7]:
# Function to run different model types

def fit_model(X, y, model_name='lr', model_type=LogisticRegression()):
    
    # Pipeline for pre-processing
    pipe = Pipeline([
        ('scale', StandardScaler()),
        (model_name, model_type)
    ])
    # Fit the model 
    model = pipe.fit(X, y)
    
    #Score
    score = model.score(X, y)
                   
    # Print attributes
    print(f"For model: {model_type}")
#     print(f"Score: {score}")
    
    return model

In [8]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    # Evaluate training data
    preds_train = model.predict(X_train)
    score_train = model.score(X_train, y_train)

    # Evaluate testing data
    preds_test = model.predict(X_test)
    score_test = model.score(X_test, y_test)
    
    # Print results
    
    print(f'Train Score: {score_train:.4f}')
    print(f'Test Score: {score_test:.4f}')

# Models

In [9]:
# Logistic Regression
# Score very similar to the baseline, not overfit  
logreg = fit_model(X_train, y_train, model_name='logreg', model_type=LogisticRegression())

evaluate_model(logreg, X_train, X_test, y_train, y_test)

For model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
Train Score: 0.7368
Test Score: 0.7344


In [10]:
# KNN
# Score below baseline and overfit  
knn = fit_model(X_train, y_train, model_name='knn', model_type=KNeighborsClassifier())

evaluate_model(knn, X_train, X_test, y_train, y_test)

For model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Train Score: 0.7656
Test Score: 0.6939


In [11]:
# Random Forests
# Very overfit 
rf = fit_model(X_train, y_train, model_name='rf', model_type=RandomForestClassifier())

evaluate_model(rf, X_train, X_test, y_train, y_test)

For model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Train Score: 0.9814
Test Score: 0.7079


In [12]:
# AdaBoost
# Similar score to baseline, not overfit  
ada = fit_model(X_train, y_train, model_name='ada', model_type=AdaBoostClassifier())

evaluate_model(ada, X_train, X_test, y_train, y_test)

For model: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
Train Score: 0.7369
Test Score: 0.7349


In [13]:
# Bagging Classifier
# Very overfit 
bag = fit_model(X_train, y_train, model_name='bag', model_type=BaggingClassifier())

evaluate_model(bag, X_train, X_test, y_train, y_test)

For model: BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)
Train Score: 0.9819
Test Score: 0.7033


In [14]:
# SVM
# Similar to baseline
svm = fit_model(X_train, y_train, model_name='svm', model_type=svm.SVC())

evaluate_model(svm, X_train, X_test, y_train, y_test)

For model: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
Train Score: 0.7363
Test Score: 0.7341


# GridSearch through Best Model 

In this case, Random Forests train score was strongl. I will gridsearch to improve the test score and minimize the overfitness. 

In [15]:
#Scale the Data (standard scaler was previously in the pipeline)
ss = StandardScaler()                       
ss.fit(X_train)                            
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [16]:
#Random Forest
rf = RandomForestClassifier(random_state=42)

#Indicate the gridearch params 
rf_params = {'bootstrap': [True], 
             'max_depth': [10], 
             'max_features':['auto'], 
             'min_samples_leaf': [1], 
             'min_samples_split': [10],
             'n_estimators': [750] , 
             'n_jobs':[-2]}

gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train_sc, y_train)
print(gs.best_score_)        
gs.best_params_

0.7344483434352641


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 750,
 'n_jobs': -2}

In [17]:
print(gs.score(X_train_sc, y_train))
print(gs.score(X_test_sc, y_test))
print(cross_val_score(rf, X_train_sc, y_train, cv=5).mean())

0.7348129243950234
0.7339712918660287
0.7127558623437595


# SMOTE

Try to use smote here because the classes were unbalanced. Only the random forests performed well so I'm curious if smote will help. 

In [18]:
# Code from kaggle
# https://www.kaggle.com/qianchao/smote-with-imbalance-data

#Gradient boosting
#https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

In [19]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)

X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

In [20]:
print("Before OverSampling, counts of label '2': {}".format(sum(y_train==2)))
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '2': {}".format(sum(y_train_res==2)))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

Before OverSampling, counts of label '2': 2835
Before OverSampling, counts of label '1': 16109
Before OverSampling, counts of label '0': 2999 

After OverSampling, the shape of train_X: (48327, 74)
After OverSampling, the shape of train_y: (48327,) 

After OverSampling, counts of label '2': 16109
After OverSampling, counts of label '1': 16109
After OverSampling, counts of label '0': 16109


**Re-fit on Gradient Boost Model to see if it improves the score** 

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

In [22]:
#Gradient Boosting
# Accuracy for GB is not that strong - need to explore AUC ROC

gb = fit_model(X_train_res, y_train_res, model_name='gb', model_type= GradientBoostingClassifier())

evaluate_model(gb, X_train_res, X_test, y_train_res, y_test)

For model: GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
Train Score: 0.7440
Test Score: 0.7289


# Evaluate Predictions 

In [23]:
#Predictions 
preds_gb = gb.predict(X_test)

Since the classes were unbalanced, I want to use AUC ROC score rather than accuracy. However there is no Multiclass AUC ROC score. I found this medium article: 
https://medium.com/@plog397/auc-roc-curve-scoring-function-for-multi-class-classification-9822871a6659

In [24]:
#AUC ROC score for multiclass classification

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

In [25]:
multiclass_roc_auc_score(y_test, preds_gb)

0.548574374292707

In [26]:
# Create DataFrame with column for predicted values
results = pd.DataFrame({'predicted': preds_gb, 
                        'actual': y_test})

#Find the rows with incorrect predictions 
misclass = results[results['predicted'] != results['actual']]
print(misclass.shape)
misclass.head()

(1983, 2)


Unnamed: 0,predicted,actual
7537,1,2
1140,1,2
27324,1,2
25527,0,2
9556,1,2


In [27]:
#Confusion Matrix 
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, 
                 preds_gb) 

#Here we can see the misclassified predictions 

array([[ 133,  826,   41],
       [ 161, 5101,  108],
       [  39,  808,   98]])

#### GridSearch through GradientBoost

Commenting out the below because my computer can't handle the gridsearch 

In [28]:
#Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)

#Indicate the gridearch params 
gb_params = {'learning_rate': [0.2],
             'max_depth': [3], 
             'max_features' : ['auto'],
             'min_samples_leaf': [3],
             'min_samples_split': [1.0], 
             'n_estimators': [300],
             'random_state' : [42]}

gs = GridSearchCV(gb, param_grid=gb_params, cv=5)
gs.fit(X_train_sc, y_train)
print(gs.best_score_)        
gs.best_params_

0.7372738458733993


{'learning_rate': 0.2,
 'max_depth': 3,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 1.0,
 'n_estimators': 300,
 'random_state': 42}

In [29]:
print(gs.score(X_train_sc, y_train))
print(gs.score(X_test_sc, y_test))
print(cross_val_score(rf, X_train_sc, y_train, cv=5).mean())

0.7382764435127376
0.7360218728639781
0.7127558623437595
