# Rent the Runway: Modeling 

### Import Libraries and Data

In [1]:
#Data cleaning
import numpy as np 
import pandas as pd

#Modeling 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

np.random.seed(42)

#Warnings - to keep the notebook clean 
import warnings 
warnings.filterwarnings('ignore')
pd.options.display.max_columns= 5000

In [2]:
df = pd.read_csv('./rent_the_runway_cleaned_with_dummies.csv')
df.head()

Unnamed: 0,age,bust_size,fit,height,item_id,rating,review_date,review_summary,review_text,size,user_id,weight,body_type_apple,body_type_athletic,body_type_full bust,body_type_hourglass,body_type_pear,body_type_petite,body_type_straight & narrow,rented_for_date,rented_for_everyday,rented_for_formal affair,rented_for_other,rented_for_party,rented_for_vacation,rented_for_wedding,rented_for_work,category_ballgown,category_blazer,category_blouse,category_blouson,category_bomber,category_caftan,category_cami,category_cape,category_cardigan,category_coat,category_combo,category_culotte,category_culottes,category_down,category_dress,category_duster,category_frock,category_gown,category_henley,category_hoodie,category_jacket,category_jeans,category_jumpsuit,category_kaftan,category_kimono,category_knit,category_legging,category_leggings,category_maxi,category_midi,category_mini,category_overalls,category_pant,category_pants,category_parka,category_peacoat,category_poncho,category_print,category_pullover,category_romper,category_sheath,category_shift,category_shirt,category_shirtdress,category_skirt,category_skirts,category_skort,category_suit,category_sweater,category_sweatershirt,category_sweatshirt,category_t-shirt,category_tank,category_tee,category_tight,category_top,category_trench,category_trouser,category_trousers,category_tunic,category_turtleneck,category_vest
0,36,34d,1,5. 5,815826,10,2017-09-18,"Good fit, great style, comfortable yet elegant",Rented for early brunch/garden wedding. My to...,20,334577,137,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,34,34c,1,5. 5,1636171,10,2017-04-19,Love the fit and fabric!,This dress was perfect. The fabric is thick an...,8,634115,125,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,30,34a,0,5. 8,438881,10,2017-10-18,"Simple black dress, loved the ruffles.",Wore this dress to a Naval Ball. The dress did...,13,988705,124,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,32,34d,1,5. 8,1392841,10,2017-08-07,One of my fav rentals,Loved this fun dress. The low V in the front m...,16,977884,158,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,34,34a,2,5. 7,160612,8,2015-01-05,Pretty but not meant for small chested girls,"Pretty dress, but I didn't have the bust to fi...",8,795673,135,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Pre- Modeling

In [3]:
#Gather all numeric columns excluding fit 
numeric_cols = df.select_dtypes(include=['int64']).drop(columns= 'fit')

In [4]:
#Define X & Y 
X = numeric_cols
y = df['fit']

In [5]:
#Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state =42, 
                                                    stratify = y)

In [7]:
# Check the baseline 
# We can see that we also have unbalanced classes 

y.value_counts(normalize = True )

1    0.735927
0    0.134513
2    0.129560
Name: fit, dtype: float64

In [8]:
# Function to run different model types

def fit_and_evaluate(X, y, model_name='lr', model_type=LogisticRegression()):
    
    # Pipeline for pre-processing
    pipe = Pipeline([
        ('scale', StandardScaler()),
        (model_name, model_type)
    ])
    # Fit the model 
    model = pipe.fit(X, y)
    
    #Score
    score = model.score(X, y)
                   
    # Print attributes
    print(f"For model: {model_type}")
    print(f"Score: {score}")
    
    return model

# Models

In [9]:
# Logistic Regression
# Score very similar to the baseline 
logreg = fit_and_evaluate(X_train, y_train, model_name='logreg', model_type=LogisticRegression())

logreg = fit_and_evaluate(X_test, y_test, model_name='logreg', model_type=LogisticRegression())

For model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Score: 0.7376690804754747
For model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Score: 0.7401284328460173


In [10]:
# KNN
# Score slightly higher than the baseline 
knn = fit_and_evaluate(X_train, y_train, model_name='knn', model_type=KNeighborsClassifier())

knn = fit_and_evaluate(X_test, y_test, model_name='knn', model_type=KNeighborsClassifier())

For model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
Score: 0.7650407614883636
For model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
Score: 0.7626724962426561


In [11]:
# Random Forests
# Good score!
rf = fit_and_evaluate(X_train, y_train, model_name='rf', model_type=RandomForestClassifier())

rf = fit_and_evaluate(X_test, y_test, model_name='rf', model_type=RandomForestClassifier())

For model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Score: 0.9816459443457667
For model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Score: 0.9816914879081842


In [12]:
# AdaBoost
# Similar score to baseline 
ada = fit_and_evaluate(X_train, y_train, model_name='ada', model_type=AdaBoostClassifier())

ada = fit_and_evaluate(X_test, y_test, model_name='ada', model_type=AdaBoostClassifier())

For model: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Score: 0.7383066903493192
For model: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Score: 0.7393086487225031


In [13]:
# Bagging Classifier
# Good Scores! 
bag = fit_and_evaluate(X_train, y_train, model_name='bag', model_type=BaggingClassifier())

bag = fit_and_evaluate(X_test, y_test, model_name='bag', model_type=BaggingClassifier())

For model: BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)
Score: 0.9808261602222526
For model: BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)
Score: 0.9784123514141276


# GridSearch through Best Model 

In this case, Random Forests did very well. I will gridsearch but there may be little room to improve the score.

In [14]:
#Scale the Data (standard scaler was previously in the pipeline)
ss = StandardScaler()                       
ss.fit(X_train)                            
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
# #Random Forest
# rf = RandomForestClassifier(random_state=42)

# #Indicate the gridearch params 
# rf_params = {'bootstrap': [True, False], 
#              'max_depth': [10,25,50], 
#              'max_features':['auto'], 
#              'max_leaf_nodes':[ None],
#              'min_impurity_split': [None],
#              'min_samples_leaf': [1, 2, 4], 
#              'min_samples_split': [2, 5, 10],
#              'n_estimators': [100, 300] , 
#              'n_jobs':[-2]}

# gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
# gs.fit(X_train_sc, y_train)
# print(gs.best_score_)        
# gs.best_params_

In [None]:
# print(gs.score(X_train_sc, y_train))
# print(gs.score(X_test_sc, y_test))
# print(cross_val_score(rf, X_train_sc, y_train, cv=5).mean())

# SMOTE

Try to use smote here because the classes were unbalanced. Only the random forests performed well so I'm curious if smote will help. 

In [None]:
# Code from kaggle
# https://www.kaggle.com/qianchao/smote-with-imbalance-data

In [15]:
from imblearn.over_sampling import SMOTE

In [16]:
print("Before OverSampling, counts of label '2': {}".format(sum(y_train==2)))
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)

X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '2': {}".format(sum(y_train_res==2)))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

Before OverSampling, counts of label '2': 2845
Before OverSampling, counts of label '1': 16159
Before OverSampling, counts of label '0': 2953 

After OverSampling, the shape of train_X: (48477, 83)
After OverSampling, the shape of train_y: (48477,) 

After OverSampling, counts of label '2': 16159
After OverSampling, counts of label '1': 16159
After OverSampling, counts of label '0': 16159


In [17]:
#Balance the test data as well 
X_test_res, y_test_res = sm.fit_sample(X_test, y_test.ravel())

**Re-fit on Gradient Boost Model to see if it improves the score** 

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
#Gradient Boosting
# Accuracy for GB is not that strong - need to explore AUC ROC
gb = fit_and_evaluate(X_train_res, y_train_res, model_name='gb', model_type= GradientBoostingClassifier())

For model: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
Score: 0.7181343730016296


In [20]:
gb = fit_and_evaluate(X_test_res, y_test_res, model_name='gb', model_type= GradientBoostingClassifier())

For model: GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
Score: 0.7414902834509222


In [21]:
#Predictions 
preds_gb = gb.predict(X_test_res)

Since the classes were unbalanced, I want to use AUC ROC score rather than accuracy. However there is not Multiclass AUC ROC score. I found this medium article: 
https://medium.com/@plog397/auc-roc-curve-scoring-function-for-multi-class-classification-9822871a6659

In [22]:
#AUC ROC score for multiclass classification

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

In [23]:
multiclass_roc_auc_score(y_test_res, preds_gb)

0.8061177125881915

Smote helped to balance the classes, and the Gradient Boosting model has a decent AUC ROC score. However, the Random Forests model has a stronger accuracy score. 

#### GridSearch through GradientBoost

Commenting out the below because my computer can't handle the gridsearch 

In [24]:
# #Gradient Boosting
# gb = GradientBoostingClassifier(random_state=42)

# #Indicate the gridearch params 
# gb_params = {'learning_rate': [0.1],
#              'max_depth': [3, 5], 
#              'min_samples_leaf': [1, 3],
#              'min_samples_split': [2, 5], 
#              'n_estimators': [10, 50]}

# gs = GridSearchCV(gb, param_grid=gb_params, cv=5)
# gs.fit(X_train_sc, y_train)
# print(gs.best_score_)        
# gs.best_params_

# Evaluate Predictions

Based on the above, I have decided to move forward with the Random Forests model. 

In [25]:
#Generate predictions for random forests
preds_rf = rf.predict(X_test)

In [26]:
#Evaluate the AUC ROC for Random Forests 
multiclass_roc_auc_score(y_test, preds_rf)

#Even the AUC ROC score is good!

0.9658995720238511

In [27]:
# Create DataFrame with column for predicted values
results = pd.DataFrame({'predicted': preds_rf, 
                        'actual': y_test})

#Find the rows with incorrect predictions 
misclass = results[results['predicted'] != results['actual']]
print(misclass.shape)
misclass.head()

(134, 2)


Unnamed: 0,predicted,actual
8978,1,0
24378,1,2
27499,1,2
25818,1,2
23182,1,2


In [28]:
#Confusion Matrix 
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, 
                 preds_rf) 

#Here we can see the misclassified predictions 

array([[ 943,   41,    1],
       [   3, 5383,    0],
       [   1,   88,  859]])