# Rent the Runway: Recommender

### Import Libraries and Data

In [61]:
#Data cleaning
import numpy as np
import pandas as pd

#Recommender
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
#import clean csv - the data was cleaned in the first notebook 
df = pd.read_csv('./data/rent_the_runway_cleaned_no_dummies.csv')

In [63]:
#Drop the null values 
df.dropna(inplace=True)

In [64]:
#Check the size of the existing data 
df.shape

(29237, 15)

# Recommender 1: Collaborative - user based 

Users are rows 

In [65]:
#Create a pivot table 
pivot_user= df.pivot_table(index = 'user_id', columns= 'item_id', values= 'rating')
pivot_user.head()

item_id,123373,123793,124204,124553,125424,125465,125564,126335,127081,127495,...,2958657,2959486,2960025,2960969,2962646,2963344,2963601,2964470,2965924,2966087
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,,,,,,,,,,,...,,,,,,,,,,
82,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
224,,,,,,,,,,,...,,,,,,,,,,
302,,,,,,,,,,,...,,,,,,,,,,


In [66]:
#Create a sparse matrix

import sys
sys.getsizeof(pivot_user)

pivot_sparse_user = sparse.csr_matrix(pivot_user.fillna(0))   

pivot_sparse_user.shape  

sys.getsizeof(pivot_sparse_user)    

56

In [67]:
print(pivot_sparse_user[:20, :])

#gives you what row and what column each rating belongs to (i.e. item 0 and user 91)

  (0, 91)	10.0
  (0, 203)	10.0
  (1, 1664)	10.0
  (2, 426)	10.0
  (3, 527)	10.0
  (3, 2636)	10.0
  (4, 1247)	10.0
  (5, 11)	10.0
  (6, 588)	10.0
  (7, 1653)	10.0
  (8, 1080)	10.0
  (9, 1158)	10.0
  (10, 1355)	8.0
  (11, 3361)	10.0
  (12, 2319)	8.0
  (13, 39)	10.0
  (14, 2980)	8.0
  (15, 621)	10.0
  (16, 835)	8.0
  (16, 1250)	10.0
  (17, 36)	10.0
  (18, 1399)	10.0
  (19, 27)	8.0
  (19, 1117)	8.0


In [68]:
#Calculate Cosine Similarity 

recommender_user = pairwise_distances(pivot_sparse_user, metric='cosine')

recommender_user.shape

(22857, 22857)

In [69]:
recommender_user

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [70]:
#Create distance dataframes 

recommender_user_df = pd.DataFrame(recommender_user, index = pivot_user.index, columns=pivot_user.index)
recommender_user_df.head()

#index and columns are the same because its a square matrix

user_id,9,82,97,224,302,321,332,457,464,526,...,999478,999512,999522,999561,999590,999658,999782,999910,999913,999914
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
82,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
97,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
224,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
302,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [71]:
#define the user id 
i = 224

In [72]:
#Search the terms 

user_list = recommender_user_df[i].sort_values()[1:11]

# Search Qeury 
# q = 'Item ID/ Dress name'
# df[df['item ID'].str.contains(q)]['item ID']

In [73]:
user_list

user_id
237335    0.292893
824930    0.292893
57078     0.292893
420900    0.500000
9         1.000000
671462    1.000000
671428    1.000000
671402    1.000000
671350    1.000000
671216    1.000000
Name: 224, dtype: float64

In [74]:
df[df['user_id'] == 237335]

Unnamed: 0,age,body_type,bust_size,category,fit,height,item_id,rating,rented_for,review_date,review_summary,review_text,size,user_id,weight
4407,21,athletic,34b,dress,2,5. 6,1661123,10,formal affair,2017-12-21,This dress clings in all the right places and ...,The fit of the dress is perfect! It looked nic...,8,237335,140


With the user based recommender, it's working off the similar features of the user. In this case, those similar features relate to size, weight, body type, etc. But given that retail stores often have a diverse range of sizes for different body types, this version of the recommender does not associate with the user's "style". 

# Item based - Items are the rows

In [75]:
#Create a pivot table 
pivot_item = df.pivot_table(index = 'item_id', columns= 'user_id', values= 'rating')
pivot_item.head()


user_id,9,82,97,224,302,321,332,457,464,526,...,999478,999512,999522,999561,999590,999658,999782,999910,999913,999914
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123373,,,,,,,,,,,...,,,,,,,,,,
123793,,,,,,,,,,,...,,,,,,,,,,
124204,,,,,,,,,,,...,,,,,,,,,,
124553,,,,,,,,,,,...,,,,,,,,,,
125424,,,,,,,,,,,...,10.0,,,,,,,,,


In [76]:
import sys
sys.getsizeof(pivot_item)

pivot_sparse_item = sparse.csr_matrix(pivot_item.fillna(0)) 

pivot_sparse_item.shape                                       

sys.getsizeof(pivot_sparse_item)                              

print(pivot_sparse_item[:20, :])

#gives you what row and what column each movie rating belong to

  (0, 28)	10.0
  (0, 119)	10.0
  (0, 140)	8.0
  (0, 344)	4.0
  (0, 498)	8.0
  (0, 628)	10.0
  (0, 877)	10.0
  (0, 958)	10.0
  (0, 1034)	10.0
  (0, 1518)	10.0
  (0, 1999)	2.0
  (0, 2229)	8.0
  (0, 2703)	4.0
  (0, 2757)	10.0
  (0, 2826)	10.0
  (0, 2933)	10.0
  (0, 3237)	10.0
  (0, 3801)	10.0
  (0, 4374)	8.0
  (0, 4397)	10.0
  (0, 4998)	10.0
  (0, 5016)	8.0
  (0, 5122)	6.0
  (0, 5256)	10.0
  (0, 5284)	8.0
  :	:
  (19, 14920)	10.0
  (19, 15117)	10.0
  (19, 15165)	8.0
  (19, 16322)	10.0
  (19, 16482)	8.0
  (19, 16964)	8.0
  (19, 16968)	10.0
  (19, 17080)	10.0
  (19, 17211)	10.0
  (19, 17446)	10.0
  (19, 17547)	10.0
  (19, 18348)	6.0
  (19, 19177)	10.0
  (19, 19956)	10.0
  (19, 19994)	8.0
  (19, 20263)	10.0
  (19, 20337)	10.0
  (19, 20363)	10.0
  (19, 20536)	8.0
  (19, 20811)	8.0
  (19, 20968)	8.0
  (19, 21550)	10.0
  (19, 21603)	10.0
  (19, 21630)	6.0
  (19, 22741)	10.0


In [77]:
#Calculate Cosine Similarity 

recommender_item = pairwise_distances(pivot_sparse_item, metric='cosine')

recommender_item.shape

(4517, 4517)

In [78]:
#Create distance dataframes 

recommender_item_df = pd.DataFrame(recommender_item, index = pivot_item.index, columns=pivot_item.index)
recommender_item_df.head()

item_id,123373,123793,124204,124553,125424,125465,125564,126335,127081,127495,...,2958657,2959486,2960025,2960969,2962646,2963344,2963601,2964470,2965924,2966087
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123373,0.0,0.992892,1.0,1.0,1.0,1.0,1.0,1.0,0.987892,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
123793,0.992892,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.993564,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
124204,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
124553,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
125424,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [79]:
#Search the terms 

recommender_item_df[123793].sort_values()[1:11]

# Search Qeury 
# q = 'Item ID/ Dress name'
# df[df['item ID'].str.contains(q)]['item ID']

item_id
2249133    0.936981
881837     0.936981
2376757    0.936981
1219000    0.963616
1643568    0.963616
2834590    0.966612
546398     0.966969
420842     0.974536
2236896    0.975631
2877562    0.979753
Name: 123793, dtype: float64

In [80]:
df[df['item_id'] == 2249133]

Unnamed: 0,age,body_type,bust_size,category,fit,height,item_id,rating,rented_for,review_date,review_summary,review_text,size,user_id,weight
20618,30,petite,32c,top,1,5. 3,2249133,6,party,2017-04-17,Cute top to wear to a party,Very pretty top that I wore for a birthday par...,4,357689,115


The cosine similarities are not great results, many are very close to 1. 

# Content Based

In [81]:
#NLP
from nltk.stem import WordNetLemmatizer
import regex as re

#Recommender
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### NLP Cleaning

In [82]:
#Function to clean the text data 

def clean_text(df):

    df['full_review'] = df['review_summary'] + ' ' + df['review_text']                 
    #Combine selftext & title into one column 
    
    df['full_review'] = [str(words).lower() for words in df['full_review']]    
    #Change all the text to lowercase 
    
    df['full_review'] = [(re.sub(r'[^\w\s]','', word)) for word in df['full_review']]
    #remove puncutation. Got this regex code from stack overflow, checked it in regex101.com and put it in a list comp
    
    df['full_review'] = [(re.sub('[^A-Za-z]+', ' ', text)) for text in df['full_review']]
    #remove numbers
    
    return df

In [83]:
clean_text(df)

Unnamed: 0,age,body_type,bust_size,category,fit,height,item_id,rating,rented_for,review_date,review_summary,review_text,size,user_id,weight,full_review
0,36,hourglass,34d,dress,1,5. 5,815826,10,wedding,2017-09-18,"Good fit, great style, comfortable yet elegant",Rented for early brunch/garden wedding. My to...,20,334577,137,good fit great style comfortable yet elegant r...
1,34,athletic,34c,dress,1,5. 5,1636171,10,work,2017-04-19,Love the fit and fabric!,This dress was perfect. The fabric is thick an...,8,634115,125,love the fit and fabric this dress was perfect...
2,30,hourglass,34a,gown,0,5. 8,438881,10,formal affair,2017-10-18,"Simple black dress, loved the ruffles.",Wore this dress to a Naval Ball. The dress did...,13,988705,124,simple black dress loved the ruffles wore this...
3,32,hourglass,34d,dress,1,5. 8,1392841,10,wedding,2017-08-07,One of my fav rentals,Loved this fun dress. The low V in the front m...,16,977884,158,one of my fav rentals loved this fun dress the...
4,34,pear,34a,dress,2,5. 7,160612,8,party,2015-01-05,Pretty but not meant for small chested girls,"Pretty dress, but I didn't have the bust to fi...",8,795673,135,pretty but not meant for small chested girls p...
5,42,hourglass,36d,dress,1,5. 4,810197,10,party,2017-06-01,"Amazing fit, beautiful material, a little sexy...",This dress was perfect. I felt so good in it....,12,247355,132,amazing fit beautiful material a little sexy l...
6,29,hourglass,36b,dress,1,5. 1,1698166,8,wedding,2016-08-15,Great summer wedding pick!,I loved this dress! I run between a 12 and a 1...,24,193032,185,great summer wedding pick i loved this dress i...
7,30,athletic,34c,gown,1,5. 4,1015626,10,formal affair,2017-04-22,The perfect dress for a sparkly Broadway openi...,This dress is perfect. I don't think I've ever...,12,976631,135,the perfect dress for a sparkly broadway openi...
8,33,hourglass,32a,dress,2,5. 2,172914,8,party,2014-05-19,Tons of Compliments!,I wore this for my bridal shower at a Country ...,4,162849,118,tons of compliments i wore this for my bridal ...
9,31,pear,34a,dress,1,5. 3,576743,10,wedding,2017-07-17,Rented this dress for my brother's wedding.,An excellent dress. Would definitely rent aga...,16,379671,135,rented this dress for my brothers wedding an e...


In [84]:
#Lemmatize
lm= WordNetLemmatizer()

df['full_review'] = [" ".join([lm.lemmatize(w) for w in i.split()]) for i in df['full_review']]

In [85]:
tfidf = TfidfVectorizer(stop_words='english')

In [86]:
tfidf_matrix = tfidf.fit_transform(df['review_summary'])

### Cosine Similarity

Referenced this medium artcile:
https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243

In [87]:
count = CountVectorizer()
text_matrix = count.fit_transform(df['full_review'])

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(text_matrix, text_matrix)

In [88]:
#These cosine similarities are much better than the collaborative recommender
cosine_sim

array([[1.        , 0.42737223, 0.38925355, ..., 0.3765467 , 0.3685477 ,
        0.58386056],
       [0.42737223, 1.        , 0.57639296, ..., 0.39445737, 0.33052034,
        0.54537208],
       [0.38925355, 0.57639296, 1.        , ..., 0.50562599, 0.40456533,
        0.48662474],
       ...,
       [0.3765467 , 0.39445737, 0.50562599, ..., 1.        , 0.26238207,
        0.3979329 ],
       [0.3685477 , 0.33052034, 0.40456533, ..., 0.26238207, 1.        ,
        0.412737  ],
       [0.58386056, 0.54537208, 0.48662474, ..., 0.3979329 , 0.412737  ,
        1.        ]])

In [89]:
# creating a Series for the item so they are associated to an ordered numerical
# create list to match the indexes
indices = pd.Series(df.index)

#  defining the function that takes in item_id and returns top 10 recommendations 

def recommendations(item_id, cosine_sim = cosine_sim):
    
    # initializing the empty list
    recommended_items = []
    
    # gettin the index of the item that matches the item_id
    idx = indices[indices == item_id].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar items
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching items
    for i in top_10_indexes:
        recommended_items.append(list(df.index)[i])
        
    print(recommended_items)
#         print(cosine_sim)
        
    return recommended_items, cosine_sim

In [90]:
recommendations(29207)

[1867, 26019, 8835, 4167, 2198, 11384, 16851, 26539, 7122, 5245]


([1867, 26019, 8835, 4167, 2198, 11384, 16851, 26539, 7122, 5245],
 array([[1.        , 0.42737223, 0.38925355, ..., 0.3765467 , 0.3685477 ,
         0.58386056],
        [0.42737223, 1.        , 0.57639296, ..., 0.39445737, 0.33052034,
         0.54537208],
        [0.38925355, 0.57639296, 1.        , ..., 0.50562599, 0.40456533,
         0.48662474],
        ...,
        [0.3765467 , 0.39445737, 0.50562599, ..., 1.        , 0.26238207,
         0.3979329 ],
        [0.3685477 , 0.33052034, 0.40456533, ..., 0.26238207, 1.        ,
         0.412737  ],
        [0.58386056, 0.54537208, 0.48662474, ..., 0.3979329 , 0.412737  ,
         1.        ]]))

In [91]:
recs_list = recommendations(29207)

[1867, 26019, 8835, 4167, 2198, 11384, 16851, 26539, 7122, 5245]


In [92]:
for i in recs_list:
    df.index== i
    print(df.loc[i])
    

       age          body_type bust_size    category  fit  height  item_id  \
1867    44          hourglass       34c  shirtdress    1   5. 4   1840637   
26019   39             petite       32d        gown    1   5. 2    141688   
8835    34             petite       32b        gown    1   5. 1    832622   
4167    32          hourglass       38d       dress    1  5. 10    249458   
2198    42  straight & narrow       32c       dress    1   5. 3    172027   
11384   31          hourglass       36d        gown    1   5. 6   1714731   
16851   28          full bust   34ddd/e        gown    0   5. 8    149655   
26539   35           athletic       36c       dress    0   5. 6    345146   
7122    32          full bust      36dd       dress    1   5. 4    125424   
5245    30           athletic       36b      sheath    1   5. 9    987743   

       rating     rented_for review_date  \
1867        6          party  2016-08-05   
26019       8  formal affair  2014-02-09   
8835       10  forma

ValueError: Cannot index with multidimensional key

In [93]:
df[df.index == 1867]

Unnamed: 0,age,body_type,bust_size,category,fit,height,item_id,rating,rented_for,review_date,review_summary,review_text,size,user_id,weight,full_review
1867,44,hourglass,34c,shirtdress,1,5. 4,1840637,6,party,2016-08-05,Very cute and preppy,This dress is so cute! I've been wanting to re...,4,46348,108,very cute and preppy this dress is so cute ive...


In [94]:
df[df.index == 26019]

Unnamed: 0,age,body_type,bust_size,category,fit,height,item_id,rating,rented_for,review_date,review_summary,review_text,size,user_id,weight,full_review
26019,39,petite,32d,gown,1,5. 2,141688,8,formal affair,2014-02-09,Beautiful dress and even better in person!,This was my first experience with Rent the Run...,12,401375,123,beautiful dress and even better in person this...
