In [1]:
import json
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import math
import re
import string

In [2]:
# Load the raw data: a gzip-compressed file holding one JSON object per line.
# The context manager guarantees the file handle is closed once every line
# has been parsed (the handle was previously left open for the kernel's lifetime).
with gzip.open("renttherunway_final_data.json.gz") as f:
    dataset = [json.loads(line) for line in f]

In [3]:
# Peek at the first raw record to see which fields exist and how they are formatted
dataset[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

## Data Cleaning

In [4]:
# Pre-processing the original data set
# NOTE: `data` is bound to the same list object as `dataset` here; no copy is made.
data = dataset

In [5]:
# Drop every record whose 'rating' is null (None after JSON parsing).
# Identity comparison (`is not None`) is the idiomatic null test.
data = [d for d in dataset if d['rating'] is not None]
print(len(data), len(dataset))

192462 192544


In [6]:
# Keep only the records that actually carry an 'age' key
data = list(filter(lambda rec: 'age' in rec, data))
print(len(data), len(dataset))

191503 192544


In [7]:
# Keep only the records that actually carry a 'weight' key
data = [rec for rec in data if 'weight' in rec.keys()]
print(len(data), len(dataset))

162153 192544


In [8]:
# Keep only the records whose key set includes 'height'
data = [entry for entry in data if entry.keys() >= {'height'}]
print(len(data), len(dataset))

161716 192544


In [9]:
# Drop every record where 'body type' is missing or null.
# dict.get returns None for an absent key, so one identity test covers both cases.
data = [d for d in data if d.get('body type') is not None]
print(len(data), len(dataset))

153110 192544


In [10]:
# Drop every record where 'rented for' is missing or null.
# (The null test previously inspected 'body type' by mistake, so a record with
# a null 'rented for' value would have been kept.)
data = [d for d in data if d.get('rented for') is not None]
print(len(data), len(dataset))

153100 192544


In [11]:
# Convert the string-typed fields of every record to integers, in place.
# The cell is safe to re-run: a record whose 'height' is no longer a string has
# already been converted and is skipped. All values are parsed before any of
# them is written back, so a parse error never leaves a record half-converted.
for d in data:
    if not isinstance(d['height'], str):
        continue  # already converted on a previous run of this cell
    # 'rating' arrives as '2','4',...,'10'; halve it onto the 1..5 scale
    rating = int(d['rating']) // 2
    age = int(d['age'])
    # Strip the trailing 3-character unit ('lbs') before parsing the weight
    weight = int(d['weight'][:-3])
    # Height looks like 5' 8" -> two digit groups (feet, inches) -> total inches
    feet, inch = re.findall(r'\d+', d['height'])
    height = int(feet) * 12 + int(inch)
    d.update(rating=rating, age=age, weight=weight, height=height)

In [12]:
# Normalise review text: lower-case it and strip every punctuation character
sp = set(string.punctuation)
def cleanText(text):
    """Return `text` lower-cased with all characters found in `sp` removed."""
    kept = []
    for ch in text.lower():
        if ch in sp:
            continue
        kept.append(ch)
    return ''.join(kept)

In [13]:
# Spot-check one record (index 68) after the cleaning steps above
data[68]

{'fit': 'fit',
 'user_id': '203660',
 'bust size': '34c',
 'item_id': '1126889',
 'weight': 160,
 'rating': 3,
 'rented for': 'party',
 'review_text': "The dress is absolutely gorgeous, unfortunately the dress proportions were off for my height. I would have loved to wear this out, but I couldn't because the lengths were awkward on my body. ",
 'body type': 'athletic',
 'review_summary': 'Too Long',
 'category': 'dress',
 'height': 64,
 'size': 12,
 'age': 28,
 'review_date': 'January 3, 2017'}

In [14]:
# Number of records kept after cleaning vs. number of raw records
print(*map(len, (data, dataset)))

153100 192544


## Linear Regression Model

In [15]:
# use the one-hot encoding for 'fit' 
def feature(datum, max_len=None):
    """Build the feature vector for one record.

    Parameters
    ----------
    datum : dict
        Record providing 'review_text', 'size', 'weight' and 'fit'.
    max_len : number, optional
        Divisor used to scale the review length. When omitted, the
        module-level `max_length` is used, so existing `feature(d)` calls
        behave as before.

    Returns
    -------
    list
        [1, scaled review length, size, weight, fit == 'small', fit == 'large'],
        where the last two entries are 0.0/1.0 indicators.
    """
    if max_len is None:
        # Fall back to the notebook-level global, looked up at call time
        max_len = max_length
    fit = datum['fit']
    # 'height' is currently not part of the feature vector.
    return [
        1,  # constant offset term
        len(datum['review_text']) / max_len,
        datum['size'],
        datum['weight'],
        1.0 * (fit == 'small'),
        1.0 * (fit == 'large'),
    ]

In [16]:
# Regression targets: the 'rating' value of every cleaned record
ratings = [rec['rating'] for rec in data]
# Character count of each review; the largest one scales the length feature
lengths = list(map(len, (rec['review_text'] for rec in data)))
max_length = max(lengths)

In [17]:
# Targets are the ratings; the design matrix has one feature row per record
Y = ratings
X = list(map(feature, data))

In [18]:
# Fit on the complete data set and report the in-sample mean squared error.
# fit_intercept=False because feature() already supplies a constant 1 column.
model = linear_model.LinearRegression(fit_intercept=False).fit(X, Y)
y_pred = model.predict(X)
sse = sum(err**2 for err in (Y - y_pred))
mse = sse / len(Y)
mse

0.48267930072288345

In [19]:
# Looking for the right split: for each divisor i, the first len(X)//i rows are
# held out as the test slice and the model is fit on the remaining rows.
# NOTE(review): rows are not shuffled, and choosing the divisor by its own test
# MSE makes the reported score optimistic; confirm this protocol is intended.
#
# The running best starts at +inf instead of whatever `mse` an earlier cell left
# behind. Otherwise `ratio` could stay 0 (no split beating the previous value,
# or this cell being re-run), and the next cell would divide by zero.
ratio = 0
mse = float('inf')
for i in range(2, 10):
    cut = len(X) // i  # X and Y are built from the same records, so one cut serves both
    test2, train2 = X[:cut], X[cut:]
    testY, trainY = Y[:cut], Y[cut:]
    model2 = linear_model.LinearRegression(fit_intercept=False)
    model2.fit(train2, trainY)
    y2_pred = model2.predict(test2)
    sse2 = sum([x**2 for x in (testY - y2_pred)])
    currMSE = sse2 / len(testY)
    print("i=",i," - Current MSE=", currMSE)
    if currMSE < mse:
        mse = currMSE
        ratio = i

i= 2  - Current MSE= 0.4804257977616084
i= 3  - Current MSE= 0.4768978345806681
i= 4  - Current MSE= 0.47165313243643536
i= 5  - Current MSE= 0.4725679992027816
i= 6  - Current MSE= 0.47484218254336413
i= 7  - Current MSE= 0.4690256118406555
i= 8  - Current MSE= 0.4686945463164344
i= 9  - Current MSE= 0.4675537408238754


In [20]:
# Display the selected split divisor together with the current value of `mse`
ratio, mse

(9, 0.4675537408238754)

In [21]:
# Refit with the chosen divisor: train on everything after the first
# len(X)//ratio rows, then score on that leading held-out slice.
train2, test2 = X[len(X)//ratio:], X[:len(X)//ratio]
trainY, testY = Y[len(Y)//ratio:], Y[:len(Y)//ratio]
model2 = linear_model.LinearRegression(fit_intercept=False).fit(train2, trainY)
y2_pred = model2.predict(test2)
sse2 = sum(err**2 for err in (testY - y2_pred))
currMSE = sse2 / len(testY)
currMSE

0.4675537408238754