In [122]:
import json
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import math
import re
import string

In [123]:
# Load the gzip-compressed file, parsing each line as one JSON document.
# The context manager closes the file handle as soon as every line has been
# parsed, instead of leaving it open for the lifetime of the kernel.
with gzip.open("renttherunway_final_data.json.gz") as f:
    dataset = [json.loads(l) for l in f]

In [124]:
# Display the first loaded record to inspect the available fields and their raw types
dataset[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

## Data Cleaning

In [125]:
# Pre-processing the original data set.
# NOTE: this binds `data` to the same list object as `dataset` (an alias, not a copy).
data = dataset

In [126]:
# Drop every record whose 'rating' is missing or null.
# `.get` covers an absent key as well as an explicit None, and `is not None`
# is the identity test recommended over `!= None`.
data = [d for d in dataset if d.get('rating') is not None]
# Show how many records survive next to the size of the full data set
print(len(data), len(dataset))

192462 192544


In [127]:
# Keep only the records that carry an 'age' key
data = list(filter(lambda rec: 'age' in rec, data))
print(len(data), len(dataset))

191503 192544


In [128]:
# Keep only the records that carry a 'weight' key
data = [
    rec
    for rec in data
    if 'weight' in rec
]
print(len(data), len(dataset))

162153 192544


In [129]:
# Keep only the records that carry a 'height' key
data = list(filter(lambda rec: 'height' in rec, data))
print(len(data), len(dataset))

161716 192544


In [130]:
# Keep only the records whose 'body type' is present and not null
data = [rec for rec in data if rec.get('body type') is not None]
print(len(data), len(dataset))

153110 192544


In [131]:
# Drop every record whose 'rented for' is missing or null.
# The null check must look at the 'rented for' value itself; testing
# 'body type' here would let null 'rented for' values slip through.
data = [d for d in data if d.get('rented for') is not None]
# Show how many records survive next to the size of the full data set
print(len(data), len(dataset))

153100 192544


In [132]:
# Convert the string-typed fields of every record to integers, in place.
# Each conversion is guarded by a type check so that re-running this cell on
# already-converted records is a no-op, rather than halving the ratings a
# second time or failing when an int is sliced.
for d in data:
    # Convert 'rating' from string to int and change from 2,4,6,8,10 to 1,2,3,4,5
    if isinstance(d['rating'], str):
        d['rating'] = int(d['rating']) // 2
    # Convert 'age' from string to int
    if isinstance(d['age'], str):
        d['age'] = int(d['age'])
    # Drop the trailing 3-character 'lbs' unit and convert 'weight' to int
    if isinstance(d['weight'], str):
        d['weight'] = int(d['weight'][:-3])
    # Pull the two numbers out of the height string (feet, then inches)
    # and store the total height in inches
    if isinstance(d['height'], str):
        feet, inch = re.findall(r'\d+', d['height'])
        d['height'] = int(feet) * 12 + int(inch)

In [133]:
# Text normalisation for 'review_text': lowercase everything and strip punctuation
sp = set(string.punctuation)


def cleanText(text):
    """Return `text` lowercased with every character found in `sp` removed."""
    kept_chars = (ch for ch in text.lower() if ch not in sp)
    return ''.join(kept_chars)

In [134]:
# Spot-check one record after the in-place type conversions
data[68]

{'fit': 'fit',
 'user_id': '203660',
 'bust size': '34c',
 'item_id': '1126889',
 'weight': 160,
 'rating': 3,
 'rented for': 'party',
 'review_text': "The dress is absolutely gorgeous, unfortunately the dress proportions were off for my height. I would have loved to wear this out, but I couldn't because the lengths were awkward on my body. ",
 'body type': 'athletic',
 'review_summary': 'Too Long',
 'category': 'dress',
 'height': 64,
 'size': 12,
 'age': 28,
 'review_date': 'January 3, 2017'}

In [135]:
# Report how many records remain in `data` next to the size of the loaded `dataset`
print(len(data), len(dataset)) 

153100 192544


## Linear Regression Model

In [136]:
# Build the feature vector for one record; 'fit' is one-hot encoded
def feature(datum, max_len=None):
    """Return the list of regression features for a single record.

    Parameters
    ----------
    datum : dict
        Record providing 'review_text', 'size', 'weight' and 'fit'.
    max_len : number, optional
        Divisor used to scale the review length. When omitted, the
        module-level `max_length` is used, so existing `feature(d)` calls
        behave exactly as before.

    Returns
    -------
    list
        [1, scaled review length, size, weight, fit == 'small', fit == 'large']
    """
    if max_len is None:
        # Fall back to the notebook-level value; it is looked up at call time,
        # so it only needs to exist before the first call, not before this def.
        max_len = max_length
    ft = [1]  # constant (offset) term
    ft.append(len(datum['review_text']) / max_len)
    ft.append(datum['size'])
    ft.append(datum['weight'])
    # Two indicator features for 'fit'; any other value yields 0.0 for both
    ft.append(1.0 * (datum['fit'] == 'small'))
    ft.append(1.0 * (datum['fit'] == 'large'))
    return ft

In [137]:
# Regression targets, plus the review lengths used to scale the length feature
ratings = list(map(lambda rec: rec['rating'], data))
lengths = list(map(lambda rec: len(rec['review_text']), data))
# Longest review, in characters
max_length = max(lengths)

In [138]:
# Feature matrix (one row per record) and the matching target values
X = list(map(feature, data))
Y = ratings

In [139]:
# 80/20 split: the first fifth of the rows (in their existing order, no
# shuffling) becomes the test set, the remainder the training set
x_cut = len(X) // 5
y_cut = len(Y) // 5
testX = X[:x_cut]
trainX = X[x_cut:]
testY = Y[:y_cut]
trainY = Y[y_cut:]

In [140]:
# Fit on the training rows and report the mean squared error on the test rows.
# fit_intercept=False: the feature vector already starts with a constant 1.
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(trainX, trainY)
y_pred = model.predict(testX)
# list - ndarray yields the element-wise residuals as an ndarray
residuals = testY - y_pred
sse2 = sum(r ** 2 for r in residuals)
finalMSE = sse2 / len(testY)
finalMSE

0.47256799920300047