In [24]:
import json
import ast
import gzip
import random
from collections import defaultdict

def readCSV(path):
    f = gzip.open(path, 'rt', encoding="utf8")
    f.readline()
    for l in f:
        yield ast.literal_eval(l)

In [46]:
# Build the whole data set

In [47]:
path = '/Users/louis/Downloads/renttherunway_final_data.json.gz'
dataset = []
f = gzip.open(path, 'rt', encoding="utf8")
for i in range(10000):
    l = f.readline()
    d = ast.literal_eval(l)
    dataset.append(d)

In [48]:
# Take a look

In [49]:
d = dataset[0]
print(d.keys())
print(d)

dict_keys(['fit', 'user_id', 'bust size', 'item_id', 'weight', 'rating', 'rented for', 'review_text', 'body type', 'review_summary', 'category', 'height', 'size', 'age', 'review_date'])
{'fit': 'fit', 'user_id': '420272', 'bust size': '34d', 'item_id': '2260466', 'weight': '137lbs', 'rating': '10', 'rented for': 'vacation', 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.", 'body type': 'hourglass', 'review_summary': 'So many compliments!', 'category': 'romper', 'height': '5\' 8"', 'size': 14, 'age': '28', 'review_date': 'April 20, 2016'}


In [50]:
# Calculate the average of age and weight in case they don't exist in keys of data

In [51]:
sumWeight = 0
sumAge = 0
countWeight = 0
countAge = 0
for d in dataset:
    if 'weight' in d.keys():
        countWeight += 1
        sumWeight += int(d['weight'][:-3])
    if 'age' in d.keys():
        countAge += 1
        sumAge += int(d['age'])
avgWeight = int(sumWeight / countWeight)
avgAge = int(sumAge / countAge)
print(avgWeight,avgAge)

137 33


In [52]:
# Construct training data and validation data

In [53]:
dataTrain = dataset[:8000]
dataValid = dataset[8000:]

In [54]:
# Generate feature

In [55]:
def feature(d): 
    # print(d)
    x = [1]
    
    size = int(d['size'])
    x.append(size)
    
    year = int(d['review_date'].split(" ")[-1])
    x.append(year)
    
    rating = int(d['rating'])
    x.append(rating)
        
    lengthOfReview = len(d['review_text'])
    x.append(lengthOfReview)
    
    lengthOfReviewSummary = len(d['review_summary'])
    x.append(lengthOfReviewSummary)
    
    if 'weight' in d.keys():
        weight = int(d['weight'][:-3])
        x.append(weight)
    else:
        x.append(avgWeight)
    if 'age' in d.keys():
        age = int(d['age'])
        x.append(age)
    else:
        x.append(avgAge)
    return x

In [56]:
# Construct the features and label

In [57]:
XTrain = []
yTrain = []
for d in dataTrain:
    XTrain.append(feature(d))
    yTrain.append(d['fit'] == 'fit')
XValid = []
yValid = []
for d in dataValid:
    XValid.append(feature(d))
    yValid.append(d['fit'] == 'fit')

In [58]:
# Take a look

In [59]:
XTrain[:10]

[[1, 14, 2016, 10, 221, 20, 137, 28],
 [1, 12, 2013, 10, 221, 23, 132, 36],
 [1, 4, 2015, 10, 198, 88, 137, 116],
 [1, 8, 2014, 8, 465, 48, 135, 34],
 [1, 12, 2016, 10, 559, 31, 145, 27],
 [1, 8, 2016, 8, 195, 31, 138, 45],
 [1, 4, 2017, 10, 158, 31, 112, 27],
 [1, 8, 2013, 10, 52, 63, 118, 65],
 [1, 21, 2016, 10, 254, 26, 137, 27],
 [1, 1, 2016, 10, 708, 78, 114, 33]]

In [None]:
# Fit the model

In [60]:
from sklearn import linear_model
mod = linear_model.LogisticRegression(C = 1) # class_weight = 'balanced') 
mod.fit(XTrain, yTrain)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [61]:
# Calculate the accuracy

In [62]:
train_predictions = mod.predict(XTrain)
valid_predictions = mod.predict(XValid)

accuracyOnTrain = sum(train_predictions == yTrain) / len(yTrain)
accuracyOnValid = sum(valid_predictions == yValid) / len(yValid) 

print(accuracyOnTrain,accuracyOnValid)

0.748375 0.76
