In [1]:
import gzip
import ast
import json
from sklearn import linear_model
import random
import numpy as np

In [2]:
# Parse each json object
def read_JSON(path):
    for l in gzip.open(path, 'r'):
        yield json.loads(l)

In [3]:
# Load data
data = []
for d in read_JSON('renttherunway_final_data.json.gz'):
    data.append(d)
# Filter data without a rating
for d in data:
    if not d['rating']:
        data.remove(d)
data[1]

{'fit': 'fit',
 'user_id': '273551',
 'bust size': '34b',
 'item_id': '153475',
 'weight': '132lbs',
 'rating': '10',
 'rented for': 'other',
 'review_text': 'I rented this dress for a photo shoot. The theme was "Hollywood Glam and Big Beautiful Hats". The dress was very comfortable and easy to move around in. It is definitely on my list to rent again for another formal event. ',
 'body type': 'straight & narrow',
 'review_summary': 'I felt so glamourous!!!',
 'category': 'gown',
 'height': '5\' 6"',
 'size': 12,
 'age': '36',
 'review_date': 'June 18, 2013'}

## 2.

In [4]:
# Do a baseline prediction. If an item fits, predict a rating of 10.
# Else predict a random value from [2, 4, 6, 8], as the ratings are on an interval of 2
bad_fit_ratings = [2, 4, 6, 8]
correct = 0
for d in data:
    if d['fit'] == 'fit' and d['rating'] == str(10):
        correct += 1
    else:
        rating = random.choice(bad_fit_ratings)
        if d['rating'] == str(rating):
            correct += 1
correct / len(data)

0.6124481715871185

In [7]:
# Find some distinct features for encoding
types_rented_for = set()
types_ratings = set()
types_fits = set()
for d in data:
    types_ratings.add(d['rating'])
    types_fits.add(d['fit'])
#     types_rented_for.add(d["rented for"])
types_fits, types_ratings

({'fit', 'large', 'small'}, {'10', '2', '4', '6', '8'})

### Dataset citation:

#### Decomposing fit semantics for product size recommendation in metric spaces

Rishabh Misra, Mengting Wan, Julian McAuley

*RecSys, 2018*

http://cseweb.ucsd.edu/~jmcauley/pdfs/recsys18e.pdf