In [1]:
import gzip
import ast
import json
from sklearn import linear_model
import random
import numpy as np

In [2]:
def read_JSON(path):
    """Lazily parse a gzip-compressed JSON-lines file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file holding one JSON document per line.

    Yields
    ------
    object
        The value decoded by ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager closes the gzip handle when the generator is
    # exhausted, closed early, or garbage-collected; a bare gzip.open() in the
    # for-header would leave that to the interpreter.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            yield json.loads(line)

## 1. Dataset Exploratory Analysis

Dataset: https://cseweb.ucsd.edu/~jmcauley/datasets.html#clothing_fit (Rent The Runway)

In [31]:
# Load data
# Materialise every record yielded by read_JSON into a list.
data = list(read_JSON('renttherunway_final_data.json.gz'))

# Filter data without a rating
# Build a new list instead of calling data.remove() while looping over data:
# removing an element during iteration shifts the rest left, so the record
# right after each removed one is never examined and unrated records can
# slip through. A comprehension also avoids remove()'s linear,
# equality-based search on every hit.
data = [d for d in data if d['rating']]


###  Basic Statistics and Properties

In [38]:
# How many records are currently held in `data`
n_samples = len(data)
print('Size of original data set =', n_samples, 'samples')

# How many fields the first record carries
n_features = len(data[0])
print('Number of features =', n_features, 'features')

Size of original data set = 192462 samples
Number of features = 15 features


In [37]:
# Example sample
# Display the first record in `data` so the field names and the raw
# (mostly string-typed) value formats are visible.
data[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

### Feature Categories

* Categorical Features: fit, user_id, bust size, item_id, rented for, category, body type, review_date

* Numerical Features: weight, size, height, age

* Ordinal Features: rating

* Text Features: review_text, review_summary

## 2.

### Predictive Task

Our predictive task is to predict the rating a user gives in a review of a rented clothing item, given some of the review's other features.

### Baseline Model

In [40]:
# Baseline 1: Predict a rating of 10 if fit == 'fit'; otherwise predict a rating chosen
# uniformly at random from the other rating values that occur in the data set

# Distinct rating values seen in the data, excluding the top score '10'
other_ratings = {sample['rating'] for sample in data if sample['rating'] != '10'}
print('Set of other ratings:', other_ratings)


def baseline(data, y, fallback_ratings=None, rng=None):
    """Predict a rating for every sample from its ``fit`` field alone.

    A sample whose ``fit`` value is ``'fit'`` is predicted as ``'10'``; any
    other sample gets a rating drawn uniformly at random from the fallback
    ratings.

    Parameters
    ----------
    data : iterable of dict
        Samples; each must have a ``'fit'`` key.
    y : any
        Unused; kept so existing ``baseline(data, y)`` calls keep working.
    fallback_ratings : iterable, optional
        Ratings to draw from for samples that are not ``'fit'``. Defaults to
        the module-level ``other_ratings`` set.
    rng : random.Random, optional
        Source of randomness. Defaults to the global ``random`` module; pass a
        seeded ``random.Random`` for reproducible predictions.

    Returns
    -------
    list
        One predicted rating per sample, in input order.

    Raises
    ------
    IndexError
        If a non-``'fit'`` sample is encountered while the fallback ratings
        are empty.
    """
    if fallback_ratings is None:
        fallback_ratings = other_ratings

    # Build the candidate list once instead of once per sample, and sort it so
    # the draw order does not depend on set iteration order (which varies
    # between interpreter runs for strings). key=str keeps the sort safe if
    # the ratings are not all of one type.
    pool = sorted(fallback_ratings, key=str)
    chooser = random if rng is None else rng

    return [
        '10' if d['fit'] == 'fit' else chooser.choice(pool)
        for d in data
    ]



# Score the baseline: the share of samples whose predicted rating equals the
# recorded rating.
y = [sample['rating'] for sample in data]
predictions = baseline(data, y)
correct = [truth == guess for truth, guess in zip(y, predictions)]
print('Baseline Accuracy =', sum(correct) / len(y))

Set of other ratings: {'2', '6', '4', '8'}
Baseline Accuracy = 0.5581101723976681


## 3. Justification for proposed model, optimizations, issues, model alternatives

## 4.

## 5.

### Dataset citation:

#### Decomposing fit semantics for product size recommendation in metric spaces

Rishabh Misra, Mengting Wan, Julian McAuley

*RecSys, 2018*

http://cseweb.ucsd.edu/~jmcauley/pdfs/recsys18e.pdf