In [1]:
import numpy as np
from numpy.linalg import norm
import math
from urllib.request import urlopen
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from urllib.request import urlopen
import random

import urllib
import scipy.optimize
import random
from collections import defaultdict # Dictionaries with default values
import nltk
from nltk.util import ngrams
import string
from nltk.stem.porter import *
import ast
from nltk.corpus import stopwords

import gzip
from collections import defaultdict

# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [2]:
# Parse each json object
def read_JSON(path):
    for l in gzip.open(path, 'r'):
        yield json.loads(l)

## 1. Dataset Exploratory Analysis

Dataset: https://cseweb.ucsd.edu/~jmcauley/datasets.html#clothing_fit (Rent The Runway)

In [3]:
# Load data
data = []
for d in read_JSON('renttherunway_final_data.json.gz'):
    data.append(d)

# Filter data without a rating
for d in data:
    if not d['rating']:
        data.remove(d)


###  Basic Statistics and Properties

In [4]:
# Size of dataset
print("Size of original data set =", len(data), "samples")

# Number of features
print("Number of features =", len(data[0]), "features")

Size of original data set = 192462 samples
Number of features = 15 features


In [5]:
# Example sample
data[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [6]:
# Split training and test data
random.shuffle(data)
training_data = data[:150000]
test_data = data[150000:]

len(test_data)

42462

### Feature Categories

* Categorical Features: fit, user_id, bust_size, item_id, rented_for, category, body_type, review_date

* Numerical Features: weight, size, height, age

* Ordinal Features: rating, review_text, review_summary

## 2. Predictive Task

Our predictive task is predicting a user's rating on a review of their clothing fit on some given features. 

One naive baseline predictive model we can use is simply determining if the customer considered the clothing item as "fit", which means the item was neither "small" nor "large". This is a good sign the the customer was satisfied, so we simply guess the rating to be "10". Otherwise we guess a random lesser rating from all other possible ratings in the training data. To get these other ratings, we iterated through the training data, and put them in a set.

Another naive baseline we can do is selectively find the most common words that are not stop-words used in each rating category. For example, in a "10" words that appear may be "perfect" or "great". We can manually select qualitative words for each category, and our predictor would simply predict that rating if those words appear in a review. We found these words by finding the top 30 most frequent non-stop-words for each rating category in the training data. For reviews that have overlapping words in multiple rating categories, the higher rating category would take precedence.

For our focal predictors, we decided to train logistic regressors with other features // TODO:

We can validify our predictions by // TODO: Our baselines were in the 50-60% accuracy range, which is decent considering there are 5 possible predictions.

### Baseline Models

In [7]:
# Baseline 1: Predict a rating of 10 if the fit = 'fit', otherwise randomly predict a rating in the range
# of all other rating in the data set

# Find ratings that aren't 10
other_ratings = set()
for d in data:
    if d['rating'] != '10':
        other_ratings.add(d['rating'])
print("Set of other ratings:", other_ratings)


def baseline1(data, y):
    predictions = []
    for d in data:
        if d['fit'] == 'fit':
            predictions.append('10')
        else:
            predictions.append(random.choice(list(other_ratings)))
    return predictions

y_test = [d['rating'] for d in test_data]
predictions = baseline1(test_data, y_test)
correct = [y_test[i] == predictions[i] for i in range(len(y_test))]
print("Baseline Accuracy =", sum(correct) / len(y_test))

Set of other ratings: {'6', '4', '8', '2'}
Baseline Accuracy = 0.5576750977344449


In [8]:
# Baseline 2 preparation: Find most common words that aren't stop words

# Find most common words that aren't stop words
punct = string.punctuation

r2_word_count = defaultdict(int)
r4_word_count = defaultdict(int)
r6_word_count = defaultdict(int)
r8_word_count = defaultdict(int)
r10_word_count = defaultdict(int)

rating_count = defaultdict(int)
for d in data:
    t = d['review_text']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # Non-punct characters
    t = ''.join(t) # Convert back to string
    words = t.strip().split() # Tokenizes
    
    rating_count[d['rating']] += 1
    for w in words:
        if w not in stop_words:
            if d['rating'] == '2':
                r2_word_count[w] += 1
            elif d['rating'] == '4':
                r4_word_count[w] += 1
            elif d['rating'] == '6':
                r6_word_count[w] += 1
            elif d['rating'] == '8':
                r8_word_count[w] += 1
            elif d['rating'] == '10':
                r10_word_count[w] += 1

def top_words(word_count):
    counts = [(word_count[w], w) for w in word_count]
    counts.sort()
    counts.reverse()
    return [w[1] for w in counts[:30]]

print("Most frequent words per category")
word_counts = [(2, r2_word_count), (4, r4_word_count), (6, r6_word_count), (8, r8_word_count), (10, r10_word_count)]
for rating, count in word_counts:
    print(rating, ":", top_words(count), "\n")

rating_count = [(rating_count[w], w) for w in rating_count]
rating_count.sort()
rating_count.reverse()
print("Most common rating: " + rating_count[0][1])

Most frequent words per category
2 : ['dress', 'wear', 'fit', 'size', 'would', 'like', 'didnt', 'even', 'small', 'way', 'look', 'short', 'really', 'big', 'im', 'looked', 'wearing', 'tight', 'long', 'made', 'back', 'large', 'could', 'top', 'material', 'fabric', 'also', 'chest', 'couldnt', 'ordered'] 

4 : ['dress', 'fit', 'wear', 'size', 'would', 'didnt', 'like', 'really', 'small', 'look', 'top', 'wearing', 'short', 'also', 'tight', 'way', 'fabric', 'im', 'back', 'long', 'even', 'big', 'looked', 'waist', 'large', 'made', 'material', 'could', 'pretty', 'great'] 

6 : ['dress', 'fit', 'would', 'size', 'wear', 'didnt', 'little', 'like', 'really', 'top', 'great', 'bit', 'look', 'small', 'short', 'tight', 'also', 'im', 'fabric', 'long', 'wearing', 'waist', 'color', 'pretty', 'back', 'wore', 'beautiful', 'work', 'made', 'big'] 

8 : ['dress', 'fit', 'size', 'would', 'little', 'wear', 'great', 'wore', 'bit', 'comfortable', 'loved', 'compliments', 'perfect', 'really', 'like', 'im', 'didnt', 'lo

In [9]:
# Baseline 2: Predict based on specific words that appear in the review

def baseline2(data, y):
    predictions = []
    for d in data:
        # If no review, just choose most common in training set
        if not d['review_text']: # if "review_text" not in d
            predictions.append('10')
            continue
        t = d['review_text']
        t = t.lower() # lowercase string
        t = [c for c in t if not (c in punct)] # Non-punct characters
        t = ''.join(t) # Convert back to string
        words = t.strip().split() # Tokenizes
        if "perfect" in words:
            predictions.append(str(rating_count[0][1]))
        elif "great" in words:
            predictions.append('8')
        elif "little" in words:
            predictions.append('6')
        elif "didnt" in words:
            predictions.append('4')
        elif "couldnt" in words:
            predictions.append('2')
        else:
            predictions.append(str(rating_count[0][1])) # Most common
    return predictions
            
y_test = [d['rating'] for d in test_data]
predictions = baseline2(test_data, y_test)
correct = [y_test[i] == predictions[i] for i in range(len(y_test))]
print("Baseline Accuracy =", sum(correct) / len(y_test))

Baseline Accuracy = 0.49698554001224626


### Logistic regression

In [10]:
# Try logistic regressor that just takes into account the length of a review

def length_feature(datum):
    return [1, len(datum['review_text'])]

X = [length_feature(d) for d in training_data]    
y = [d['rating'] for d in training_data]
model = linear_model.LogisticRegression(max_iter=150000)
model.fit(X, y)

X_test = [length_feature(d) for d in test_data] 
y_test = [d['rating'] for d in test_data]
predictions = model.predict(X_test)
correct = [y[i] == predictions[i] for i in range(len(y_test))]
print("Logistic Accuracy =", sum(correct) / len(y_test))

Logistic Accuracy = 0.649286420799774


## 3. Justification for proposed model, optimizations, issues, model alternatives

## 4. Literature description

## 5. Results and conclusions

### Dataset citation:

#### Decomposing fit semantics for product size recommendation in metric spaces

Rishabh Misra, Mengting Wan, Julian McAuley

*RecSys, 2018*

http://cseweb.ucsd.edu/~jmcauley/pdfs/recsys18e.pdf