In [7]:
# predict: source: https://surprise.readthedocs.io/en/stable/FAQ.html
from collections import defaultdict
from tkinter import *
import json
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from tkinter import *

In [8]:
def read_data(path='renttherunway_final_data.json'):
    """Load the rated rental records from a JSON-lines file.

    Each non-blank line of the file is parsed as one JSON object. Only
    records with a truthy ``rating`` are kept (sparse-matrix form: one
    <user, item, rating> triple per kept record).

    Args:
        path: Location of the JSON-lines file. Defaults to the file name
            the notebook originally hard-coded, so ``read_data()`` still
            works unchanged.

    Returns:
        A list of dicts with the keys ``user_id``, ``item_id``, ``rating``
        (converted to ``float``) and ``category``.

    Raises:
        KeyError: if a rated record lacks ``user_id``, ``item_id`` or
            ``category``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.

    Side effects:
        Prints the set of categories seen and the number of kept records.
    """
    rent_list = []
    category_set = set()
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(path, encoding='utf-8') as json_data:
        for line in json_data:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing
                # inside json.loads.
                continue
            rent_entry = json.loads(line)
            # A missing rating is treated like a null one: the record is skipped.
            rating = rent_entry.get('rating')
            if not rating:
                continue
            category = rent_entry['category']
            rent_list.append({
                'user_id': rent_entry['user_id'],
                'item_id': rent_entry['item_id'],
                'rating': float(rating),
                # kept so the records can later be filtered by category
                'category': category,
            })
            category_set.add(category)
    print("Category:")
    print(category_set)
    print(len(rent_list))
    return rent_list

In [9]:
def filter_needs(rent_list, category):
    """Select the records of one category as a <uid, iid, rating> DataFrame.

    Args:
        rent_list: Iterable of dicts, each with the keys ``user_id``,
            ``item_id``, ``rating`` and ``category``.
        category: Category value a record must equal to be kept.

    Returns:
        A DataFrame with exactly the columns ``user_id``, ``item_id`` and
        ``rating`` (in that order), one row per matching record and a
        0..n-1 index. When nothing matches, the frame is empty but still
        has those three columns.
    """
    columns = ['user_id', 'item_id', 'rating']
    # Collect the rows first and build the frame once: growing a DataFrame
    # row by row is quadratic, and DataFrame.append no longer exists in
    # pandas 2.x.
    matching_rows = [
        {column: entry[column] for column in columns}
        for entry in rent_list
        if entry['category'] == category
    ]
    # Passing `columns` keeps the schema stable even for an empty selection.
    return pd.DataFrame(matching_rows, columns=columns)

In [10]:
def grid_search(my_df):
    """Tune KNNWithMeans by grid search, refit it, and predict unseen pairs.

    Args:
        my_df: DataFrame handed to ``Dataset.load_from_df`` together with a
            reader whose rating scale is (1, 10).

    Returns:
        A tuple ``(algo, trainvalset, predictions)``: the estimator with the
        best RMSE refitted on the full trainset, that trainset, and the
        result of ``algo.test`` on the trainset's anti-testset.

    Side effects:
        Prints the best RMSE score, the parameters that achieved it, and the
        chosen estimator object.
    """
    rating_reader = Reader(rating_scale=(1, 10))
    dataset = Dataset.load_from_df(my_df, rating_reader)

    # sim_options pick how similarities are computed; per the original
    # notes, pairs with fewer than min_support common items get sim(u, v)=0.
    similarity_grid = {
        'name': ['pearson'],
        'min_support': [1, 3, 5, 10],
        'user_based': [True],
    }
    # KNNWithMeans fixes the prediction formula; per the original notes,
    # min_k is the neighbour count below which the prediction falls back
    # to a mean.
    search_space = {
        'k': [1, 3, 5, 10, 15, 20],
        'min_k': [1, 3, 5, 10],
        'sim_options': similarity_grid,
    }

    # The dataset is train+validation; the 5-fold search does the splitting.
    searcher = GridSearchCV(KNNWithMeans, search_space, measures=['rmse', 'mae'], cv=5)
    searcher.fit(dataset)
    for report in (searcher.best_score, searcher.best_params):
        print(report['rmse'])

    best_algo = searcher.best_estimator['rmse']
    print(best_algo)

    # Refit the winning configuration on every known rating.
    full_trainset = dataset.build_full_trainset()
    best_algo.fit(full_trainset)

    # Anti-testset: (user, item) pairs that are NOT in the training set,
    # i.e. known users and items whose rating is unknown.
    unseen_pairs = full_trainset.build_anti_testset()
    unseen_predictions = best_algo.test(unseen_pairs)
    return best_algo, full_trainset, unseen_predictions

In [11]:
def get_top_n(my_df, n=10):
    '''Build the top-N recommendation list for every user.

    Runs ``grid_search(my_df)`` and ranks the predictions it returns.

    Args:
        my_df: DataFrame forwarded unchanged to ``grid_search``.
        n: Maximum number of recommendations kept per user.

    Returns:
        A defaultdict(list) keyed by raw user id; each value is a list of
        at most n ``(raw item id, rating estimation)`` tuples, highest
        estimate first.
    '''
    _algo, _trainset, predictions = grid_search(my_df)

    # Group the estimates by user; each prediction is a 5-field record.
    per_user = defaultdict(list)
    for user, item, _actual, estimate, _details in predictions:
        per_user[user].append((item, estimate))

    # Rank each user's candidates by estimate (descending) and keep the
    # first n of them.
    for user in list(per_user):
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]
    return per_user

In [12]:
def main(category='jacket', user_id='4040', n=10):
    """Run the pipeline: load, filter by category, recommend, print.

    Args:
        category: Category passed to ``filter_needs``. Defaults to the
            value the notebook originally hard-coded.
        user_id: Raw user id whose recommendations are printed. Defaults
            to the id the notebook originally hard-coded.
        n: Number of recommendations requested from ``get_top_n``.

    Side effects:
        Prints the filtered DataFrame and the recommendation list for
        ``user_id`` (an empty list when that user has no predictions).
    """
    rent_list = read_data()

    my_df = filter_needs(rent_list, category)
    print(my_df)
    top_n = get_top_n(my_df, n=n)
    # .get prints [] for an unknown user without adding a key to the result.
    print(top_n.get(user_id, []))


main()

Category:
{'tank', 'trouser', 'blouson', 'kimono', 'henley', 'crewneck', 'dress', 'shirtdress', 'jacket', 'cape', 'jogger', 'hoodie', 'gown', 'overcoat', 'down', 'tunic', 'for', 'trench', 'blazer', 'knit', 'midi', 'maxi', 'overalls', 'tee', 'jeans', 'sweater', 'leggings', 'poncho', 'peacoat', 'combo', 'trousers', 't-shirt', 'blouse', 'ballgown', 'skirt', 'parka', 'buttondown', 'pullover', 'frock', 'caftan', 'coat', 'culotte', 'culottes', 'pant', 'sweatpants', 'skirts', 'suit', 'pants', 'tight', 'duster', 'vest', 'turtleneck', 'sheath', 'top', 'romper', 'skort', 'cardigan', 'legging', 'kaftan', 'jumpsuit', 'mini', 'sweatshirt', 'sweatershirt', 'shirt', 'print', 'cami', 'bomber', 'shift'}
192462
     user_id  item_id  rating
0     214108  2872079    10.0
1     623308  2529552    10.0
2     755121  2158338    10.0
3     911002  2578061    10.0
4     907296  2248191     8.0
5     132953  2273798    10.0
6     933202  2045492     8.0
7     266127  2096293     8.0
8     240454  2015751    10

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...

Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.