## Import Packages

In [1]:
# Import libraries.
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import json
import warnings

# Seed both NumPy's global RNG and the standard-library RNG so that random
# sampling later in the notebook is repeatable from a fresh kernel.
np.random.seed(0)
random.seed(0)
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings such as
# SettingWithCopyWarning — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


## Set Data Paths

In [13]:
# Directory prefixes for the raw input files and the cleaned output files.
# Later cells build file names by plain string concatenation, so each prefix
# must keep its trailing slash.
# NOTE(review): these are absolute paths on one machine — point them at your
# own copy of the data before running.
path_raw_data = '/Users/rathin/Documents/Projects/mids/W207/W207_movies/data/raw_data/'
path_clean_data = '/Users/rathin/Documents/Projects/mids/W207/W207_movies/data/clean_data/'

## Import Data

In [3]:
# Load the raw ratings file into ratings_df.
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk, at the cost of higher memory use while parsing.
ratings_df = pd.read_csv(path_raw_data + "ratings.csv", low_memory=False)

In [4]:
# Preview the first rows to confirm the expected columns were loaded
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [5]:
# Report the (rows, columns) size of the raw ratings table
ratings_df.shape

(26024289, 4)

## Define Preprocessing Functions

In [6]:
def filter_ratings(ratings_df, min_ratings):
    
    '''
    Keep only the ratings made by users who have at least min_ratings ratings
    
    Parameters
    ----------
    ratings_df : A dataframe of ratings data with a userId column
    min_ratings : The minimum number of ratings a user must have to be kept
                  (inclusive)
    
    Returns
    -------
    filtered_df : The rows of ratings_df whose user has >= min_ratings ratings
    '''
    
    # Number of ratings made by each user, indexed by userId
    ratings_per_user = ratings_df['userId'].value_counts()
    
    # Ids of the users that meet the (inclusive) threshold
    users_to_keep = ratings_per_user.index[ratings_per_user >= min_ratings]
    
    # Row mask selecting only ratings made by those users
    keep_row = ratings_df['userId'].isin(users_to_keep)
    
    return ratings_df[keep_row]

In [7]:
def train_dev_test_split(ratings_df, dev_size=1, test_size=1):
    
    '''
    Split user ratings data into train, development and test sets, holding out the 
    last [test_size] ratings per user for test and the [dev_size] ratings just 
    before those for development, following reverse chronological order.
    
    The input dataframe is left unmodified. A user with no more than 
    test_size + dev_size ratings contributes no rows to the training set.
    
    Parameters
    ----------
    ratings_df : A dataframe of ratings data with userId, movieId, rating and 
                 timestamp columns
    dev_size : The number of ratings per user to set aside as development data
    test_size : The number of ratings per user to set aside as test data
    
    Returns
    -------
    train_df : Dataframe of training ratings data
    dev_df : Dataframe of development ratings data
    test_df :  Dataframe of test ratings data
    '''
    
    output_columns = ['userId', 'movieId', 'rating']
    
    # Rank each rating in reverse chronological order within its user
    # (1 = most recent). The rank is kept in a local Series rather than written
    # back as a column, so the caller's dataframe is never mutated.
    # method='first' breaks timestamp ties by row order, giving distinct ranks.
    recency_rank = ratings_df.groupby('userId')['timestamp'] \
                   .rank(method='first', ascending=False)
    
    # Rank boundaries: [1, first_dev_rank) is test, 
    # [first_dev_rank, first_train_rank) is dev, everything older is train
    first_dev_rank = 1 + test_size
    first_train_rank = first_dev_rank + dev_size
    
    is_test = recency_rank.isin(range(1, first_dev_rank))
    is_dev = recency_rank.isin(range(first_dev_rank, first_train_rank))
    is_train = recency_rank >= first_train_rank
    
    test_df = ratings_df.loc[is_test, output_columns]
    dev_df = ratings_df.loc[is_dev, output_columns]
    train_df = ratings_df.loc[is_train, output_columns]
        
    return train_df, dev_df, test_df

In [8]:
def _movies_by_user(df):
    # Map each userId in df to the set of movieIds on that user's rows, in one pass
    movies_by_user = {}
    for user, movie in zip(df['userId'].tolist(), df['movieId'].tolist()):
        movies_by_user.setdefault(user, set()).add(movie)
    return movies_by_user


def create_evalutaion_pools(train_df, test_df, pool_size=100):
    
    '''
    Create pools of movies to evaluate the collaborative filtering models.
    Each pool will contain pool_size movies per user. Each pool includes the 
    user's movies in test_df and randomly selects the rest of the movies from 
    the training catalogue movies that the user has not rated.
    
    Parameters
    ----------
    train_df : A dataframe of the training ratings data
    test_df : A dataframe of the held-out (dev or test) ratings data
    pool_size : The size of each movie pool
    
    Returns
    -------
    evaluation_pools_dict : A dictionary from user to the pool of movies on 
                            which to evaluate the recommender system. Only 
                            users present in train_df get a pool.
    
    Raises
    ------
    ValueError : If a user has more held-out movies than pool_size, or there 
                 are not enough unrated movies to fill that user's pool
    '''
    
    # Index the movies of every user once, instead of re-filtering the full
    # dataframes for each user inside the loop
    train_movies_by_user = _movies_by_user(train_df)
    test_movies_by_user = _movies_by_user(test_df)
    
    # Catalogue of candidate movies, sorted once so that sampling is drawn 
    # from a deterministically ordered list (random.sample no longer accepts 
    # a set on recent Python versions)
    movies = sorted(set(train_df['movieId'].tolist()))
    
    # Instantiate the output dictionary
    evaluation_pools_dict = {}
    
    # Loop through each user in a fixed order so a seeded run is repeatable
    for user in sorted(train_movies_by_user):
        
        # Sets of movies rated by the user in the training and test data
        train_movies = train_movies_by_user[user]
        test_movies = test_movies_by_user.get(user, set())
        
        # Catalogue movies not rated by the user, in catalogue order
        movies_rated = train_movies | test_movies
        movies_not_rated = [movie for movie in movies if movie not in movies_rated]
        
        # The held-out movies take up part of the pool; the rest is sampled
        n_movies_to_choose = pool_size - len(test_movies)
        if not 0 <= n_movies_to_choose <= len(movies_not_rated):
            raise ValueError(
                'Cannot build a pool of size %s for user %s: %s held-out '
                'movies and %s unrated movies available'
                % (pool_size, user, len(test_movies), len(movies_not_rated))
            )
        sample_movies_not_rated = set(random.sample(movies_not_rated, n_movies_to_choose))
        
        # Combine the test movies with the random sample of movies
        evaluation_pools_dict[user] = list(test_movies | sample_movies_not_rated)
    
    return evaluation_pools_dict

## Preprocess Data and Write to Output

In [9]:
# Keep only the ratings of users who have at least 30 ratings
filtered_data = filter_ratings(ratings_df, min_ratings = 30)

# Split with the default sizes: each user's most recent rating becomes the 
# test rating, the second most recent becomes the dev rating, and all older 
# ratings form the training set
train_df, dev_df, test_df = train_dev_test_split(filtered_data)

# Build one evaluation pool per user for dev and one for test (default pool 
# size); both are built against the same training data
dev_evaluation_pools = create_evalutaion_pools(train_df, dev_df)
test_evaluation_pools = create_evalutaion_pools(train_df, test_df)

In [16]:
# Write the train, dev and test splits to CSV files under path_clean_data.
# to_csv is called with its default index setting, so each file also carries
# the dataframe index as an unnamed first column.
train_df.to_csv(path_clean_data + 'ratings_cf_train.csv')
dev_df.to_csv(path_clean_data + 'dev_cf_train.csv')
test_df.to_csv(path_clean_data + 'test_cf_train.csv')

In [19]:
# Write out evaluation_pools to json
def convert(o):
    
    '''
    Fallback serializer for json.dump: turn a NumPy integer scalar into a 
    built-in int
    
    Parameters
    ----------
    o : An object the json encoder could not serialize on its own
    
    Returns
    -------
    The value of o as a built-in int
    
    Raises
    ------
    TypeError : If o is not a NumPy integer scalar
    '''
    
    # np.integer is the common base of every NumPy integer width (int32, 
    # int64, uint8, ...), so no width is rejected by accident
    if isinstance(o, np.integer):
        return int(o)
    raise TypeError('Object of type %s is not JSON serializable' % type(o).__name__)

# Serialise each pool dictionary to its own JSON file under path_clean_data.
# The keys are cast to built-in int before dumping because the `default`
# hook is only consulted for values the encoder cannot serialise, not for
# dictionary keys. Each notebook-level variable is rebound to its int-keyed copy.
with open(path_clean_data + 'dev_evaluation_pools.json', 'w') as fp:
    dev_evaluation_pools = {int(k):v for k,v in dev_evaluation_pools.items()}
    json.dump(dev_evaluation_pools, fp, indent=4, default=convert)
with open(path_clean_data + 'test_evaluation_pools.json', 'w') as fp:
    test_evaluation_pools = {int(k):v for k,v in test_evaluation_pools.items()}
    json.dump(test_evaluation_pools, fp, indent=4, default=convert )