### Import packages

In [1]:
import pandas as pd
import numpy as np

### Define working directories

In [2]:
# project folder on the author's machine; both data directories live under it
# (the trailing slashes are kept because file names are appended with '+')
_project_dir = 'C:/users/lbros/documents/mids/w207/final_project/'
path_raw_data = _project_dir + 'raw_data/'
path_clean_data = _project_dir + 'clean_data/'

### Read ratings data

In [3]:
# load ratings dataframe
ratings_df = pd.read_csv(path_raw_data + 'ratings.csv')

In [4]:
# print dataframe shape
ratings_df.shape

(26024289, 4)

In [5]:
# print dataframe columns
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [6]:
# inspect first five rows
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


### Read links data

In [7]:
# load links dataframe
links_df = pd.read_csv(path_raw_data + 'links.csv')

In [8]:
# print dataframe shape
links_df.shape

(45843, 3)

In [9]:
# print dataframe columns
links_df.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [10]:
# inspect first five rows
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


### Add imdbId and tmdbId fields to ratings_df

In [11]:
# index links_df by movieId (rebinding the name instead of mutating in place)
links_df = links_df.set_index('movieId')

In [12]:
# index ratings_df by the (userId, movieId) pair (rebinding the name instead of mutating in place)
ratings_df = ratings_df.set_index(['userId', 'movieId'])

In [13]:
# left join: match the movieId index level of ratings_df against the movieId index of links_df,
# which adds the imdbId and tmdbId columns while keeping every rating row;
# reset_index then turns userId and movieId back into regular columns
ratings_df = ratings_df.join(links_df, on='movieId', how='left').reset_index()

In [14]:
# print dataframe shape
ratings_df.shape

(26024289, 6)

In [15]:
# print dataframe columns
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'imdbId', 'tmdbId'], dtype='object')

In [16]:
# inspect first five rows
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,112573,197.0
1,1,147,4.5,1425942435,112461,10474.0
2,1,858,5.0,1425941523,68646,238.0
3,1,1221,5.0,1425941546,71562,240.0
4,1,1246,5.0,1425941556,97165,207.0


In [17]:
# count missing values per column: every rating has an imdbId, but 13,503 ratings have no tmdbId
ratings_df.isna().sum()

userId           0
movieId          0
rating           0
timestamp        0
imdbId           0
tmdbId       13503
dtype: int64

In [18]:
# the ratings without a tmdbId belong to 213 distinct movieIds
# imdbId has no missing values, so let's preferably use it to link ratings data with movies data
ratings_df.loc[ratings_df['tmdbId'].isna(), 'movieId'].nunique(dropna=False)

213

### Filter users with a minimum number of ratings

Define **filter_users** function

In [19]:
def filter_users(ratings_df, min_ratings=30):
    
    '''Keep only the ratings of users who have at least [min_ratings] ratings.
    params:
        ratings_df: dataframe with one row per rating and a 'userId' column
        min_ratings: minimum number of rows a userId must have to be kept
    return: filtered_ratings_df, the rows of ratings_df that belong to the kept
        users, in their original order and with their original index labels
    '''
    
    # number of ratings made by each row's user, aligned with ratings_df's index;
    # a vectorized group size avoids calling a Python lambda once per user
    ratings_per_user = ratings_df.groupby('userId', sort=False)['userId'].transform('size')
    
    # keep rows whose user has a count of ratings equal or above min_ratings
    filtered_ratings_df = ratings_df[ratings_per_user >= min_ratings]
    
    return filtered_ratings_df

We kept only users with **at least 30 ratings**, resulting in a filtered dataset with **136,362 users** (out of 270,896) and **44,975 movies** (out of 45,115).

In [20]:
# drop the ratings of users with fewer than 30 ratings, then compare user counts
filtered_ratings_df = filter_users(ratings_df, min_ratings=30)
print('Filtered number of users:', filtered_ratings_df['userId'].nunique(dropna=False))
print('Total number of users:', ratings_df['userId'].nunique(dropna=False))

Filtered number of users: 136362
Total number of users: 270896


In [21]:
# compare the number of distinct movies (by imdbId) before and after filtering users
print('Filtered number of movies:', filtered_ratings_df['imdbId'].nunique(dropna=False))
print('Total number of movies:', ratings_df['imdbId'].nunique(dropna=False))

Filtered number of movies: 44975
Total number of movies: 45115


### Binarize ratings

Define **binarize_ratings** function

In [22]:
def binarize_ratings(ratings_df, threshold=4):
    
    '''Binarize ratings:
    - 1 for ratings equal or above threshold
    - 0 for ratings below threshold
    params:
        ratings_df: dataframe with a numeric 'rating' column
        threshold: lowest rating that is mapped to 1
    return: new dataframe with binarized ratings (int64); the dataframe
        passed in is left unchanged, so the call can safely be re-run
    '''
    
    # compare against the threshold argument (it used to be hard-coded to 4);
    # the vectorized comparison replaces a per-row Python lambda
    binary_rating = (ratings_df['rating'] >= threshold).astype('int64')
    
    # assign builds a new dataframe instead of overwriting the caller's column
    return ratings_df.assign(rating=binary_rating)

The binarized ratings are **almost evenly split**: about 50.4% are below 4 (class 0) and about 49.6% are 4 or above (class 1).

In [23]:
# binarize the filtered ratings using the function's default threshold of 4
bin_filtered_ratings_df = binarize_ratings(filtered_ratings_df)

In [28]:
# share of ratings in each binary class (0 = below threshold, 1 = equal or above)
bin_filtered_ratings_df.rating.value_counts(normalize=True)

0    0.503753
1    0.496247
Name: rating, dtype: float64

### Save clean ratings dataset to a csv file

In [31]:
# write this final cleaned dataset to ratings_final.csv in the clean-data directory,
# reusing path_clean_data instead of repeating the absolute path
# (to_csv also writes the dataframe index as the first, unnamed column)
bin_filtered_ratings_df.to_csv(path_clean_data + 'ratings_final.csv')