### Step 1: Import packages

In [1]:
import pandas as pd
import numpy as np

### Step 2: Define working directories

In [2]:
path_raw_data = 'C:/users/lbros/documents/mids/w207/final_project/raw_data/'
path_clean_data = 'C:/users/lbros/documents/mids/w207/final_project/clean_data/'

### Step 3: Read data

#### Read raw ratings data

In [3]:
# load ratings dataframe
ratings_df = pd.read_csv(path_raw_data + 'ratings.csv')

In [4]:
# print dataframe shape
ratings_df.shape

(26024289, 4)

In [5]:
# print dataframe columns
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [6]:
# inspect first five rows
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


#### Read raw links data

In [7]:
# load links dataframe
links_df = pd.read_csv(path_raw_data + 'links.csv')

In [8]:
# print dataframe shape
links_df.shape

(45843, 3)

In [9]:
# print dataframe columns
links_df.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [10]:
# inspect first five rows
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


#### Read clean movies data

In [11]:
# load movies dataframe
movies_df = pd.read_csv(path_clean_data + 'movies_temp.csv')

In [12]:
# print dataframe shape
movies_df.shape

(45464, 180)

In [13]:
# print dataframe columns
movies_df.columns

Index(['Unnamed: 0', 'id', 'imdb_id', 'adult', 'belongs_to_collection',
       'budget', 'originally_english', 'overview', 'popularity',
       'production_companies',
       ...
       'zu', 'canceled', 'in-production', 'planned', 'post-production',
       'released', 'rumored', 'cast_names', 'crew_names', 'description'],
      dtype='object', length=180)

In [14]:
# inspect first five rows
movies_df.head()

Unnamed: 0.1,Unnamed: 0,id,imdb_id,adult,belongs_to_collection,budget,originally_english,overview,popularity,production_companies,...,zu,canceled,in-production,planned,post-production,released,rumored,cast_names,crew_names,description
0,0,862,tt0114709,0,1,30000000,1,led woodi andi toy live happili room andi birt...,21.946943,pixar animation studios,...,0,0,0,0,0,1,0,johnratzenberger rleeermey donrickles erikvond...,annmrockwell jeffpratt ashbrannon mickiemcgowa...,boy next door boy friendship friend rivalri to...
1,1,8844,tt0113497,0,0,65000000,1,sibl judi peter discov enchant board game open...,17.015539,interscope communications teitler film tristar...,...,0,0,0,0,0,1,0,bonniehunt laurabellbundy jameshandy gillianba...,gregtaylor thomaseackerman williamteitler robe...,base children book board game giant insect new...
2,2,15602,tt0113228,0,1,0,1,famili wed reignit ancient feud nextdoor neigh...,11.7129,warner bros. lancaster gate,...,0,0,0,0,0,1,0,burgessmeredith annmargret darylhannah jacklem...,jackkeller howarddeutch markstevenjohnson,best friend old men duringcreditssting fish
3,3,31357,tt0114885,0,0,16000000,1,cheat mistreat step women hold breath wait elu...,3.859495,twentieth century fox film corporation,...,0,0,0,0,0,1,0,lamontjohnson angelabassett lelarochon whitney...,deborahschindler ezraswerdlow forestwhitaker c...,interraci relationship chick flick divorc sing...
4,4,11862,tt0113041,0,1,0,1,georg bank recov daughter wed receiv news she ...,8.387519,touchstone pictures sandollar productions,...,0,0,0,0,0,1,0,janeadams kieranculkin stevemartin lorialan ki...,adambernardi nancymeyers elliotdavis alansilve...,gynecologist daughter pregnanc midlif crisi ba...


### Step 4: Pre-process ratings data

#### Add imdbId field to ratings_df

In [15]:
# set movieId as index on links_df
links_df.set_index('movieId', inplace=True)

In [16]:
# set userId and movieId as indexes on ratings_df
ratings_df.set_index(['userId', 'movieId'], inplace=True)

In [17]:
# insert imdbId and tmdbId fields on ratings_df left joining with links_df on movieId index
ratings_df = ratings_df.join(links_df, on='movieId', how='left').reset_index()

In [18]:
# drop tmdbId
ratings_df.drop('tmdbId', axis=1, inplace=True)

In [19]:
# print dataframe shape
ratings_df.shape

(26024289, 5)

In [20]:
# print dataframe columns
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'imdbId'], dtype='object')

In [21]:
# inspect first five rows
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId
0,1,110,1.0,1425941529,112573
1,1,147,4.5,1425942435,112461
2,1,858,5.0,1425941523,68646
3,1,1221,5.0,1425941546,71562
4,1,1246,5.0,1425941556,97165


#### Binarize ratings

Define **binarize_ratings** function

In [22]:
def binarize_ratings(ratings_df, threshold=4):
    
    '''Binarize ratings:
    - 1 for ratings equal or above threshold
    - 0 for ratings below threshold
    params: ratings_df, threshold
    return: ratings_df with binarized ratings
    '''
    
    # binarize ratings field
    ratings_df['rating'] = ratings_df['rating'].apply(lambda x: 1 if x >= 4 else 0)
    
    return ratings_df

The ratings frequency are **equally split between <4 and >=4**.

In [23]:
# apply binarize function to ratings_df
ratings_df = binarize_ratings(ratings_df)

In [24]:
# inspect binarized ratings frequency
ratings_df.rating.value_counts(normalize=True)

0    0.501168
1    0.498832
Name: rating, dtype: float64

### Step 5: Evaluate keys to join ratings with movies data

#### Evaluate movieId

The movieId **do not seem to be an effective key** to join the datasets.

In [25]:
rated_movies = ratings_df.movieId.unique()
print('There are {} unique movies rated in the ratings_df.'.format(len(rated_movies)))

There are 45115 unique movies rated in the ratings_df.


In [26]:
repr_movies = movies_df.id.unique()
print('There are {} unique movies represented in the movies_df.'.format(len(repr_movies)))

There are 45430 unique movies represented in the movies_df.


In [27]:
rated_not_in_repr = rated_movies[~np.isin(rated_movies, repr_movies)]
print('There are {} movies in the ratings_df not in the movies_df.'.format(len(rated_not_in_repr)))
rated_not_in_repr

There are 37550 movies in the ratings_df not in the movies_df.


array([  1221,   2918,   4878, ..., 165649, 171051, 171221], dtype=int64)

In [28]:
repr_not_in_rated = repr_movies[~np.isin(repr_movies, rated_movies)]
print('There are {} movies in the movies_df not in the ratings_df.'.format(len(repr_not_in_rated)))
repr_not_in_rated

There are 37865 movies in the movies_df not in the ratings_df.


array([ 15602,  31357,  11862, ...,  67758, 227506, 461257], dtype=int64)

#### Try with imdb_id instead

##### Inspect and clean imdb_id field in movies_df

In [29]:
# no missing values on movies_df imdbd_id...
print('Missing values: ', movies_df['imdb_id'].isna().sum())
# ...but 17 unknown values
unknown = movies_df['imdb_id'][~movies_df['imdb_id'].str.contains('^tt')]
print('Unknow values: ', len(unknown))
unknown

Missing values:  0
Unknow values:  17


8966     unkown
13758    unkown
13822    unkown
17383    unkown
17511    unkown
18960    unkown
19323    unkown
20805    unkown
20936    unkown
21915    unkown
22986    unkown
23743    unkown
33752    unkown
36952    unkown
40807    unkown
41830    unkown
45068    unkown
Name: imdb_id, dtype: object

In [30]:
def clean_imdb_id(movies_df):
    """Drop rows with unknow values from df,
    remove 'tt' and cast into numerical values"""
    
    # drop lines with unknown values
    movies_df = movies_df[movies_df['imdb_id'].str.contains('^tt')]
    
    # convert to numeric format to allow join with ratings_df
    movies_df['imdb_id'] = movies_df['imdb_id'].apply(lambda x: x[3:]).astype('int64')
    
    return movies_df

In [31]:
# apply this function to convert the imdb_id field to numbers
movies_df = clean_imdb_id(movies_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


##### Evaluate imdb_id

In [32]:
rated_movies = ratings_df.imdbId.unique()
print('There are {} unique movies rated in the ratings_df.'.format(len(rated_movies)), '\n')
print('The minimum imdb_id is {}.'.format(rated_movies.min()))
print('The maximum imbd_id is {}.'.format(rated_movies.max()))
print('There are {} imdb_ids above 1,000,000.'.format((rated_movies>1000000).sum()))

There are 45115 unique movies rated in the ratings_df. 

The minimum imdb_id is 1.
The maximum imbd_id is 7158814.
There are 15431 imdb_ids above 1,000,000.


In [33]:
repr_movies = movies_df.imdb_id.unique()
print('There are {} unique movies represented in the movies_df.'.format(len(repr_movies)), '\n')
print('The minimum imdb_id is {}.'.format(repr_movies.min()))
print('The maximum imbd_id is {}.'.format(repr_movies.max()))
print('There are {} imdb_ids above 1,000,000.'.format((repr_movies>1000000).sum()))

There are 44762 unique movies represented in the movies_df. 

The minimum imdb_id is 1.
The maximum imbd_id is 999995.
There are 0 imdb_ids above 1,000,000.


In [34]:
rated_not_in_repr = rated_movies[~np.isin(rated_movies, repr_movies)]
print('There are {} movies in the ratings_df not in the movies_df.'.format(len(rated_not_in_repr)), '\n')
print('{} of them are imdb_ids above 1,000,000.'.format((rated_not_in_repr>1000000).sum()))
rated_not_in_repr

There are 15716 movies in the ratings_df not in the movies_df. 

15431 of them are imdb_ids above 1,000,000.


array([1392170, 1515091, 1645080, ..., 1119178, 3731196, 4287348],
      dtype=int64)

In [35]:
repr_not_in_rated = repr_movies[~np.isin(repr_movies, rated_movies)]
print('There are {} movies in the movies_df not in the ratings_df.'.format(len(repr_not_in_rated)))
repr_not_in_rated

There are 15363 movies in the movies_df not in the ratings_df.


array([333373, 684935, 250194, ..., 209470,  28550, 980792], dtype=int64)

#### Drop non-matching movies from both datasets

In [36]:
def drop_non_matching_movies(ratings_df, movies_df):
    """Drop rows with non matching movies"""
    
    # unique movies in ratings_df
    rated_movies = ratings_df.imdbId.unique()
    # unique movies in movies_df
    repr_movies = movies_df.imdb_id.unique()
    # find the intersection
    commom_movies = np.intersect1d(rated_movies, repr_movies)
    # filter ratings_df with movies in the intersection
    ratings_df = ratings_df[np.isin(ratings_df.imdbId.values, commom_movies)]
    # filter movies_df with movies in the intersection
    movies_df = movies_df[np.isin(movies_df.imdb_id.values, commom_movies)]
    
    return ratings_df, movies_df

In [37]:
# apply this function to drop non-matching movies from datasets
ratings_df, movies_df = drop_non_matching_movies(ratings_df, movies_df)

We dropped **2,066,809 rows** (out of 26,024,289) in the **ratings_df**.

In [38]:
print('There are now {} records in the ratings_df.'.format(ratings_df.shape[0]))

There are now 23957480 records in the ratings_df.


In [39]:
print('There are now {} unique movies rated in the ratings_df.'.format(len(ratings_df.imdbId.unique())))

There are now 29399 unique movies rated in the ratings_df.


We dropped **15,500 rows** (out of 45,464) in the **movies_df**.

In [40]:
print('There are now {} records in the movies_df.'.format(movies_df.shape[0]))

There are now 29964 records in the movies_df.


In [41]:
print('There are now {} unique movies rated in the movies_df.'.format(len(movies_df.imdb_id.unique())))

There are now 29399 unique movies rated in the movies_df.


### Step 6: Filter users with a minimum number of ratings

Define **filter_users** function

In [42]:
def filter_users(ratings_df, min_ratings=30):
    
    '''Filter users with num_ratings equal or above [min_ratings]
    params: ratings_df, min_rating
    return: filtered_ratings_df
    '''
    
    # filter userIDs with count of ratings equal or above min_ratings
    filtered_ratings_df = ratings_df.groupby('userId', sort=False).filter(lambda x: len(x) >= min_ratings)
    
    return filtered_ratings_df

We filtered users with **less than 30 ratings** resulting in a filtered dataset with **131,880 users** (out of 270,177) and **29,232 movies** (out of 29,399).

In [43]:
filtered_ratings_df = filter_users(ratings_df, min_ratings=30)
print('Filtered number of users:',filtered_ratings_df.userId.unique().shape[0])
print('Total number of users:', ratings_df.userId.unique().shape[0])

Filtered number of users: 131880
Total number of users: 270177


In [44]:
print('Filtered number of movies:',filtered_ratings_df.imdbId.unique().shape[0])
print('Total number of movies:',ratings_df.imdbId.unique().shape[0])

Filtered number of movies: 29232
Total number of movies: 29399


### Step 7: Save clean and filtered datasets to a csv file

In [None]:
# write this final dataset to a csv file
filtered_ratings_df.to_csv('C:/Users/lbros/Documents/MIDS/W207/final_project/clean_data/ratings_final.csv')

In [None]:
# write this final dataset to a csv file
movies_df.to_csv('C:/Users/lbros/Documents/MIDS/W207/final_project/clean_data/movies_final.csv')