# Filter movies for training
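As seen in the exploratory-analysis notebook, most movies in the original dataset have fewer than 100 ratings. This notebook filters those movies out (18,732 of the 27,278 movies are removed, leaving 8,546), keeps only the ratings that belong to the remaining movies, and saves them to `data/ratings_filtered.csv`.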

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
# Import from the public `IPython.display` namespace: pulling `display` and
# `clear_output` out of `IPython.core.display` has been deprecated since
# IPython 7.14 and no longer works on recent IPython releases.
from IPython.display import display, HTML, clear_output
# Widen the notebook container to 80% of the browser window so wide tables fit.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
# Raise pandas' display limits so long/wide frames and long strings are not truncated.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 999)

### Import custom module/class

In [3]:
# Make the sibling `src` directory importable (once), then drop the temporary
# name so it does not linger in the notebook namespace.
src_dir = os.path.join(os.getcwd(), '..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
del src_dir

In [4]:
from MovieRecommender import MovieFilter

### Load data

In [5]:
# Data files live in ../data relative to the notebook's working directory.
# `cwd` is kept as a notebook-level name because the ratings load reuses it.
cwd = os.getcwd()
movies_path = os.path.join(cwd, "..", "data", "movies.csv")
movies = pd.read_csv(movies_path)

In [6]:
# Dimensions of the raw movies table as (rows, columns).
movies.shape

(27278, 3)

In [7]:
# Preview the first rows to check the columns loaded from movies.csv.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Load the raw ratings; relies on `cwd` set when the movies were loaded.
ratings_path = os.path.join(cwd, "..", "data", "ratings.csv")
ratings = pd.read_csv(ratings_path)

In [9]:
# Preview the first rows to check the columns loaded from ratings.csv.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


## Filter movies

In [10]:
# Count the non-null ratings each movie received; the result is a Series
# indexed by movieId and named 'rating'.
ratings_by_movie = ratings.groupby('movieId')
freq = ratings_by_movie['rating'].count()

In [11]:
# Spot-check the per-movie rating counts (Series indexed by movieId).
freq.head()

movieId
1    49695
2    22243
3    12735
4     2756
5    12161
Name: rating, dtype: int64

In [12]:
# Wrap the raw movies table in the project's MovieFilter helper (from src/MovieRecommender).
mf = MovieFilter(movies)

In [13]:
# Filter movies by rating count using the per-movie frequencies computed above.
# Presumably drops movies with fewer than `threshold` ratings (including movies
# absent from `freq`) -- confirm the exact comparison in MovieFilter.
# The call prints a before/after summary; the filtered table is read from `mf.movies`.
mf.filter_rating_freq(freq, threshold=100)

filter_rating_freq filtered out 18732 movies. Num before: 27278. Num after: 8546


In [14]:
# Check that the original `movies` frame is unchanged by the filter (same shape
# as before); the filtered table is exposed separately as `mf.movies`.
movies.shape

(27278, 3)

In [18]:
# Grab the filtered movie table kept on the MovieFilter instance
# ("red" presumably stands for "reduced").
red_movies = mf.movies

In [19]:
# The row count should match the "Num after" figure printed by filter_rating_freq.
red_movies.shape

(8546, 3)

In [20]:
# Preview the filtered movies table.
red_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
# Re-display the raw ratings (same preview as right after loading) before they are filtered.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [21]:
# Size of the unfiltered ratings table, for comparison with the filtered one.
ratings.shape

(20000263, 4)

In [25]:
# Keep only the ratings whose movie survived the frequency filter.
# An inner join against the single `movieId` column acts as a row filter
# without adding columns. `validate='many_to_one'` makes pandas raise a
# MergeError if `red_movies` ever contains duplicate movieIds, which would
# otherwise silently duplicate rating rows in the output.
red_ratings_data = pd.merge(
    ratings,
    red_movies[['movieId']],
    on='movieId',
    how='inner',
    validate='many_to_one',
)

In [26]:
# Preview the filtered ratings.
red_ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,5,2,3.0,851527569
2,13,2,3.0,849082742
3,29,2,3.0,835562174
4,34,2,3.0,846509384


In [33]:
# Rows remaining after dropping ratings of filtered-out movies (compare with ratings.shape).
red_ratings_data.shape

(19706281, 4)

## Persist filtered ratings data

In [35]:
# Write the filtered ratings next to the raw data files; the DataFrame index
# is not written to the CSV.
cwd = os.getcwd()
filtered_path = os.path.join(cwd, "..", "data", "ratings_filtered.csv")
red_ratings_data.to_csv(filtered_path, index=False)