# Content-Based Recommender Systems

## Imports

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading Movie Data

In [129]:
# Load the movie catalogue from the CSV that sits next to the notebook.
movies_df = pd.read_csv(filepath_or_buffer="movies.csv")

# Report (rows, columns) and then preview the first five rows.
print(movies_df.shape)
movies_df.head(n=5)

(34208, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Preprocessing the Movie Data

In [130]:
# Pull the four-digit release year out of titles such as "Toy Story (1995)".
# The year is kept as a string; a title without a "(YYYY)" part yields NaN.
movies_df["year"] = movies_df.title.str.extract(r"\((\d\d\d\d)\)", expand=False)
# Strip the " (YYYY)" part so the title column holds only the movie name.
movies_df["title"] = movies_df.title.str.replace(r"( \(\d\d\d\d\))", "", regex=True)

# Treat a missing genre string as "no genres", then keep the parsed list
# of genre names in the `genres` column.
genres_raw = movies_df["genres"].fillna("")
movies_df["genres"] = genres_raw.str.split("|")

# One-hot encode the pipe-separated genres in a single vectorized pass
# instead of assigning cell by cell inside an iterrows() loop.
genre_flags = genres_raw.str.get_dummies(sep="|")

# str.get_dummies sorts its columns alphabetically; restore the order in which
# the genres first appear in the data so the frame keeps its previous layout.
# Empty names (from a missing genre string) are skipped, so such a movie simply
# gets 0 for every genre rather than a column with an empty name.
genre_order = [genre for genre in movies_df["genres"].explode().unique() if genre]
genre_flags = genre_flags[genre_order].astype(float)  # 1.0 / 0.0 flags

movies_df = pd.concat([movies_df, genre_flags], axis=1)

# Any value that is still missing (e.g. the year of a title without "(YYYY)")
# becomes 0, as before.
movies_df = movies_df.fillna(0)

print(movies_df.shape)
movies_df.head()

(34208, 24)


Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Reading Rating Data

In [131]:
# Load the user ratings from the CSV that sits next to the notebook.
ratings_df = pd.read_csv(filepath_or_buffer="ratings.csv")

# Report (rows, columns) and then preview the first five rows.
print(ratings_df.shape)
ratings_df.head(n=5)

(22884377, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


## Preprocessing the Ratings Data

In [132]:
# Remove the "timestamp" column from the ratings frame.
# errors="ignore" keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
ratings_df = ratings_df.drop(columns="timestamp", errors="ignore")
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


## Movies Watched by the User

In [140]:
# Movies the user has already watched, each paired with the rating they gave.
userInput = [
    dict(title="Breakfast Club, The", rating=5),
    dict(title="Toy Story", rating=3.5),
    dict(title="Jumanji", rating=2),
    dict(title="Pulp Fiction", rating=5),
    dict(title="Akira", rating=4.5),
]

# One row per watched movie, with the columns "title" and "rating".
inputMovies = pd.DataFrame(userInput, columns=["title", "rating"])
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


## Genre Features of the Watched Movies

In [141]:
# Rebuild the ratings frame from `userInput` instead of reading `inputMovies`.
# This cell overwrites `inputMovies`, so reading it here would make a second
# run fail once its "title" column has been dropped.
rated_titles = pd.DataFrame(userInput)

# Catalogue rows (ids, year and genre flags) of the movies the user rated.
inputIds = movies_df[movies_df["title"].isin(rated_titles["title"])]

# Attach the user's rating to each catalogue row. The join key is spelled out
# rather than left to the implicit "all shared columns" default.
# NOTE(review): nothing here guarantees that titles are unique once the year
# has been stripped; a title shared by several movies would yield several rows.
inputMovies = pd.merge(inputIds, rated_titles, on="title")

# Keep only the genre flags and the rating.
inputMovies = inputMovies.drop(columns=["title", "movieId", "genres", "year"])

inputMovies

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),rating
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
