In [1]:
# Import necessary libraries
import pandas as pd

# 1. Introduction

## 1.1 Business Problem

The task is to recommend movies to a user with a given ID using an item-based recommendation method. Five recommendations should be provided.

## 1.2 Dataset Story

The dataset is provided by MovieLens, a movie recommendation service. It contains movies together with the ratings
users assigned to them. In total, the dataset comprises 20,000,263 ratings across 27,278 movies. The dataset was
curated on October 17, 2016, and captures data from 138,493 users over the period from January 9, 1995, to
March 31, 2015. The users were selected at random, and each selected user has rated at least 20 movies.

## 1.3 Features

`movie.csv`

- `movieId` - Unique movie identifier
- `title` - Movie title
- `genres` - Genre

`rating.csv`

- `userId` - Unique user identifier
- `movieId` - Unique movie identifier
- `rating` - User-assigned rating for the film
- `timestamp` - Date and time of the rating

# 2. Data Handling

## 2.1 Loading Data

In [2]:
# Read the movie catalogue from the working directory
movies = pd.read_csv(filepath_or_buffer='movie.csv')
# Read the user ratings, parsing the rating timestamps into datetimes while loading
ratings = pd.read_csv(filepath_or_buffer='rating.csv', parse_dates=['timestamp'])

In [3]:
# Preview the first five rows of the movie catalogue
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first five rows of the ratings data
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


## 2.2 Inspecting Data

In [5]:
# Summary statistics for the movie catalogue (only movieId is numeric, so it is
# the only column summarised; as an identifier its mean/std carry no meaning)
movies.describe()

Unnamed: 0,movieId
count,27278.0
mean,59855.48057
std,44429.314697
min,1.0
25%,6931.25
50%,68068.0
75%,100293.25
max,131262.0


In [6]:
# Summary statistics for the ratings data; the `rating` column is the one of
# interest here (userId and movieId are identifiers)
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


In [7]:
# Column dtypes, non-null counts and memory usage of the movie catalogue
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [8]:
# Column dtypes and memory usage of the ratings data
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int64         
 1   movieId    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 610.4 MB


In [9]:
# Flag, per column, whether the movie catalogue contains any missing value
movies.isna().any()

movieId    False
title      False
genres     False
dtype: bool

In [10]:
# Flag, per column, whether the ratings data contains any missing value
ratings.isna().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [11]:
# Count distinct values per column of the movie catalogue.
# NOTE: a title count lower than the movieId count means some titles are shared
# by more than one movieId.
movies.nunique()

movieId    27278
title      27262
genres      1342
dtype: int64

In [12]:
# Count distinct values per column of the ratings data (distinct users, rated
# movies, rating levels and timestamps)
ratings.nunique()

userId         138493
movieId         26744
rating             10
timestamp    15351121
dtype: int64

## 2.3 Data Cleaning and Preprocessing

In [13]:
# Attach title and genres to every rating; the left join keeps all rows of `ratings`
data = pd.merge(ratings, movies, how='left', on='movieId')

# Preview the merged frame
data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [14]:
# Number of ratings each movie received, indexed by movieId.
# The counts column is labelled explicitly: Series.value_counts() names its
# result after the source column on pandas < 2.0 but 'count' on pandas >= 2.0,
# so relying on the implicit label raises KeyError on newer versions.
vote_counts = data['movieId'].value_counts().rename('movieId').to_frame()

# IDs of the movies that were rated at least 1000 times
popular_movie_ids = vote_counts[vote_counts['movieId'] >= 1000].index

# Keep only the ratings that belong to those popular movies
popular_movies = data[data['movieId'].isin(popular_movie_ids)]

# Preview the filtered ratings
popular_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [15]:
# Build the user x movie rating matrix: one row per userId, one column per title.
# Combinations without a rating are NaN; several ratings falling into the same
# (userId, title) cell are averaged by pivot_table's default aggregation.
user_movie_table = pd.pivot_table(
    popular_movies,
    index='userId',
    columns='title',
    values='rating',
)

# Preview the first rows of the matrix
user_movie_table.head(n=5)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# User to generate recommendations for (a fixed ID, so reruns give the same result)
random_user = 32344

# All 5-star ratings given by that user
five_star_ratings = data[(data['userId'] == random_user) & (data['rating'] == 5)]

# Fail with an explicit message rather than an opaque IndexError when the user
# has never given a 5-star rating
if five_star_ratings.empty:
    raise ValueError(f'User {random_user} has no 5-star ratings to base recommendations on.')

# movieId of the most recently rated 5-star movie
user_top_pick = five_star_ratings.sort_values(by='timestamp', ascending=False)['movieId'].iloc[0]

In [17]:
# Look up the title that belongs to the user's top-pick movieId
top_pick_title = movies.loc[movies['movieId'] == user_top_pick, 'title'].values[0]

# Take that title's column of ratings (one value per user) from the user-movie matrix
filtered_table = user_movie_table[top_pick_title]

# 3. Data Analysis

In [18]:
# Correlate every movie's rating column with the ratings of the user's top pick
similarities = user_movie_table.corrwith(filtered_table)

# Remove the top pick itself by label instead of assuming it sorts first: other
# titles can tie with it at correlation 1.0, in which case a positional [1:6]
# slice could recommend the movie itself and drop a real candidate.
# Then return the five most correlated movies as recommendations.
similarities.drop(labels=filtered_table.name, errors='ignore').sort_values(ascending=False).head(5)

title
Swan Princess, The (1994)           0.507791
Invisible Man, The (1933)           0.505499
Ip Man (2008)                       0.479452
General, The (1926)                 0.463442
7th Voyage of Sinbad, The (1958)    0.439900
dtype: float64