# Movie Recommendation System

In [1]:
## Importing Libraries
import os
import pandas as pd
import numpy as np
import glob
from termcolor import colored
import itertools
import warnings
warnings.filterwarnings('ignore')

In [2]:
## Setting working directory
## The data folder can be overridden through the MOVIES_DATA_DIR environment
## variable; the original location is kept as the default value.
path = os.environ.get('MOVIES_DATA_DIR', 'E:/movies_large')
os.chdir(path)

In [3]:
## Collecting the CSV files found in the data folder.
## glob returns its matches in arbitrary order, so the list is sorted to keep
## the position of every file stable from one run (and one machine) to the next.
filepaths = sorted(glob.glob(f'{path}/*.csv'))
print(filepaths)

['E:/movies_large\\movies.csv', 'E:/movies_large\\ratings.csv', 'E:/movies_large\\users.csv']


In [4]:
## Making dataframes from the individual files.
## Every file is picked by its file name instead of its position in `filepaths`,
## so the right table is loaded whatever order the files were listed in.
csv_by_name = {os.path.basename(csv_path): csv_path for csv_path in filepaths}
movies = pd.read_csv(csv_by_name['movies.csv'], sep=';')
ratings = pd.read_csv(csv_by_name['ratings.csv'], sep=';')
users = pd.read_csv(csv_by_name['users.csv'], sep=';')

In [5]:
## Preview of the first five rows of each raw table
tuple(table.head(5) for table in (movies, ratings, users))

(   MovieId                     Title  Year
 0        1  The Shawshank Redemption  1994
 1        2           The Dark Knight  2008
 2        3           American Beauty  1999
 3        4  The Light Between Oceans  2016
 4        5            Apocalypse Now  1979,
    UserId  MovieId  Rating
 0       1        1     5.0
 1       1        2     4.0
 2       1        3     2.0
 3       1        4     1.5
 4       1        5     4.0,
    UserId    Name
 0       1    Andy
 1       2  Isabel
 2       3    John
 3       4  Angela
 4       5    Will)

In [6]:
## Merging all the dataframes to the final dataset:
## movies -> ratings (on MovieId) -> users (on UserId).
## Both steps are left joins, so every row of the left-hand table is kept.
dfs = (
    movies
    .merge(ratings, on='MovieId', how='left')
    .merge(users, on='UserId', how='left')
)

In [7]:
## Plain-text dump of the merged dataset (movie, rating and reviewer columns side by side)
print(dfs)

     MovieId                     Title  Year  UserId  Rating    Name
0          1  The Shawshank Redemption  1994       1     5.0    Andy
1          1  The Shawshank Redemption  1994       3     4.5    John
2          1  The Shawshank Redemption  1994       7     5.0   Billy
3          2           The Dark Knight  2008       1     4.0    Andy
4          2           The Dark Knight  2008       4     5.0  Angela
..       ...                       ...   ...     ...     ...     ...
115       29        The Usual Suspects  1995       5     2.0    Will
116       30                    Avatar  2009       2     4.0  Isabel
117       30                    Avatar  2009       3     3.0    John
118       30                    Avatar  2009       5     4.0    Will
119       30                    Avatar  2009       8     2.0  Rachel

[120 rows x 6 columns]


In [8]:
## Data Analysis using pandas profiling
## Builds a profiling report for the merged dataset `dfs` and exports it to
## 'output.html'. The file name is relative, so it is resolved against the
## current working directory.
## NOTE(review): this import sits in the middle of the notebook instead of the
## import cell at the top — consider moving it there.

from pandas_profiling import ProfileReport
prof = ProfileReport(dfs)
prof.to_file(output_file='output.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=20.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




# Findings after data analysis

### - 30 Movie titles 
### - Movies from years 1980-2017
### - 8 movie reviewers

In [9]:
## Five titles with the highest mean rating, shown to two decimals
print(colored('Movies with highest average reviews', attrs=['bold']))
(
    dfs.groupby('Title')[['Rating']]
    .mean()
    .sort_values(by='Rating', ascending=False)
    .head(5)
    .round(2)
)

[1mMovies with highest average reviews[0m


Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
The Shawshank Redemption,4.83
The Dark Knight,4.67
Good Will Hunting,4.5
Batman Begins,4.5
A Beautiful Mind,4.12


In [10]:
## Five titles whose ratings vary the most between reviewers.
## The ranking is by the variance of the ratings (largest first). A large
## variance means the reviewers disagree about a title, not that it is well
## liked, so the heading describes it as divisive rather than popular.
print(colored('Movies with the most divisive ratings (highest rating variance)', attrs=['bold']))
(
    dfs.groupby('Title')[['Rating']]
    .var()
    .sort_values(by='Rating', ascending=False)
    .head(5)
    .round(2)
)

[1mMost popular movies among all the reviewers[0m


Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Terminator 2: Judgment Day,3.25
Shutter Island,3.0
The Notebook,2.79
The Usual Suspects,2.58
American Beauty,2.44


### Finding the reviewers with similar reviews using the Pearson correlation coefficient

In [11]:
## Heading followed by every distinct reviewer name of the merged dataset
print(colored('The users are ', attrs=['bold']), dfs['Name'].unique(), sep='\n')

[1mThe users are [0m
['Andy' 'John' 'Billy' 'Angela' 'Rachel' 'Isabel' 'Will' 'Alicia']


In [12]:
## Enter the user for whom movie recommendations have to be generated.
## The prompt is repeated until a known reviewer name is typed, so the cells
## below never run with a name that is absent from the dataset.
prompt = 'Enter the name of the user from the above list  '
known_users = list(dfs['Name'].dropna().unique())

token = input(prompt).strip()
while token not in known_users:
    print('Invalid entry, retry')
    token = input(prompt).strip()
print('The user whose movie recommendations is to be made is ', token)

## All pairs of reviewers that contain the chosen user.
## itertools.combinations over unique names already yields every pair exactly
## once and in a fixed order, so no de-duplication through a set (which would
## scramble that order) is needed.
name_combinations = [
    pair for pair in itertools.combinations(known_users, 2) if token in pair
]

Enter the name of the user from the above list  Rachel
The user whose movie recommendations is to be made is  Rachel


In [13]:
def movie_finder(df, token, data=None):
    ''' Function to print and return the most recommended movies for the given user

    Every rating given by another reviewer is multiplied by that reviewer's
    score towards `token`; the titles are then ranked by their best weighted
    rating and the first five are reported.

    Parameters:
    df- either the correlation or euclidean dataframe calculated for the user,
        with the columns 'Name1', 'Name2' and 'Correlation' or 'Similarity'.
        It is left unmodified.
    token- the user for whom recommendation is made
    data- optional ratings dataframe with the columns 'Name', 'Rating' and
        'Title'; the notebook-level `dfs` is used when it is omitted

    Returns:
    list of at most five movie titles, the most recommended one first
    '''
    if data is None:
        data = dfs

    ## To find the similarity measuring parameter column
    score = 'Correlation' if 'Correlation' in df.columns else 'Similarity'

    ## One row per reviewer other than the inputted user, carrying the score of
    ## the pair. melt builds a new frame, so the caller's dataframe is not altered.
    others = df.melt(id_vars=[score], value_vars=['Name1', 'Name2'], value_name='Name')
    others = others.loc[others['Name'] != token, ['Name', score]]

    ## calculating the weighted score by multiplying the rating with the score
    weighted_avg = pd.merge(others, data[['Name', 'Rating', 'Title']], on='Name')
    weighted_avg['Weighted_Rating'] = weighted_avg['Rating'] * weighted_avg[score]

    ## omitting negative scores: the rows are dropped. Overwriting the whole row
    ## with 0 would also replace the title by 0 and let it reach the result.
    weighted_avg = weighted_avg[weighted_avg['Weighted_Rating'] >= 0]

    ## stable sort, so titles with equal weighted rating keep a reproducible order
    weighted_avg = weighted_avg.sort_values('Weighted_Rating', ascending=False, kind='mergesort')

    ## unique() keeps the first occurrence, i.e. the best weighted rating of each title
    top_movies = list(weighted_avg['Title'].unique()[0:5])  ## best 5 movies of the list

    print(colored(f'The most recommended movie for the user {token} are:', attrs=['bold']))
    print(top_movies)  ## printed as a list so that the ranking order stays visible
    return top_movies

In [14]:
def correl(user1, user2, data=None):
    ''' Function to measure the Pearson correlation of the ratings of two reviewers

    Only the movies rated by both reviewers are compared.

    Parameters:
    user1, user2- names of the users whose movie ratings are to be correlated
    data- optional ratings dataframe with the columns 'Name', 'Title' and
        'Rating'; the notebook-level `dfs` is used when it is omitted

    Returns:
    the correlation as a float; NaN when it is undefined (fewer than two shared
    movies, or one reviewer gave the same rating to every shared movie);
    None when user1 and user2 are the same name
    '''
    if user1 == user2:  ## names for user1 and user2 must not be the same
        return None
    if data is None:
        data = dfs

    ## making movies and ratings dataframes for the two users
    col1, col2 = f'{user1}_Rating', f'{user2}_Rating'
    df1 = data.loc[data['Name'] == user1, ['Title', 'Rating']].rename(columns={'Rating': col1})
    df2 = data.loc[data['Name'] == user2, ['Title', 'Rating']].rename(columns={'Rating': col2})

    ## merging the review dataframes based on the same movie title
    dff = pd.merge(df1, df2, on='Title', how='inner').dropna(subset=[col1, col2])

    ## Pearson's r needs at least two points and some spread in both columns.
    ## Taking the minimum of the correlation matrix column instead would fall
    ## back on the diagonal value 1.0 and report a perfect match here.
    if len(dff) < 2 or dff[col1].nunique() < 2 or dff[col2].nunique() < 2:
        return float('nan')

    return float(dff[col1].corr(dff[col2]))

In [15]:
## Pearson correlation between the chosen user and every other reviewer,
## followed by the movie recommendation derived from it
print(colored('Most similar reviewers', attrs=['bold']))

## one [reviewer, reviewer, correlation] record per pair of reviewers
corr_list = [[first, second, correl(first, second)] for first, second in name_combinations]

correlations = (
    pd.DataFrame(corr_list, columns=['Name1','Name2', 'Correlation'])  ## records to dataframe
    .dropna(axis=0)                                  ## rows without a correlation value are discarded
    .sort_values(by='Correlation', ascending= False) ## most similar reviewer first
    .round(2)                                        ## two decimals
    .reset_index(drop=True)
)

print('\n')
print(correlations)
print('\n')
movie_finder(correlations, token)
print('\n')

[1mMost similar reviewers[0m


    Name1   Name2  Correlation
0    Andy  Rachel         0.58
1  Angela  Rachel         0.56
2  Rachel    Will         0.35
3  Rachel  Alicia        -0.07
4  Rachel  Isabel        -0.26
5    John  Rachel        -0.33
6   Billy  Rachel        -0.41


[1mThe most recommended movie for the user Rachel are:[0m
{'The Dark Knight', 'Platoon', 'The Shawshank Redemption', 'The Usual Suspects', 'Inglourious Basterds'}




### Finding the reviewers with similar reviews using Euclidean distance

In [16]:
def eucledian(user1, user2, data=None):
    ''' Function to measure the euclidean-distance based similarity of two reviewers

    For every movie rated by both reviewers the score 1/(1 + d**2) is computed,
    d being the difference of the two ratings; the mean of these scores is returned.

    Parameters:
    user1, user2- names of the users whose similarity of their movie ratings is to be calculated
    data- optional ratings dataframe with the columns 'Name', 'Title' and
        'Rating'; the notebook-level `dfs` is used when it is omitted

    Returns:
    the mean similarity score (NaN when the two reviewers share no movie);
    None when user1 and user2 are the same name
    '''
    if user1 == user2:  ## names for user1 and user2 must not be the same
        return None
    if data is None:
        data = dfs

    ## making movies and ratings dataframes for the two users
    col1, col2 = f'{user1}_Rating', f'{user2}_Rating'
    df1 = data.loc[data['Name'] == user1, ['Title', 'Rating']].rename(columns={'Rating': col1})
    df2 = data.loc[data['Name'] == user2, ['Title', 'Rating']].rename(columns={'Rating': col2})

    ## merging the review dataframes based on the same movie title
    dff = pd.merge(df1, df2, on='Title', how='inner')

    ## similarity score for each movie, kept as its own Series so that no
    ## column is assigned into a slice of another dataframe
    similarity = 1 / (1 + (dff[col1] - dff[col2]) ** 2)
    return similarity.mean()  ## returns mean of the similarity score

In [18]:
## Euclidean similarity between the chosen user and every other reviewer,
## followed by the movie recommendation derived from it
print(colored('Most similar reviewers', attrs=['bold']))

## one [reviewer, reviewer, similarity] record per pair of reviewers
eu_list = [[*pair, eucledian(*pair)] for pair in name_combinations]

## records to dataframe; rows without a similarity value are discarded
eu = pd.DataFrame(eu_list, columns=['Name1','Name2', 'Similarity']).dropna(axis=0)
## most similar reviewer first, scores rounded to two decimals, fresh 0..n-1 index
eu = eu.sort_values(by='Similarity', ascending= False).round(2).reset_index(drop=True)

print('\n')
print(eu)
print('\n')
movie_finder(eu, token)
print('\n')

[1mMost similar reviewers[0m


    Name1   Name2  Similarity
0   Billy  Rachel        0.52
1    Andy  Rachel        0.47
2  Rachel  Alicia        0.43
3  Angela  Rachel        0.36
4  Rachel    Will        0.34
5    John  Rachel        0.31
6  Rachel  Isabel        0.27


[1mThe most recommended movie for the user Rachel are:[0m
{'The Departed', 'Platoon', 'The Shawshank Redemption', 'The Usual Suspects', 'Inglourious Basterds'}


