In [133]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [134]:
# Load the two input tables from CSV files in the working directory.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Preview the first rows of the movie table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [135]:
# Check the dimensions of the movie table, reported as (rows, columns).
movies.shape

(34208, 3)

In [136]:
# Derive a `year` column (as a string) from the "(YYYY)" release year in the title.
# The pattern is anchored on the parentheses so that digits belonging to the title
# itself (e.g. "2001: A Space Odyssey (1968)") are not mistaken for the year, and it
# is a raw string so "\d" is not parsed as an (invalid) Python escape sequence.
# Titles that contain no "(YYYY)" get NaN.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [137]:
# Remove the "(YYYY)" release year from the title text.
# `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave every title unchanged.
# The raw string keeps "\(" and "\d" from being parsed as Python escape sequences.
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True)
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [138]:
# Trim leading/trailing whitespace from the titles.
# The vectorised `.str.strip()` passes missing titles through as NaN, whereas calling
# `x.strip()` via `apply` raises AttributeError when a title is NaN (a float).
movies['title'] = movies['title'].str.strip()

In [139]:
# Preview the first rows of `movies` after the title clean-up.
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [140]:
# Turn the pipe-delimited genre string of each movie into a Python list of genre names.
genre_lists = movies['genres'].str.split('|')
movies['genres'] = genre_lists
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [141]:
# One-hot encode the genres: one float column per genre, 1.0 = the movie has that
# genre, 0.0 = it does not.

# Genre names in order of first appearance, so the resulting column order is the same
# as filling the table movie by movie. Missing genre entries are skipped.
genre_order = list(dict.fromkeys(
    genre
    for genre_list in movies['genres'].dropna()
    for genre in genre_list
))

# Vectorised encoding instead of a Python-level `iterrows()` loop that grows the frame
# one cell at a time. `str.get_dummies` already yields 0 for absent genres, so no
# frame-wide `fillna(0)` is needed -- that call would also overwrite missing values in
# non-genre columns (e.g. a missing `year` would silently become 0).
genre_flags = (
    movies['genres']
    .str.join('|')
    .str.get_dummies(sep='|')
    .reindex(columns=genre_order)
    .astype(float)
)

# `join` aligns on the row index and returns a new frame; `movies` is left untouched.
movies_with_genres = movies.join(genre_flags)

movies_with_genres.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [143]:
# Drop the `timestamp` column from the ratings table.
# `drop(..., inplace=True)` returns None, so assigning its result to another name only
# ever stores None; rebinding `ratings` to the returned frame keeps the result.
# errors='ignore' lets this cell be re-run after the column is already gone instead of
# raising KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [145]:
# Create the user profile input: the movies this user has seen and the rating given
# to each. Titles are written without the release year.
input_movies = pd.DataFrame({
    'title': [
        'Breakfast Club, The',
        'Toy Story',
        'Jumanji',
        'Pulp Fiction',
        'Akira',
    ],
    'rating': [5, 3.5, 2, 5, 4.5],
})
input_movies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [146]:
# Look up the movieId of every title the user rated.
# Rows of `movies` whose title is one of the user's titles:
wanted_titles = input_movies['title'].tolist()
input_ID = movies.loc[movies['title'].isin(wanted_titles)]

# Merge on the columns the two frames share, then discard the columns that are not
# needed for the profile. Titles with no match in `movies` drop out of the result.
input_movies = pd.merge(input_ID, input_movies).drop(columns=['genres', 'year'])
input_movies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [147]:
# Select the one-hot genre rows of the movies the user rated.
rated_ids = input_movies['movieId'].tolist()
rated_mask = movies_with_genres['movieId'].isin(rated_ids)
user_movies = movies_with_genres[rated_mask]
user_movies.head(2)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
# Keep only the genre indicator columns, with the rows renumbered from 0.
non_genre_columns = ['movieId', 'title', 'genres', 'year']
user_movies = user_movies.reset_index(drop=True).drop(columns=non_genre_columns)

user_movies

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:

# Transposed view of the user's genre matrix: genres as rows, one column per rated movie.
user_movies.T

Unnamed: 0,0,1,2,3,4
Adventure,1.0,1.0,0.0,1.0,0.0
Animation,1.0,0.0,0.0,1.0,0.0
Children,1.0,1.0,0.0,0.0,0.0
Comedy,1.0,0.0,1.0,0.0,1.0
Fantasy,1.0,1.0,0.0,0.0,0.0
Romance,0.0,0.0,0.0,0.0,0.0
Drama,0.0,0.0,1.0,0.0,1.0
Action,0.0,0.0,0.0,1.0,0.0
Crime,0.0,0.0,1.0,0.0,0.0
Thriller,0.0,0.0,1.0,0.0,0.0


In [103]:
# Shape of the user's genre matrix, reported as (rated movies, genre columns).
user_movies.shape

(5, 20)

In [104]:
# Length of the rating vector; it must equal the number of rows of `user_movies`
# for the dot product that builds the user profile.
input_movies['rating'].shape

(5,)

In [105]:
# The user's ratings, indexed by row position in `input_movies`.
input_movies['rating']

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
Name: rating, dtype: float64

In [149]:
# Build the user's genre preference vector: for each genre, the sum of the ratings of
# the rated movies that carry that genre. The dot product pairs row i of `user_movies`
# with entry i of the rating Series, so both must describe the same movie.
user_ratings = input_movies['rating']
UserProfile = user_movies.T.dot(user_ratings)

In [150]:
# Display the weight computed for each genre.
UserProfile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [152]:
# Build the genre table: one row per movie, indexed by movieId, holding only the
# genre indicator columns. `set_index('movieId')` moves the column into the index,
# so only the remaining non-genre columns have to be dropped.
genre_table = (
    movies_with_genres
    .set_index('movieId')
    .drop(columns=['title', 'genres', 'year'])
)
genre_table

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
151703,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151709,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
# Shape of the genre table, reported as (movies, genre columns).
genre_table.shape

(34208, 20)

In [110]:
# Length of the user profile vector (one entry per genre).
UserProfile.shape

(20,)

In [153]:
# Score every movie against the user profile: weight each genre indicator by the
# user's weight for that genre, add the weights up per movie, and normalise by the
# total profile weight.
weighted_genres = genre_table.mul(UserProfile, axis='columns')
profile_total = UserProfile.sum()
recommendation = weighted_genres.sum(axis='columns').div(profile_total)
recommendation

movieId
1         0.594406
2         0.293706
3         0.188811
4         0.328671
5         0.188811
            ...   
151697    0.069930
151701    0.000000
151703    0.139860
151709    0.202797
151711    0.000000
Length: 34208, dtype: float64

In [154]:
# Top 10 movies: order the scores from highest to lowest and keep the first ten.
ranked_scores = recommendation.sort_values(ascending=False)
recommendation_ = ranked_scores.iloc[:10]
recommendation_

movieId
5018      0.748252
26093     0.734266
27344     0.720280
148775    0.685315
6902      0.678322
117646    0.678322
64645     0.671329
81132     0.671329
122787    0.671329
2987      0.664336
dtype: float64

In [156]:
# Turn the top-10 score Series into a two-column DataFrame: the Series index supplies
# `movieId` and the Series values supply `weight`, in the same (ranked) order.
recommendation_2 = pd.DataFrame({
    'movieId': recommendation_.index,
    'weight': recommendation_.values,
})

In [158]:
# Display the movieId/weight table of the top recommendations.
recommendation_2

Unnamed: 0,movieId,weight
0,5018,0.748252
1,26093,0.734266
2,27344,0.72028
3,148775,0.685315
4,6902,0.678322
5,117646,0.678322
6,64645,0.671329
7,81132,0.671329
8,122787,0.671329
9,2987,0.664336


In [159]:
# Attach title, genres and year to the recommended movies by merging on movieId.
# The left side is `movies`, not `recommendation_df` itself: no earlier cell in this
# notebook defines `recommendation_df`, so a self-merge fails with NameError on a fresh
# kernel and, when re-run, would pile up duplicate weight columns. The default inner
# merge keeps only the movies present in `recommendation_2`.
recommendation_df = pd.merge(movies, recommendation_2, on='movieId')

In [160]:
# Show the recommendations ordered from highest to lowest weight, renumbered from 0.
ranked_recommendations = recommendation_df.sort_values(by=['weight'], ascending=False)
ranked_recommendations.reset_index(drop=True)

Unnamed: 0,movieId,title,genres,year,weight
0,5018,Motorama,"[Adventure, Comedy, Crime, Drama, Fantasy, Mys...",1991,0.748252
1,26093,"Wonderful World of the Brothers Grimm, The","[Adventure, Animation, Children, Comedy, Drama...",1962,0.734266
2,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999,0.72028
3,148775,Wizards of Waverly Place: The Movie,"[Adventure, Children, Comedy, Drama, Fantasy, ...",2009,0.685315
4,6902,Interstate 60,"[Adventure, Comedy, Drama, Fantasy, Mystery, S...",2002,0.678322
5,117646,Dragonheart 2: A New Beginning,"[Action, Adventure, Comedy, Drama, Fantasy, Th...",2000,0.678322
6,64645,The Wrecking Crew,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1968,0.671329
7,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010,0.671329
8,122787,The 39 Steps,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1959,0.671329
9,2987,Who Framed Roger Rabbit?,"[Adventure, Animation, Children, Comedy, Crime...",1988,0.664336
