# Importing required libraries

In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Importing dataset

In [26]:
ratings = pd.read_csv('ratings.csv')
movie_details= pd.read_csv('movies.csv')

In [27]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [28]:
movie_details.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Checking for null entries and dropping those entries, if any

In [29]:
movie_details.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [30]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [31]:
# there is no null entries in both the tables

# Shape of datasets

In [33]:
movie_details.shape

(9742, 3)

In [22]:
ratings.shape

(100836, 4)

# Looking at the mathematical view of ratings dataset 

In [23]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [24]:
# Since we have 2 datasets here so here we need to merge it so that our working on it will be easy

# Merging both the datasets to form a single dataset

In [35]:
movies = pd.merge(ratings , movie_details, on= 'movieId')
movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


# Changing timestamp to proper datetime

# import datetime

In [87]:
from datetime import datetime, date

In [88]:
movies['datetime']= pd.to_datetime(movies['timestamp'])

In [89]:
movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,datetime
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:00.964982703
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:00.847434962
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:01.106635946
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:01.510577970
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:01.305696483


# Shape of our new converted dataset 

In [90]:
movies.shape

(100836, 7)

# Finding number of unique entries in each column

In [91]:
movies.nunique()

userId         610
movieId       9724
rating          10
timestamp    85043
title         9719
genres         951
datetime     85043
dtype: int64

# Mathematical overview of new dataset

In [92]:
movies.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


# Finding the average rating of movies

In [93]:
movie_avg = movies.groupby('title')['rating'].mean()
movie_avg

title
'71 (2014)                                   4.000000
'Hellboy': The Seeds of Creation (2004)      4.000000
'Round Midnight (1986)                       3.500000
'Salem's Lot (2004)                          5.000000
'Til There Was You (1997)                    4.000000
                                               ...   
eXistenZ (1999)                              3.863636
xXx (2002)                                   2.770833
xXx: State of the Union (2005)               2.000000
¡Three Amigos! (1986)                        3.134615
À nous la liberté (Freedom for Us) (1931)    1.000000
Name: rating, Length: 9719, dtype: float64

# Merging this average rating to the dataset

In [94]:
movie = pd.merge(movieavg , movies, on= 'movieId')
movie.head()

Unnamed: 0,movieId,avgrating,userId,rating,timestamp,title,genres,datetime
0,1,3.92093,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:00.964982703
1,1,3.92093,5,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:00.847434962
2,1,3.92093,7,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:01.106635946
3,1,3.92093,15,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:01.510577970
4,1,3.92093,17,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1970-01-01 00:00:01.305696483


#   Finding total number of ratings for a movie

In [95]:
number_ratings = movies.groupby('title')['rating'].count()
number_ratings.head()

title
'71 (2014)                                 1
'Hellboy': The Seeds of Creation (2004)    1
'Round Midnight (1986)                     2
'Salem's Lot (2004)                        1
'Til There Was You (1997)                  2
Name: rating, dtype: int64

# Making a new dataframe

In [96]:
new_record = pd.DataFrame()
new_record['Total Umber Of Ratings']= number_ratings
new_record['Average rating']= movie_avg

In [97]:
new_record.head()

Unnamed: 0_level_0,Total Umber Of Ratings,Average rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),1,4.0
'Hellboy': The Seeds of Creation (2004),1,4.0
'Round Midnight (1986),2,3.5
'Salem's Lot (2004),1,5.0
'Til There Was You (1997),2,4.0


# Making recomendation system

In [98]:
# sorting values according to the number of rating column

In [99]:
movie_matrix = movies.pivot_table(index = 'userId', columns = 'title', values = 'rating')
movie_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [100]:
movie_matrix.columns

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)

In [101]:
# Rating for any particular movie

In [102]:
movie_matrix['Jumanji (1995)']

userId
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
      ... 
606    NaN
607    NaN
608    2.0
609    NaN
610    NaN
Name: Jumanji (1995), Length: 610, dtype: float64

In [103]:
Jumanji_1995 = movie_matrix['Jumanji (1995)']
Jumanji_1995.tail()

userId
606    NaN
607    NaN
608    2.0
609    NaN
610    NaN
Name: Jumanji (1995), dtype: float64

In [104]:
 #Finding similarity with other movies

In [105]:
movies_similar_to_jumanji_1995 = movie_matrix.corrwith(Jumanji_1995)
movies_similar_to_jumanji_1995

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


title
'71 (2014)                                        NaN
'Hellboy': The Seeds of Creation (2004)           NaN
'Round Midnight (1986)                            NaN
'Salem's Lot (2004)                               NaN
'Til There Was You (1997)                         NaN
                                               ...   
eXistenZ (1999)                              0.209165
xXx (2002)                                   0.122762
xXx: State of the Union (2005)               0.000000
¡Three Amigos! (1986)                        0.667614
À nous la liberté (Freedom for Us) (1931)         NaN
Length: 9719, dtype: float64

In [106]:
# The more is the correlation the more is movie similar to the other one

In [107]:
# using dataframe to see correlation

In [108]:
jumanji_1995_corr = pd.DataFrame(movies_similar_to_jumanji_1995, columns=['Correlation'])
jumanji_1995_corr.dropna(inplace=True)
jumanji_1995_corr.head()

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
"'burbs, The (1989)",0.120173
(500) Days of Summer (2009),0.397966
*batteries not included (1987),0.719636
10 Cent Pistol (2015),-1.0
10 Cloverfield Lane (2016),1.0


In [109]:
jumanji_1995_corr.descending = jumanji_1995_corr.sort_values(['Correlation'], ascending = [False])
jumanji_1995_corr.descending.head()

  jumanji_1995_corr.descending = jumanji_1995_corr.sort_values(['Correlation'], ascending = [False])


Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
Paper Towns (2015),1.0
"Ice Storm, The (1997)",1.0
Stealing Beauty (1996),1.0
"Man Who Would Be King, The (1975)",1.0
Outlander (2008),1.0


These are 5 most similar movies to Jumanji(1995)

# Predicting some more movies

In [114]:
sympathy_1971=movie_matrix['Sympathy for the Underdog (1971)']
sympathy_1971.tail()

userId
606    NaN
607    NaN
608    NaN
609    NaN
610    4.5
Name: Sympathy for the Underdog (1971), dtype: float64

In [115]:
similar_to_sympathy=movie_matrix.corrwith(sympathy_1971)
similar_to_sympathy

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


title
'71 (2014)                                  NaN
'Hellboy': The Seeds of Creation (2004)     NaN
'Round Midnight (1986)                      NaN
'Salem's Lot (2004)                         NaN
'Til There Was You (1997)                   NaN
                                             ..
eXistenZ (1999)                             NaN
xXx (2002)                                  NaN
xXx: State of the Union (2005)              NaN
¡Three Amigos! (1986)                       NaN
À nous la liberté (Freedom for Us) (1931)   NaN
Length: 9719, dtype: float64

In [118]:
sympathy_1971_corr=pd.DataFrame(similar_to_sympathy, columns=['Correlation'])
sympathy_1971_corr

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
'71 (2014),
'Hellboy': The Seeds of Creation (2004),
'Round Midnight (1986),
'Salem's Lot (2004),
'Til There Was You (1997),
...,...
eXistenZ (1999),
xXx (2002),
xXx: State of the Union (2005),
¡Three Amigos! (1986),
