In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the movie metadata from movie.csv (path is relative to the notebook's
# working directory) and list its column names to see what is available.
movie = pd.read_csv("movie.csv")
movie.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [3]:
# Keep only the movie id and the title; the remaining column(s) are dropped.
movie = movie[["movieId", "title"]]
movie.head(10)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
5,6,Heat (1995)
6,7,Sabrina (1995)
7,8,Tom and Huck (1995)
8,9,Sudden Death (1995)
9,10,GoldenEye (1995)


In [4]:
# Load the user ratings from rating.csv (path is relative to the notebook's
# working directory) and list its column names to see what is available.
rating = pd.read_csv("rating.csv")
rating.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [5]:
# Keep only the user id, the movie id and the rating value; other columns
# are dropped.
rating = rating[["userId", "movieId", "rating"]]
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [6]:
# Attach the movie title to every rating row.
# The join key is spelled out so the merge cannot silently change meaning if
# either frame ever gains another column with a shared name; how="inner" (the
# pandas default) keeps only movieIds that appear in both frames.
data = pd.merge(movie, rating, on="movieId", how="inner")

In [7]:
# Preview the first five rows of the merged frame.
data.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),3,4.0
1,1,Toy Story (1995),6,5.0
2,1,Toy Story (1995),8,4.0
3,1,Toy Story (1995),10,4.0
4,1,Toy Story (1995),11,4.5


In [8]:
# Size of the merged frame as (number of rows, number of columns).
data.shape

(20000263, 4)

In [9]:
# Work on the first 1,000,000 rows only, to keep the pivot below manageable.
# NOTE(review): this is a positional slice, not a random sample; the merged
# frame appears to be grouped by movie, so presumably only the movies at the
# start of the frame are kept -- confirm this is intended.
data = data.iloc[:1_000_000]

In [10]:
# Reshape the long ratings table into a user-by-movie matrix: one row per
# userId, one column per title, and the rating as the cell value.
# pivot_table aggregates with the mean by default if a (user, title) pair
# occurs more than once; pairs without a rating show up as NaN.
pivot_table = pd.pivot_table(
    data,
    values="rating",
    index=["userId"],
    columns=["title"],
)
pivot_table.head(10)

title,Ace Ventura: When Nature Calls (1995),Across the Sea of Time (1995),"Amazing Panda Adventure, The (1995)","American President, The (1995)",Angela (1995),Angels and Insects (1995),Anne Frank Remembered (1995),Antonia's Line (Antonia) (1995),Assassins (1995),Babe (1995),...,Unforgettable (1996),Up Close and Personal (1996),"Usual Suspects, The (1995)",Vampire in Brooklyn (1995),Waiting to Exhale (1995),When Night Is Falling (1995),"White Balloon, The (Badkonake sefid) (1995)",White Squall (1996),Wings of Courage (1995),"Young Poisoner's Handbook, The (1995)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,3.5,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,5.0,,,,,,,
4,3.0,,,,,,,,,,...,,,,,,,,,,
5,,,,5.0,,,,,,,...,,2.0,,,,,,,,
6,,,,,,,,,,,...,,4.0,,,,,,,,
7,,,,4.0,,,,,,,...,,,,,,,,,,
8,1.0,,,,,,,,,,...,,,,,,,,,,
10,,,,4.0,,,,,,,...,,,,,,,,,,
11,3.5,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Column of ratings for the reference movie, indexed by user.
movie_watched = pivot_table["Bad Boys (1995)"]

# Correlate every movie column with the reference column, then rank the
# movies from the most to the least correlated.
# NOTE(review): no minimum number of users who rated both movies is enforced
# here, so a high score may rest on very few shared ratings.
similarity_with_other_movies = (
    pivot_table
    .corrwith(movie_watched)
    .sort_values(ascending=False)
)
similarity_with_other_movies.head()

title
Bad Boys (1995)                        1.000000
Headless Body in Topless Bar (1995)    0.723747
Last Summer in the Hamptons (1995)     0.607554
Two Bits (1995)                        0.507008
Shadows (Cienie) (1988)                0.494186
dtype: float64

* Based on these correlations, the movie to recommend to people who watched "Bad Boys (1995)" is "Headless Body in Topless Bar (1995)", because it has the highest correlation with it (about 0.72) apart from the movie itself.
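* On the other hand, although we did not take it into account here, the number of ratings each movie received is also important: a correlation computed from only a few shared ratings is not reliable.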