# Movie Recommendation System based on User Profile - Content Based Recommendation System


### Import Libraries

In [204]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Importing Files 

### Import credits csv file - contains movie_id, title, cast, crew 

In [205]:
# Read credits csv file
credits = pd.read_csv("data/dataset2/tmdb_5000_credits.csv")
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### Import movies csv file - contains data regarding the movie - genres, keywords, popularity etc

In [206]:
# Read movies csv file
movies = pd.read_csv("data/dataset2/tmdb_5000_movies.csv")
movies.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


## Drop unnecessary columns

### Dropping few columns in movies dataframe

In [207]:
#Removing unnecessary columns in movies
movies = movies.drop(['budget', 'homepage', 'original_language', 'revenue', 'release_date', 'runtime', 'spoken_languages', 'production_countries', 'status', 'title', 'tagline'], axis = 1)
movies.head(3)

Unnamed: 0,genres,id,keywords,original_title,overview,popularity,production_companies,vote_average,vote_count
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",7.2,11800
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",6.9,4500
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",6.3,4466


### Dropping title and crew columns from credits dataframe

In [208]:
#Removing unnecessary columns in credits
credits = credits.drop(['title', 'crew'], axis = 1)
credits.head(3)

Unnamed: 0,movie_id,cast
0,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""..."
1,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa..."
2,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr..."


## Merge Movies and credits dataframes

### Rename id column of movies dataframe - for merging

In [209]:
#Rename id coulmn of movies dataframe
movies = movies.rename(columns={"id": "movie_id"})
movies.head(3)

Unnamed: 0,genres,movie_id,keywords,original_title,overview,popularity,production_companies,vote_average,vote_count
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",7.2,11800
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",6.9,4500
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",6.3,4466


### Merge the two dataframes

In [210]:
#Merge movies and credits tables
movies = movies.merge(credits, on='movie_id')
movies.head()

Unnamed: 0,genres,movie_id,keywords,original_title,overview,popularity,production_companies,vote_average,vote_count,cast
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."


## Converting JSON Columns to string of words

### 1. JSON to String of words - Keywords column

In [211]:
#Getting the keywords as a string of words from keywords column 
keywordsl = []
for k in range(len(movies['keywords'])):
    #print(k)
    keywords_json = movies['keywords'][k]
    if (keywords_json == '[]'):
        keywordsl.append('')
    else:
        keywordsdf = pd.read_json(keywords_json)
        keywords_list = keywordsdf['name'].tolist()
        keywords_str = ' '.join(keywords_list)
        keywordsl.append(keywords_str)
        
#Checking the size of new keywords to actual table
print(len(keywordsl))
print(movies.shape)

#Change the keywords json column to parsed keywords 
movies['keywords'] = keywordsl

4803
(4803, 10)


### 2. JSON to String of words - genres column

In [212]:
#Getting the genres as a string of words from genres column 
genresl = []
for k in range(len(movies['genres'])):
    #print(k)
    genres_json = movies['genres'][k]
    if (genres_json == '[]'):
        genresl.append('')
    else:
        genresdf = pd.read_json(genres_json)
        genres_list = genresdf['name'].tolist()
        genres_str = ' '.join(genres_list)
        genresl.append(genres_str)
        
#Checking the size of new keywords to actual table
print(len(genresl))
print(movies.shape)

#Change the keywords json column to parsed keywords 
movies['genres'] = genresl

4803
(4803, 10)


### 3. JSON to String of words - production_companies column

In [213]:
#Getting the production_companies as a string of words from production_companies column 
production_companiesl = []
for k in range(len(movies['production_companies'])):
    #print(k)
    production_companies_json = movies['production_companies'][k]
    if (production_companies_json == '[]'):
        production_companiesl.append('')
    else:
        production_companiesdf = pd.read_json(production_companies_json)
        production_companies_list = production_companiesdf['name'].tolist()
        production_companies_str = ';'.join(production_companies_list)
        production_companiesl.append(production_companies_str)
        
#Checking the size of new keywords to actual table
print(len(production_companiesl))
print(movies.shape)

#Change the keywords json column to parsed keywords 
movies['production_companies'] = production_companiesl

4803
(4803, 10)


### 4. JSON to String of words - cast column

In [214]:
#Getting the cast as a string of words from cast column 
castl = []
for k in range(len(movies['cast'])):
    #print(k)
    cast_json = movies['cast'][k]
    if (cast_json == '[]'):
        castl.append('')
    else:
        castdf = pd.read_json(cast_json)
        cast_list = castdf['name'].tolist()
        cast_str = ';'.join(cast_list)
        castl.append(cast_str)
        
#Checking the size of new keywords to actual table
print(len(castl))
print(movies.shape)

#Change the keywords json column to parsed keywords 
movies['cast'] = castl

4803
(4803, 10)


### After converting from JSON

In [215]:
movies.head()

Unnamed: 0,genres,movie_id,keywords,original_title,overview,popularity,production_companies,vote_average,vote_count,cast
0,Action Adventure Fantasy Science Fiction,19995,culture clash future space war space colony so...,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,Ingenious Film Partners;Twentieth Century Fox ...,7.2,11800,Sam Worthington;Zoe Saldana;Sigourney Weaver;S...
1,Adventure Fantasy Action,285,ocean drug abuse exotic island east india trad...,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,Walt Disney Pictures;Jerry Bruckheimer Films;S...,6.9,4500,Johnny Depp;Orlando Bloom;Keira Knightley;Stel...
2,Action Adventure Crime,206647,spy based on novel secret agent sequel mi6 bri...,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,Columbia Pictures;Danjaq;B24,6.3,4466,Daniel Craig;Christoph Waltz;Léa Seydoux;Ralph...
3,Action Crime Drama Thriller,49026,dc comics crime fighter terrorist secret ident...,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,Legendary Pictures;Warner Bros.;DC Entertainme...,7.6,9106,Christian Bale;Michael Caine;Gary Oldman;Anne ...
4,Action Adventure Science Fiction,49529,based on novel mars medallion space travel pri...,John Carter,"John Carter is a war-weary, former military ca...",43.926995,Walt Disney Pictures,6.1,2124,Taylor Kitsch;Lynn Collins;Samantha Morton;Wil...


### Information about Movies Dataframe

In [216]:
print(movies.shape)
print(movies.info())

(4803, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 10 columns):
genres                  4803 non-null object
movie_id                4803 non-null int64
keywords                4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
vote_average            4803 non-null float64
vote_count              4803 non-null int64
cast                    4803 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 572.8+ KB
None


## Import and filter ratings csv file

### Import ratings cvs file

In [217]:
# Read ratings csv file
ratings = pd.read_csv("data/dataset1/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


### Drop unnecessary columns

In [218]:
#Removing unnecessary columns in ratings
ratings = ratings.drop(['timestamp'], axis = 1)
ratings.head(3)

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0


### Rename movieid column 

In [219]:
#Rename id coulmn of ratings dataframe
ratings = ratings.rename(columns={"movieId": "movie_id"})
ratings.head(3)

Unnamed: 0,userId,movie_id,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0


### Info of Ratings file

In [220]:
print(ratings.shape)
print(ratings.info())

(26024289, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 3 columns):
userId      int64
movie_id    int64
rating      float64
dtypes: float64(1), int64(2)
memory usage: 595.6 MB
None


### Checking for empty rows and columns

In [221]:
#checking empty columns in dataframe
ratings.isnull().sum(axis = 0)

userId      0
movie_id    0
rating      0
dtype: int64

In [222]:
#checking empty rows in dataframe
ratings.isnull().sum(axis = 1)

0           0
1           0
2           0
3           0
4           0
5           0
6           0
7           0
8           0
9           0
10          0
11          0
12          0
13          0
14          0
15          0
16          0
17          0
18          0
19          0
20          0
21          0
22          0
23          0
24          0
25          0
26          0
27          0
28          0
29          0
           ..
26024259    0
26024260    0
26024261    0
26024262    0
26024263    0
26024264    0
26024265    0
26024266    0
26024267    0
26024268    0
26024269    0
26024270    0
26024271    0
26024272    0
26024273    0
26024274    0
26024275    0
26024276    0
26024277    0
26024278    0
26024279    0
26024280    0
26024281    0
26024282    0
26024283    0
26024284    0
26024285    0
26024286    0
26024287    0
26024288    0
Length: 26024289, dtype: int64

## Creating USER Profiles and ITEM Profiles - GENRES

### 1. Creating Item Profiles 

### Getting Genres List

In [223]:
#initializing genres list
genres_list = []
genres_list.append('movie_id')
genres_list.append('genres')

In [224]:
#get total genres list
listg = []
for k in range(len(movies['genres'])):
    #print(k)
    listg = movies['genres'][k].split() #split string into a list
    #print(listg)
    for genre in listg:
        if (genre not in genres_list):
            genres_list.append(genre)
print(genres_list)

['movie_id', 'genres', 'Action', 'Adventure', 'Fantasy', 'Science', 'Fiction', 'Crime', 'Drama', 'Thriller', 'Animation', 'Family', 'Western', 'Comedy', 'Romance', 'Horror', 'Mystery', 'History', 'War', 'Music', 'Documentary', 'Foreign', 'TV', 'Movie']


### Initializing genres list as columns along with movie_id and genres

In [225]:
movie_generes_df = pd.DataFrame(columns = genres_list) 
movie_generes_df

Unnamed: 0,movie_id,genres,Action,Adventure,Fantasy,Science,Fiction,Crime,Drama,Thriller,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV,Movie


### Filling movie_id and genres with data from movies

In [226]:
movie_generes_df['movie_id'] = movies['movie_id']
movie_generes_df['genres'] = movies['genres']
movie_generes_df.head(3)

Unnamed: 0,movie_id,genres,Action,Adventure,Fantasy,Science,Fiction,Crime,Drama,Thriller,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV,Movie
0,19995,Action Adventure Fantasy Science Fiction,,,,,,,,,...,,,,,,,,,,
1,285,Adventure Fantasy Action,,,,,,,,,...,,,,,,,,,,
2,206647,Action Adventure Crime,,,,,,,,,...,,,,,,,,,,


### Fill values - Contains a genre: 1.0; else 0.0

In [227]:
#fill values
listg = []
for k in range(len(movie_generes_df['movie_id'])):
    #print(k)
    listg = movies['genres'][k].split() #split string into a list
    #print(listg)
    for genre in listg:
        movie_generes_df[genre][k] = 1.0
movie_generes_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,movie_id,genres,Action,Adventure,Fantasy,Science,Fiction,Crime,Drama,Thriller,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV,Movie
0,19995,Action Adventure Fantasy Science Fiction,1,1,1.0,1.0,1.0,,,,...,,,,,,,,,,
1,285,Adventure Fantasy Action,1,1,1.0,,,,,,...,,,,,,,,,,
2,206647,Action Adventure Crime,1,1,,,,1.0,,,...,,,,,,,,,,


### Fill NaN with 0

In [228]:
movie_generes_df = movie_generes_df.fillna(0)
movie_generes_df.head(3)

Unnamed: 0,movie_id,genres,Action,Adventure,Fantasy,Science,Fiction,Crime,Drama,Thriller,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV,Movie
0,19995,Action Adventure Fantasy Science Fiction,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,285,Adventure Fantasy Action,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,206647,Action Adventure Crime,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Creating User Profile

In [229]:
#Movies rated by certain user
ratingswithuser = ratings.loc[ratings['userId'] == 2]
ratingswithuser

Unnamed: 0,userId,movie_id,rating
27,2,5,3.0
28,2,25,3.0
29,2,32,2.0
30,2,58,3.0
31,2,64,4.0
32,2,79,4.0
33,2,141,3.0
34,2,260,4.0
35,2,339,5.0
36,2,377,4.0


### Get subset from movie_generes_df - Subset is Movies the user rated

In [230]:
#Movies data available for the rated movies
movie_genereswithuser = movie_generes_df.loc[movie_generes_df['movie_id'].isin(ratingswithuser['movie_id'])]
movie_genereswithuser

Unnamed: 0,movie_id,genres,Action,Adventure,Fantasy,Science,Fiction,Crime,Drama,Thriller,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV,Movie
12,58,Adventure Fantasy Action,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123,605,Adventure Action Thriller Science Fiction,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
557,25,Drama War,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
765,786,Drama Music,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
912,628,Horror Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1136,79,Drama Adventure Action History,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1378,377,Horror,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1811,788,Comedy Drama Family,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3706,141,Fantasy Drama Mystery,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3766,5,Crime Comedy,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Getting the ratings only for the movies data that is available

In [231]:
#Cutting down movies with available movie data
ratingswithuser = ratingswithuser.loc[ratingswithuser['movie_id'].isin(movie_genereswithuser['movie_id'])]
ratingswithuser

Unnamed: 0,userId,movie_id,rating
27,2,5,3.0
28,2,25,3.0
30,2,58,3.0
32,2,79,4.0
33,2,141,3.0
36,2,377,4.0
37,2,605,4.0
38,2,628,4.0
40,2,762,3.0
42,2,786,1.0


### Merge ratingswithuser and movie_genereswithuser

In [232]:
#Merge movies and credits tables
mergeduserdata = ratingswithuser.merge(movie_genereswithuser, on='movie_id')
mergeduserdata

Unnamed: 0,userId,movie_id,rating,genres,Action,Adventure,Fantasy,Science,Fiction,Crime,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV,Movie
0,2,5,3.0,Crime Comedy,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,25,3.0,Drama War,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,58,3.0,Adventure Fantasy Action,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,79,4.0,Drama Adventure Action History,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,141,3.0,Fantasy Drama Mystery,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2,377,4.0,Horror,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2,605,4.0,Adventure Action Thriller Science Fiction,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2,628,4.0,Horror Romance,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2,762,3.0,Adventure Comedy Fantasy,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2,786,1.0,Drama Music,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Get only genres names list - remove movie_id and genres

In [233]:
#Get only genres names list
only_genres = []
only_genres = genres_list.copy()
only_genres.remove('movie_id')
only_genres.remove('genres')

### Creating number of movies, their movie ids, their ratings, average user rating for particular genre

In [234]:
#Creating the user profile dataframe
userProfile = pd.DataFrame(only_genres, columns = ['genres']) 
userProfile['number of movies'] = 0
userProfile['movie ids'] = 0
userProfile['ratings'] = 0
userProfile['average user rating'] = 0
userProfile['values'] = 0

for i in userProfile['genres']:
    #index value of genre (i)
    k = userProfile.index[userProfile['genres'] == i]
    
    #Sum of movies with genre i
    userProfile.iloc[k, 1] = movie_genereswithuser[i].sum()
    
    #Movie ids with genres i
    df_m = mergeduserdata.loc[mergeduserdata[i] > 0]['movie_id'] #Movies list with genre i(>0 meaning the movie contains a ceratin genre)
    ids_m = df_m.values.tolist() #convert dataframe to list
    idsm = [str(i) for i in ids_m] #convert list of ints to list of string
    movie_ids = ' '.join(idsm) #concatinate as a string
    userProfile.iloc[k, 2] = str(movie_ids)
    
    #Movie ratings with genres i
    df_r = mergeduserdata.loc[mergeduserdata[i] > 0]['rating'] #Movies list with genre i(>0 meaning the movie contains a ceratin genre)
    ids_r = df_r.values.tolist() #convert dataframe to list
    idsr = [str(i) for i in ids_r] #convert list of ints to list of string
    ratings_ = ' '.join(idsr) #concatinate as a string
    userProfile.iloc[k, 3] = str(ratings_)
    
    #Average user rating
    user_avg_rating = mergeduserdata['rating'].mean()
    userProfile['average user rating'] = user_avg_rating
    
userProfile

Unnamed: 0,genres,number of movies,movie ids,ratings,average user rating,values
0,Action,3.0,58 79 605,3.0 4.0 4.0,3.0,0
1,Adventure,4.0,58 79 605 762,3.0 4.0 4.0 3.0,3.0,0
2,Fantasy,3.0,58 141 762,3.0 3.0 3.0,3.0,0
3,Science,1.0,605,4.0,3.0,0
4,Fiction,1.0,605,4.0,3.0,0
5,Crime,1.0,5,3.0,3.0,0
6,Drama,5.0,25 79 141 786 788,3.0 4.0 3.0 1.0 1.0,3.0,0
7,Thriller,1.0,605,4.0,3.0,0
8,Animation,0.0,,,3.0,0
9,Family,1.0,788,1.0,3.0,0


### Generate their values(final value for particular genre) -> value = (ratings_sum - (noofmovies * avg_rating))/noofmovies

In [235]:
for k in range(len(userProfile)):
    if (userProfile.iloc[k, 1] > 0):
        ratingsString = userProfile.iloc[k, 3]
        ratingsList = list(ratingsString.split(" "))
        for i in range(0, len(ratingsList)):
            ratingsList[i] = float(ratingsList[i])
        ratings_sum  = sum(ratingsList)
        #print(ratings_sum)
        noofmovies = userProfile.iloc[k, 1]
        avg_rating = userProfile.iloc[k, 4]
        userProfile.iloc[k, 5] = (ratings_sum - (noofmovies * avg_rating))/noofmovies
    else:
        userProfile.iloc[k, 5] = 0
        
userProfile

Unnamed: 0,genres,number of movies,movie ids,ratings,average user rating,values
0,Action,3.0,58 79 605,3.0 4.0 4.0,3.0,0.666667
1,Adventure,4.0,58 79 605 762,3.0 4.0 4.0 3.0,3.0,0.5
2,Fantasy,3.0,58 141 762,3.0 3.0 3.0,3.0,0.0
3,Science,1.0,605,4.0,3.0,1.0
4,Fiction,1.0,605,4.0,3.0,1.0
5,Crime,1.0,5,3.0,3.0,0.0
6,Drama,5.0,25 79 141 786 788,3.0 4.0 3.0 1.0 1.0,3.0,-0.6
7,Thriller,1.0,605,4.0,3.0,1.0
8,Animation,0.0,,,3.0,0.0
9,Family,1.0,788,1.0,3.0,-2.0


### User Profile Vector

In [236]:
#Converting columns to Numpy Arrays - Getting User profile and Item profile Vectors

#User Profile
userProfileVector = userProfile['values'].to_numpy()
userProfileVector = np.array(userProfileVector)
userProfileVector = userProfileVector.reshape(1,-1)
userProfileVector

array([[ 0.66666667,  0.5       ,  0.        ,  1.        ,  1.        ,
         0.        , -0.6       ,  1.        ,  0.        , -2.        ,
         0.        , -0.66666667,  1.        ,  1.        ,  0.        ,
         1.        ,  0.        , -2.        ,  0.        ,  0.        ,
         0.        ,  0.        ]])

### Getting Movies the user hasn't watched

In [237]:
#Getting movies the user hasen't watched or rated
movieswithoutuser = movie_generes_df.loc[~movie_generes_df['movie_id'].isin(ratingswithuser['movie_id'])]
movieswithoutuser.head(3)

Unnamed: 0,movie_id,genres,Action,Adventure,Fantasy,Science,Fiction,Crime,Drama,Thriller,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV,Movie
0,19995,Action Adventure Fantasy Science Fiction,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,285,Adventure Fantasy Action,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,206647,Action Adventure Crime,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### COSINE SIMILARITY - User Profile Vector & Item Profile Vector

In [238]:
#Item Profile & Similarity
movieID_list = []
similarity_list = []
for k in range(len(movieswithoutuser)):
    movieID = movieswithoutuser.iloc[k, 0]
    movieID_list.append(movieID)
    movieProfileVector = movieswithoutuser.iloc[k, 2:].to_numpy()
    movieProfileVector = np.array(movieProfileVector)
    movieProfileVector = movieProfileVector.reshape(1,-1)
    similarity = cosine_similarity(userProfileVector, movieProfileVector)
    similarity_list.append(float(similarity[0]))

In [239]:
#Create dataframe for most similar movies
genres_similarity = pd.DataFrame(movieID_list, columns = ['movie_id']) 
genres_similarity['similarity_genres'] = similarity_list
genres_similarity

Unnamed: 0,movie_id,similarity
0,19995,0.359722
1,285,0.171094
2,206647,0.171094
3,49026,0.135472
4,49529,0.402181
5,559,0.171094
6,38757,-0.359223
7,99861,0.402181
8,767,-0.219979
9,209112,0.171094


### Movies the user watched

In [240]:
#Movies the user watched
userMovies = mergeduserdata[['userId', 'movie_id', 'rating']]
userMovies_df = userMovies.merge(movies, on='movie_id')
userMovies_df = userMovies_df.drop(['keywords', 'overview', 'popularity', 'production_companies', 'vote_average', 'vote_count', 'cast'], axis = 1)
userMovies_df

Unnamed: 0,userId,movie_id,rating,genres,original_title
0,2,5,3.0,Crime Comedy,Four Rooms
1,2,25,3.0,Drama War,Jarhead
2,2,58,3.0,Adventure Fantasy Action,Pirates of the Caribbean: Dead Man's Chest
3,2,79,4.0,Drama Adventure Action History,英雄
4,2,141,3.0,Fantasy Drama Mystery,Donnie Darko
5,2,377,4.0,Horror,A Nightmare on Elm Street
6,2,605,4.0,Adventure Action Thriller Science Fiction,The Matrix Revolutions
7,2,628,4.0,Horror Romance,Interview with the Vampire
8,2,762,3.0,Adventure Comedy Fantasy,Monty Python and the Holy Grail
9,2,786,1.0,Drama Music,Almost Famous


### Get Movie titles for the similar movies - Recommended Movies

In [241]:
#Merge movies and credits tables
similar_movies = genres_similarity.sort_values(by=['similarity_genres'], ascending=False)
similarMovies_df = similar_movies.merge(movies, on='movie_id')
similarMovies_df = similarMovies_df.drop(['keywords', 'overview', 'popularity', 'production_companies', 'vote_average', 'vote_count', 'cast'], axis = 1)
similarMovies_df.head(10)

Unnamed: 0,movie_id,similarity,genres,original_title
0,11237,0.535778,Adventure Action Horror Science Fiction Thriller,Anacondas: The Hunt for the Blood Orchid
1,72710,0.535778,Action Adventure Romance Science Fiction Thriller,The Host
2,10003,0.535778,Thriller Action Romance Science Fiction Adventure,The Saint
3,348,0.530117,Horror Action Thriller Science Fiction,Alien
4,71469,0.530117,Horror Action Thriller Science Fiction,The Darkest Hour
5,77156,0.530117,Horror Action Thriller Science Fiction,Alien Zone
6,679,0.530117,Horror Action Thriller Science Fiction,Aliens
7,15158,0.530117,Action Horror Science Fiction Thriller,Phantasm II
8,8337,0.530117,Action Horror Science Fiction Thriller,They Live
9,17577,0.530117,Action Horror Thriller Science Fiction,The Devil's Tomb


## Creating USER Profiles and ITEM Profiles - KEYWORDS

### Getting keywords for all the movies

In [242]:
movie_keywords_df = movies[['movie_id', 'original_title', 'keywords']]
movie_keywords_df.head()

Unnamed: 0,movie_id,original_title,keywords
0,19995,Avatar,culture clash future space war space colony so...
1,285,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...
2,206647,Spectre,spy based on novel secret agent sequel mi6 bri...
3,49026,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...
4,49529,John Carter,based on novel mars medallion space travel pri...


### Filling NAN with empty string

In [243]:
# Filling NaNs with empty string
movie_keywords_df['keywords'] = movie_keywords_df['keywords'].fillna('')
movie_keywords_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,movie_id,original_title,keywords
0,19995,Avatar,culture clash future space war space colony so...
1,285,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...
2,206647,Spectre,spy based on novel secret agent sequel mi6 bri...
3,49026,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...
4,49529,John Carter,based on novel mars medallion space travel pri...


### Fitting TF-IDF on Keywords 

In [244]:
# Fitting the TF-IDF on the 'keywords' text
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer  = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')
tfv_matrix = vectorizer.fit_transform(movie_keywords_df['keywords'])
#print(vectorizer.get_feature_names())

In [245]:
print(tfv_matrix.shape)

(4803, 5316)


### Sigmoid kernel to get similarity between each movies

In [246]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

# Reverse mapping of indices and movie titles
indices = pd.Series(movie_keywords_df.index, index=movie_keywords_df['original_title']).drop_duplicates()

### Function - To provide similarity scores for each movie the user watched through enumeration

In [247]:
def give_rec(title, rating, sig=sig):
    # Get the index corresponding to original_title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Similarity scores with all the movies
    sig_scores = sig_scores[0:]
    
    # Movie indices
    movie_indices = [i[0] for i in sig_scores]
    movie_ids = movies['movie_id'].iloc[movie_indices]
    movie_scores = [i[1] for i in sig_scores]
    
    #Creating the user profile dataframe
    keywords_similarity[title] = movie_scores
    keywords_similarity[title] = rating * keywords_similarity[title]

### List of movies watched by a certain user - userId, Movieid, rating, title

In [248]:
#Movies rated by certain user
ratingswithuser_K = ratings.loc[ratings['userId'] == 2]

#Movies data available for the rated movies
movie_keywordswithuser = movies.loc[movies['movie_id'].isin(ratingswithuser_K['movie_id'])]
movie_keywordswithuser

#Cutting down movies with available movie data
ratingswithuser_K = ratingswithuser_K.loc[ratingswithuser_K['movie_id'].isin(movie_keywordswithuser['movie_id'])]

#Merge movies and credits tables
ratingswithuser_K = ratingswithuser_K.merge(movies, on='movie_id')
ratingswithuser_K = ratingswithuser_K[['userId', 'movie_id', 'rating', 'original_title']]
ratingswithuser_K

Unnamed: 0,userId,movie_id,rating,original_title
0,2,5,3.0,Four Rooms
1,2,25,3.0,Jarhead
2,2,58,3.0,Pirates of the Caribbean: Dead Man's Chest
3,2,79,4.0,英雄
4,2,141,3.0,Donnie Darko
5,2,377,4.0,A Nightmare on Elm Street
6,2,605,4.0,The Matrix Revolutions
7,2,628,4.0,Interview with the Vampire
8,2,762,3.0,Monty Python and the Holy Grail
9,2,786,1.0,Almost Famous


### Building a dataframe with movies names as columns. These columns names are the similarity scores with all the other movies

In [249]:
keywords_similarity = pd.DataFrame(columns = ratingswithuser_K['original_title']) 
keywords_similarity.columns.name = None
keywords_similarity.insert(0, 'movie_id', movies['movie_id'] )
keywords_similarity.head(3)

Unnamed: 0,movie_id,Four Rooms,Jarhead,Pirates of the Caribbean: Dead Man's Chest,英雄,Donnie Darko,A Nightmare on Elm Street,The Matrix Revolutions,Interview with the Vampire,Monty Python and the Holy Grail,Almost Famous,Mrs. Doubtfire
0,19995,,,,,,,,,,,
1,285,,,,,,,,,,,
2,206647,,,,,,,,,,,


### Calling the give_rec funtion which gives the similarity scores with these movies(movies in the columns i.e.,the ones the user watched)

In [250]:
for k in range(len(ratingswithuser_K)):
    titleM = ratingswithuser_K.iloc[k, 3]
    ratingM = ratingswithuser_K.iloc[k, 2]
    give_rec(titleM, ratingM)

In [251]:
keywords_similarity.head(3)

Unnamed: 0,movie_id,Four Rooms,Jarhead,Pirates of the Caribbean: Dead Man's Chest,英雄,Donnie Darko,A Nightmare on Elm Street,The Matrix Revolutions,Interview with the Vampire,Monty Python and the Holy Grail,Almost Famous,Mrs. Doubtfire
0,19995,2.284782,2.2848,2.284782,3.046377,2.284788,3.046377,3.04638,3.046377,2.284782,0.761594,0.761596
1,285,2.284789,2.284786,2.28492,3.046377,2.284782,3.046377,3.046378,3.046377,2.284782,0.761595,0.761594
2,206647,2.284782,2.284782,2.284782,3.046377,2.284782,3.046377,3.046377,3.046377,2.284782,0.761595,0.761594


### Adding all these similarity scores and normalizing them with the total rating of the user

In [252]:
keywords_similarity['similarity_keywords'] = keywords_similarity.iloc[:, 1:].sum(axis=1)

#Normalizing the similarity scores after total
ratings_sum = ratingswithuser_K['rating'].sum()
keywords_similarity['similarity_keywords'] = keywords_similarity['similarity_keywords'] / ratings_sum
keywords_similarity = keywords_similarity[['movie_id', 'similarity_keywords']]
keywords_similarity.head()

Unnamed: 0,movie_id,similarity_keywords
0,19995,0.761595
1,285,0.761599
2,206647,0.761594
3,49026,0.761594
4,49529,0.761594


### Deleting the movies from this dataframe and including only the movies the user hasn't watched

In [253]:
#Getting movies the user hasen't watched or rated
keywords_similarity = keywords_similarity.loc[~keywords_similarity['movie_id'].isin(ratingswithuser_K['movie_id'])]
keywords_similarity.head(10)

Unnamed: 0,movie_id,similarity_keywords
0,19995,0.761595
1,285,0.761599
2,206647,0.761594
3,49026,0.761594
4,49529,0.761594
5,559,0.761595
6,38757,0.761594
7,99861,0.761594
8,767,0.761595
9,209112,0.761594


### Soting the similarity scores in descending order

In [254]:
similar_movies_keywords = keywords_similarity.sort_values(by=['similarity_keywords'], ascending=False)
similar_movies_keywords.head(10)

Unnamed: 0,movie_id,similarity_keywords
125,604,0.761602
634,603,0.7616
2044,24100,0.7616
612,24021,0.761599
1599,293863,0.761599
2452,11093,0.761599
3473,15256,0.761599
3772,8141,0.761599
43,534,0.761599
1,285,0.761599


### RECOMMENDED Movies based on tf-idf of Keywords present in each movie

In [255]:
#Merge movies and credits tables
similar_movies_keywords_df = similar_movies_keywords.merge(movies, on='movie_id')
similar_movies_keywords_df = similar_movies_keywords_df.drop(['keywords', 'overview', 'popularity', 'production_companies', 'vote_average', 'vote_count', 'cast'], axis = 1)
similar_movies_keywords_df.head(10)

Unnamed: 0,movie_id,similarity_keywords,genres,original_title
0,604,0.761602,Adventure Action Thriller Science Fiction,The Matrix Reloaded
1,603,0.7616,Action Science Fiction,The Matrix
2,24100,0.7616,Family Horror,The Little Vampire
3,24021,0.761599,Adventure Fantasy Drama Romance,The Twilight Saga: Eclipse
4,293863,0.761599,Fantasy Drama Romance,The Age of Adaline
5,11093,0.761599,Drama,House of Sand and Fog
6,15256,0.761599,Comedy Drama Romance,200 Cigarettes
7,8141,0.761599,Comedy Crime Thriller Romance,You Kill Me
8,534,0.761599,Action Science Fiction Thriller,Terminator Salvation
9,285,0.761599,Adventure Fantasy Action,Pirates of the Caribbean: At World's End
