In [1]:
import numpy as np
import pandas as pd

In [2]:
from IPython.display import display, HTML      #for the horizontal scrollbar that was absent 
display(HTML("<style>.jp-OutputArea-output {display:flex}</style>"))

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies = movies.merge(credits,on='title')

As we are building recommender system we have to distinguish on which basis we are going to recommend the movies, i am distinguishing the basis of the movies on which i can program my recommender system. I have picked some of the fields, #genres #id #keywords #title #overview #cast #crew

In [5]:
#eliminating the un-necessary attributes and replacing it with new dataset
movies = movies[['movie_id','genres','title','overview','keywords','cast','crew']]

In [6]:
movies.isnull().sum()   #checking if dataset has any of the null values

movie_id    0
genres      0
title       0
overview    3
keywords    0
cast        0
crew        0
dtype: int64

In [7]:
movies.dropna(inplace=True)  #dropping null values

In [8]:
movies.duplicated().sum()   #checking if dataset has any of the duplicated values

0

In [9]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

The above output is in the form of list of dictonaries, so the above output should be changed in the form of ['Action','Adventure','Fantasy','Science Fiction']

In [10]:
import ast
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [11]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [12]:
movies.head(1)

Unnamed: 0,movie_id,genres,title,overview,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [13]:
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

In [14]:
movies['cast'] = movies['cast'].apply(convert3)

In [15]:
movies.head(1)

Unnamed: 0,movie_id,genres,title,overview,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


We only need the director of the crew members to recommend, hence we only fetch the name of director of the crew
column. We use helper function below for that purpose

In [16]:
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [17]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [18]:
movies.head(1)

Unnamed: 0,movie_id,genres,title,overview,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [19]:
movies.overview[0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [20]:
#we need to convert above sentence to the list, after that we can give tags to each of the words
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [21]:
movies.head(3)

Unnamed: 0,movie_id,genres,title,overview,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]",Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


As we can see above there are two people named Sam Worthington and Sam Mendes, ihe recommender system can get confused 
because of the same name entity hence, we need to remove space between the names to create non-confusing tags. For that purpose we use lambda function

In [22]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ",'') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ",'') for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ",'') for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ",'') for i in x])

In [23]:
movies.head(1)

Unnamed: 0,movie_id,genres,title,overview,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [24]:
movies['tags'] = movies['overview']+ movies['keywords']+movies['cast']+movies['crew']

In [25]:
movies.head(1)

Unnamed: 0,movie_id,genres,title,overview,keywords,cast,crew,tags
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [26]:
new_df1 = movies[['movie_id','title','tags']]  #creating a new dataframe with new columns

In [27]:
new_df1.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [28]:
new_df1['tags'] = new_df1['tags'].apply(lambda x:" ".join(x)) #convert list into sentences

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1['tags'] = new_df1['tags'].apply(lambda x:" ".join(x)) #convert list into sentences


In [29]:
new_df1

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."
...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic..."
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...


In [30]:
new_df1['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [31]:
new_df1['tags'].apply(lambda x:x.lower())

0       in the 22nd century, a paraplegic marine is di...
1       captain barbossa, long believed to be dead, ha...
2       a cryptic message from bond’s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war-weary, former military ca...
                              ...                        
4804    el mariachi just wants to play his guitar and ...
4805    a newlywed couple's honeymoon is upended by th...
4806    "signed, sealed, delivered" introduces a dedic...
4807    when ambitious new york attorney sam is sent t...
4808    ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [32]:
new_df1['tags'] = new_df1['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1['tags'] = new_df1['tags'].apply(lambda x:x.lower())


In [33]:
new_df1.head()   

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


For the recommender system we can use vectorization process in which all the words are converted into the 
vector form and the recommender system displays the nearest movies in the vector form. For this process we use BagOfWords
vectorization to convert words into vector. There are other advanced processes to convert text to vectors, if you want you can use that also.

For the tags we need to exclude the stop words such as (in, the, a, ..., etc) 

In [34]:
import sklearn
import nltk

As we can see that there are many features that are repeating such as action, actions, accept, accepted....etc 
since the meaning of the word is despite of their verbs. To remove this we use stemming. Stemming is the process of reducing words to their root or base form, typically by removing suffixes and prefixes, to improve text analysis and information retrieval tasks.

In [35]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [36]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [37]:
stem('i am dancing and eating my food')  #exampple of stemming

'i am danc and eat my food'

In [38]:
new_df1['tags'] = new_df1['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1['tags'] = new_df1['tags'].apply(stem)


In [39]:
new_df1

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [40]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')   #extracting features and removing stop_words

In [41]:
vector = cv.fit_transform(new_df1['tags']).toarray()

In [42]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [43]:
features_names = cv.get_feature_names_out()
print(features_names[:100])

['000' '007' '10' '100' '11' '12' '13' '14' '15' '16' '17' '17th' '18'
 '18th' '18thcenturi' '19' '1910' '1920' '1930' '1940' '1944' '1950'
 '1950s' '1960' '1960s' '1970' '1970s' '1971' '1974' '1976' '1980' '1985'
 '1990' '1999' '19th' '19thcenturi' '20' '200' '2003' '2009' '20th' '21st'
 '23' '24' '25' '30' '300' '3d' '40' '50' '500' '60' '70' '80' 'aaron'
 'aaroneckhart' 'abandon' 'abduct' 'abigailbreslin' 'abil' 'abl' 'aboard'
 'abov' 'abus' 'academ' 'academi' 'accept' 'access' 'accid' 'accident'
 'acclaim' 'accompani' 'accomplish' 'account' 'accus' 'ace' 'achiev'
 'acquaint' 'act' 'action' 'actionhero' 'activ' 'activist' 'activities'
 'actor' 'actress' 'actual' 'ad' 'adam' 'adamsandl' 'adamshankman' 'adapt'
 'add' 'addict' 'adjust' 'admir' 'admit' 'adolesc' 'adopt' 'ador']


Now we have done all the data preprocessing taks, the remaining task is to initialize the vector and to map distances between each of the vector. For this purpose we use cosine distance rather than euclidian distance because we have large amount of data i.e. approx 5000 and euclidian distance tends to malfunction in large amount of data.

In [44]:
from sklearn.metrics.pairwise import  cosine_similarity

In [45]:
cosine_similarity(vector)

array([[1.        , 0.        , 0.03184649, ..., 0.02475369, 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02592379, 0.        ,
        0.0277137 ],
       [0.03184649, 0.        , 1.        , ..., 0.02680281, 0.        ,
        0.        ],
       ...,
       [0.02475369, 0.02592379, 0.02680281, ..., 1.        , 0.0412393 ,
        0.04454354],
       [0.        , 0.        , 0.        , ..., 0.0412393 , 1.        ,
        0.08817334],
       [0.        , 0.0277137 , 0.        , ..., 0.04454354, 0.08817334,
        1.        ]])

In [46]:
similarity = cosine_similarity(vector)

In [47]:
similarity.shape

(4806, 4806)

In [48]:
def recommend(movie):       #function to recommend the movies
    movie_index = new_df1[new_df1['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse = True, key= lambda x:x[1])[1:6]
    #this above code is for sorting the vector values in descending order such that the most closest vector values appears 
    #at the top of list and hence, we display the firt 5 values which is the recommended movie by the module

    for i in movies_list:
        print(new_df1.iloc[i[0]].title)


In [56]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Battle: Los Angeles
Independence Day
Falcon Rising


In [50]:
import pickle

In [51]:
# pickle.dump(new_df1.to_dict(),open('movies_dict.pkl','wb'))  #importing df to pkl file to show in streamlit frontend

In [52]:
# pickle.dump(similarity,open('similarity.pkl','wb'))