In [4]:
import numpy as np
import pandas as pd
import ast
import sklearn

In [5]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [6]:
movies.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
942,50000000,"[{""id"": 10749, ""name"": ""Romance""}, {""id"": 16, ...",http://www.bookoflifemovie.com/,228326,"[{""id"": 128, ""name"": ""love triangle""}, {""id"": ...",en,The Book of Life,"The journey of Manolo, a young man who is torn...",34.890999,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",2014-10-01,97437106,95.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,The Book of Life,7.3,755


In [7]:
credits.sample()

Unnamed: 0,movie_id,title,cast,crew
2020,14635,The Rookie,"[{""cast_id"": 1, ""character"": ""Jimmy Morris"", ""...","[{""credit_id"": ""52fe460a9251416c7506b15b"", ""de..."


In [8]:
movies.shape

(4803, 20)

In [9]:
credits.shape

(4803, 4)

In [10]:
movies = movies.merge(credits, on = 'title')
movies.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
2739,13000000,"[{""id"": 35, ""name"": ""Comedy""}]",http://stvincent-movie.com/,239563,"[{""id"": 2604, ""name"": ""babysitter""}, {""id"": 60...",en,St. Vincent,A young boy whose parents just divorced finds ...,43.791745,"[{""name"": ""The Weinstein Company"", ""id"": 308},...",...,102.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Love Thy Neighbor,St. Vincent,7.1,763,239563,"[{""cast_id"": 3, ""character"": ""Vincent"", ""credi...","[{""credit_id"": ""546eafd8c3a3682f9e000ae0"", ""de..."


#### Since we are not doing collaborative based filtering, the popularity, release date and other such numerical columns doesn't fit under our workflow, so we'll skip all such columns and select only useful columns for content-based filtering

In [11]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

Useful columns:
- genres
- keywords
- overview
- title
- movie_id
- cast
- crew

In [12]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [13]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


## Handling missing or duplicate value

In [14]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [15]:
movies.dropna(inplace=True)

In [16]:
movies.duplicated().sum()

0

## Preprocessing

In [17]:
def convert(obj):
    lst = []
    for i in ast.literal_eval(obj):
        lst.append(i['name'])
    return lst

In [18]:
movies['genres'] = movies['genres'].apply(convert)

In [19]:
movies['keywords'] = movies['keywords'].apply(convert)

In [20]:
def convert2(obj):
    lst2 = []
    cnt = 0
    for i in ast.literal_eval(obj):
        if cnt == 3:
            break
        else:
            lst2.append(i['character'])
            cnt+=1
    return lst2

In [21]:
movies['cast'] = movies['cast'].apply(convert2)

In [22]:
def findDirector(obj):
    dirs=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            dirs.append(i['name'])
    return dirs


In [23]:
movies['crew'] = movies['crew'].apply(findDirector)

In [24]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [25]:
movies

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Jake Sully, Neytiri, Dr. Grace Augustine]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain Jack Sparrow, Will Turner, Elizabeth ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[James Bond, Blofeld, Madeleine]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Bruce Wayne / Batman, Alfred Pennyworth, Jame...",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John Carter, Dejah Thoris, Sola]",[Andrew Stanton]
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui...","[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, pap...","[El Mariachi, Bigotón, Mauricio (Moco)]",[Robert Rodriguez]
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended...","[Comedy, Romance]",[],"[Buzzy, Linda, Marsha]",[Edward Burns]
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,...","[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investi...","[Oliver O’Toole, Shane McInerney, Rita Haywith]",[Scott Smith]
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is...",[],[],"[Sam, Amanda, Donald]",[Daniel Hsia]


We need to remove whitespaces in between the words.
reason: for e.g. if we proceed ahead to make tags without removing whitespaces then in (say) cast column James word from james Cameron will be considered same as in James Bond, which might lead to incorrect results

In [26]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [27]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[JakeSully, Neytiri, Dr.GraceAugustine]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[CaptainJackSparrow, WillTurner, ElizabethSwann]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[JamesBond, Blofeld, Madeleine]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[BruceWayne/Batman, AlfredPennyworth, JamesGor...",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[JohnCarter, DejahThoris, Sola]",[AndrewStanton]


In [28]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [29]:
print(movies['tags'][0])

['In', 'the', '22nd', 'century,', 'a', 'paraplegic', 'Marine', 'is', 'dispatched', 'to', 'the', 'moon', 'Pandora', 'on', 'a', 'unique', 'mission,', 'but', 'becomes', 'torn', 'between', 'following', 'orders', 'and', 'protecting', 'an', 'alien', 'civilization.', 'Action', 'Adventure', 'Fantasy', 'ScienceFiction', 'cultureclash', 'future', 'spacewar', 'spacecolony', 'society', 'spacetravel', 'futuristic', 'romance', 'space', 'alien', 'tribe', 'alienplanet', 'cgi', 'marine', 'soldier', 'battle', 'loveaffair', 'antiwar', 'powerrelations', 'mindandsoul', '3d', 'JakeSully', 'Neytiri', 'Dr.GraceAugustine', 'JamesCameron']


In [30]:
movies['tags'] = movies['tags'].apply(lambda x:" ".join(x)) #converting list to str

In [31]:
movies['tags'] = movies['tags'].apply(lambda x:x.lower())  #converting string to lowercase

In [32]:
new_df = movies[['movie_id','title','tags']]
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just wants to play his guitar and ...
4805,72766,Newlyweds,a newlywed couple's honeymoon is upended by th...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduces a dedic..."
4807,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...


## Model Building

Let's use one of the most interpretible and simple mechanism to do vectorization of tags i.e. "Bag of Words"

## Bag of Words (BoW) Algorithm

The Bag of Words (BoW) algorithm is a simple and commonly used method in Natural Language Processing (NLP) to convert text into numerical representations. It is particularly useful for text classification and sentiment analysis tasks. Below is an explanation of how the Bag of Words algorithm works:

### 1. Tokenization
   - The first step in the Bag of Words model is tokenization, where the text data is broken down into individual words or tokens.
   - For example, consider the following two sentences:
     - Sentence 1: "I love programming."
     - Sentence 2: "Programming is fun."

### 2. Stop Words Removal
   - Stop words are common words that appear frequently in the language but carry little meaningful information (e.g., "the," "is," "and," "in").
   - Before creating the vocabulary, these stop words are often removed to focus on the more significant words in the text.
   - For example:
     - Sentence 1 (after stop words removal): "love programming."
     - Sentence 2 (after stop words removal): "Programming fun."

### 3. Vocabulary Creation
   - After stop words removal, the next step is to create a vocabulary. The vocabulary is a set of all unique words (tokens) that appear in the entire corpus (the collection of documents or sentences).
   - From the example sentences, the vocabulary would be:
     ```
     Vocabulary: [love, programming, fun]
     ```

### 4. Vectorization
   - In this step, each document or sentence is represented as a vector. The vector has the same length as the vocabulary, with each position corresponding to a word from the vocabulary.
   - The value at each position in the vector is the frequency of the corresponding word in the document or sentence.

   - For our example sentences:
     - Sentence 1: "love programming."
       - Vector: `[1, 1, 0]`
     - Sentence 2: "Programming fun."
       - Vector: `[0, 1, 1]`

     Here, the vectors represent the frequency of each word from the vocabulary in the respective sentences.

### 5. Feature Matrix
   - When dealing with multiple documents or sentences, a feature matrix is created where each row corresponds to a document or sentence, and each column corresponds to a word from the vocabulary.
   - The feature matrix for the example sentences would look like this:

     |        | love | programming | fun |
     |--------|------|-------------|-----|
     | **S1** |  1   |      1      |  0  |
     | **S2** |  0   |      1      |  1  |

   - This matrix can now be used as input for machine learning algorithms.

### 6. Considerations
   - **Stop Words Removal:** Common words (stop words) are often removed to focus on the more significant words in the text.
   - **Order Ignored:** The Bag of Words model disregards the order of words in a sentence. It only considers word frequency.
   - **High Dimensionality:** The size of the feature vector depends on the size of the vocabulary, which can become large, especially with large corpora.
   - **Sparse Matrix:** Often, the resulting matrix is sparse, meaning that many of the values are zero, especially if the vocabulary is large.


In [33]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [34]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [35]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
vectors.shape

(4806, 5000)

In [37]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zombies', 'zone', 'zoo'], dtype=object)

One prominent issue in above features are it is considering the words 'Action' and 'Actions' as two distinct meanings, which can lead to problems.
This is where NLP comes into the picture, we'll use stemming technique which when given a list of similar words like ['love','loving','loved'] converts to ['love','love','love'] (root word).

In [38]:
import nltk

In [39]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [40]:
def stem(text):
    stem_list =[]
    for word in text.split():
        stem_list.append(ps.stem(word))
    return " ".join(stem_list)

In [41]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [42]:
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


Now, we know that in higher dimensions, Wuclidean distance is not a reliable measure, more the dimensions, more it fails. Instead, we'll rely on cosine diatance between two vectors (or simply angle between them). Lesser the cosine distance more the the similarities in the movies' vector.

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(vectors)
sim

array([[1.        , 0.09158438, 0.06003002, ..., 0.02423931, 0.02450715,
        0.        ],
       [0.09158438, 1.        , 0.06780635, ..., 0.02737928, 0.        ,
        0.        ],
       [0.06003002, 0.06780635, 1.        , ..., 0.0269191 , 0.        ,
        0.        ],
       ...,
       [0.02423931, 0.02737928, 0.0269191 , ..., 1.        , 0.06593805,
        0.04847862],
       [0.02450715, 0.        , 0.        , ..., 0.06593805, 1.        ,
        0.04901431],
       [0.        , 0.        , 0.        , ..., 0.04847862, 0.04901431,
        1.        ]])

In [44]:
sim.shape

(4806, 4806)

sim contains similarity score of each movie with each movie in the vectors. Similarity score ranges from [-1,1].

In [45]:
def recommend(movie):
    index = new_df.loc[new_df['title'] == movie].index[0]
    dist = sim[index]
    movie_list = sorted(list(enumerate(dist)),reverse=True,key=lambda x:x[1])[1:6]
    for i in movie_list:
        print(new_df.iloc[i[0]]['title'])
    
    
    

In [46]:
recommend("Batman Begins")

The Dark Knight
The Dark Knight Rises
Batman
Batman
Batman Returns


# It works! 👍

In [47]:
import pickle

In [48]:
# pickle.dump(new_df,open('movies.pkl','wb'))

In [49]:
pickle.dump(sim,open('sim.pkl','wb'))

In [50]:
print("numpy", np.__version__)
print("pandas", pd.__version__)
print("sklearn", sklearn.__version__)
print("nltk", nltk.__version__)

numpy 1.26.2
pandas 2.2.0
sklearn 1.4.1.post1
nltk 3.8.2
