In [95]:
import numpy as np
import pandas as pd
import ast
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [96]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [97]:
movies = movies.merge(credits, on='title')

In [98]:
# Columns to be dropped
# Budget, homepage, original_language, original_title, popularity, production_companies, production_countries, revenue, runtime, spoken_languages, status, tagline, vote_average, vote_count
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [99]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


##Pre-processing

In [100]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [101]:
movies.dropna(inplace=True)

In [102]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [103]:
movies.duplicated().sum()

np.int64(0)

In [104]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [105]:
def extract_lst_names(json_str):
    """
    Extracts genre names from a JSON-like string and returns them as a comma-separated string.
    
    Parameters:
    json_str (str): A JSON-like string representing a list of genres.
    
    Returns:
    str: A comma-separated string of names.
    """
    json_list = ast.literal_eval(json_str)
    lst_names = [lst['name'] for lst in json_list]
    return ', '.join(lst_names)

In [106]:
# Apply the function to the 'genres' column and store the result back in the same column
movies['genres'] = movies['genres'].apply(extract_lst_names)

# Printing the first few rows to verify
print(movies[['genres']].head())

                                        genres
0  Action, Adventure, Fantasy, Science Fiction
1                   Adventure, Fantasy, Action
2                     Action, Adventure, Crime
3               Action, Crime, Drama, Thriller
4           Action, Adventure, Science Fiction


In [107]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Adventure, Fantasy, Action","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action, Adventure, Crime","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action, Crime, Drama, Thriller","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action, Adventure, Science Fiction","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [108]:
movies['keywords'] = movies['keywords'].apply(extract_lst_names)

In [109]:
movies.iloc[0].keywords

'culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe, alien planet, cgi, marine, soldier, battle, love affair, anti war, power relations, mind and soul, 3d'

In [110]:
def extract_top3cast_names(json_str):
    """
    Extracts top 3 cast names from a JSON-like string and returns them as a comma-separated string.
    
    Parameters:
    json_str (str): A JSON-like string representing a list of genres.
    
    Returns:
    str: A comma-separated string of cast names.
    """
    cast_list = ast.literal_eval(json_str)
    top3_cast_names = [cast['name'] for cast in cast_list[:3]]
    return ', '.join(top3_cast_names)

In [111]:
movies['cast'] = movies['cast'].apply(extract_top3cast_names)

In [112]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...","Sam Worthington, Zoe Saldana, Sigourney Weaver","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...","Johnny Depp, Orlando Bloom, Keira Knightley","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...","Daniel Craig, Christoph Waltz, Léa Seydoux","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...","Christian Bale, Michael Caine, Gary Oldman","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...","Taylor Kitsch, Lynn Collins, Samantha Morton","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [113]:
def extract_director_name(crew_str):
    """
    Extracts the director's name from a JSON-like string representing the crew.
    
    Parameters:
    crew_str (str): A JSON-like string representing a list of crew members.
    
    Returns:
    str: The name of the director, or an empty string if no director is found.
    """
    crew_list = ast.literal_eval(crew_str)
    for crew_member in crew_list:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return ''

In [114]:
movies['crew'] = movies['crew'].apply(extract_director_name)

In [115]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...","Sam Worthington, Zoe Saldana, Sigourney Weaver",James Cameron
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...","Johnny Depp, Orlando Bloom, Keira Knightley",Gore Verbinski
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...","Daniel Craig, Christoph Waltz, Léa Seydoux",Sam Mendes
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...","Christian Bale, Michael Caine, Gary Oldman",Christopher Nolan
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...","Taylor Kitsch, Lynn Collins, Samantha Morton",Andrew Stanton


##Applying Transformation

In [116]:
movies['genres'] = movies['genres'].str.replace(' ', '')
movies['keywords'] = movies['keywords'].str.replace(' ', '')
movies['cast'] = movies['cast'].str.replace(' ', '')
movies['crew'] = movies['crew'].str.replace(' ', '')

In [117]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action,Adventure,Fantasy,ScienceFiction","cultureclash,future,spacewar,spacecolony,socie...","SamWorthington,ZoeSaldana,SigourneyWeaver",JamesCameron
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","Adventure,Fantasy,Action","ocean,drugabuse,exoticisland,eastindiatradingc...","JohnnyDepp,OrlandoBloom,KeiraKnightley",GoreVerbinski
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"Action,Adventure,Crime","spy,basedonnovel,secretagent,sequel,mi6,britis...","DanielCraig,ChristophWaltz,LéaSeydoux",SamMendes
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"Action,Crime,Drama,Thriller","dccomics,crimefighter,terrorist,secretidentity...","ChristianBale,MichaelCaine,GaryOldman",ChristopherNolan
4,49529,John Carter,"John Carter is a war-weary, former military ca...","Action,Adventure,ScienceFiction","basedonnovel,mars,medallion,spacetravel,prince...","TaylorKitsch,LynnCollins,SamanthaMorton",AndrewStanton


##Create tag column and concatenate overview,genres,keywords,cast and crew columns in single column

In [118]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [119]:
movies.iloc[0].tags

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.Action,Adventure,Fantasy,ScienceFictioncultureclash,future,spacewar,spacecolony,society,spacetravel,futuristic,romance,space,alien,tribe,alienplanet,cgi,marine,soldier,battle,loveaffair,antiwar,powerrelations,mindandsoul,3dSamWorthington,ZoeSaldana,SigourneyWeaverJamesCameron'

In [120]:
new_df = movies[['movie_id', 'title', 'tags']]

In [121]:
new_df['tags'] = new_df['tags'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].str.lower()


In [122]:
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just wants to play his guitar and ...
4805,72766,Newlyweds,a newlywed couple's honeymoon is upended by th...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduces a dedic..."
4807,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...


##Text Vectorization using Bag of words

In [123]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [124]:
vectors_movie = cv.fit_transform(new_df['tags']).toarray()

##Applying stemming to get rid of similar words

In [133]:
from nltk import PorterStemmer
ps = PorterStemmer()

In [134]:
def stem_text(text):
    """
    Stems the words in the input text.
    
    Parameters:
    text (str): The input text.
    
    Returns:
    str: The text with words stemmed.
    """
    text = text.split()
    text = [ps.stem(word) for word in text]
    return ' '.join(text)

In [135]:
new_df['tags'] = new_df['tags'].apply(stem_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem_text)


##Now repeat the text vectorization steps again to see the difference

In [136]:
vectors_movie

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [137]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zombies', 'zone', 'zoo'], dtype=object)

In [138]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors_movie)

In [139]:
similarity[0]

array([1.        , 0.06963106, 0.06579517, ..., 0.02594996, 0.02787473,
       0.        ])

In [140]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [141]:
recommend('Batman Begins')

The Dark Knight
Batman & Robin
Batman
Batman
The Dark Knight Rises


In [142]:
import pickle

In [143]:
movie_file = open('movies.pkl', 'wb')
pickle.dump(new_df, movie_file)
movie_file.close()

In [144]:
similarity_file = open('similarity.pkl', 'wb')
pickle.dump(similarity, similarity_file)
similarity_file.close()