<a href="https://colab.research.google.com/github/aiswaryalakshmi323/movie-recommendation/blob/main/official_content_based_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
 import pandas as pd
import numpy as np
import ast
import pickle
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
tmdb_movies = pd.read_csv("/content/drive/MyDrive/Colab Data/tmdb_5000_movies.csv")
tmdb_credits = pd.read_csv("/content/drive/MyDrive/Colab Data/tmdb_5000_credits.csv")
bollywood = pd.read_csv("/content/drive/MyDrive/Colab Data/bollywood_full.csv")

In [12]:
hollywood_df = tmdb_movies.merge(tmdb_credits, left_on='id', right_on='movie_id')
hollywood_df = hollywood_df[["id","title_x","overview","genres","keywords","cast","crew"]]
hollywood_df = hollywood_df.rename(columns={"id":"movie_id", "title_x": "title"})
hollywood_df['origin'] = 'Hollywood'
hollywood_df['poster_url'] = ''

In [13]:
def extract_genres(genre_str):
    try:
        genre_list = ast.literal_eval(genre_str)
        return [genre['name'] for genre in genre_list]
    except (ValueError, SyntaxError): return []

bollywood['genres'] = bollywood['genres'].apply(extract_genres)
bollywood = bollywood.rename(columns={
    "title_x": "title", "story": "overview",
    "actors": "cast", "poster_path": "poster_url"
})

In [14]:
bollywood = bollywood[["title", "overview", "genres", "cast", "poster_url"]]
bollywood.dropna(inplace=True)
bollywood["movie_id"] = range(100000, 100000 + len(bollywood))
bollywood['origin'] = 'Bollywood'
bollywood['crew'] = ''

In [15]:
common_cols = ["movie_id", "title", "overview", "genres", "keywords", "cast", "crew", "poster_url", "origin"]

In [16]:
combined = pd.concat([
    hollywood_df.reindex(columns=common_cols),
    bollywood.reindex(columns=common_cols)
], ignore_index=True)
combined['keywords'].fillna('[]', inplace=True)
combined['crew'].fillna('[]', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['keywords'].fillna('[]', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['crew'].fillna('[]', inplace=True)


In [17]:
def convert_json(obj):
    try: return [i["name"] for i in ast.literal_eval(obj)]
    except: return []

def convert_list_or_string(obj):
    if isinstance(obj, list): return obj
    elif isinstance(obj, str): return [name.strip() for name in obj.split(',')]
    return []

def get_top_3(obj):
    return obj[:3]

def get_director(obj): # Only for Hollywood data
    try:
        for i in ast.literal_eval(obj):
            if i["job"] == "Director": return [i["name"]]
    except: return []
    return []

def collapse(L):
    return [i.replace(" ", "").lower() for i in L]


In [18]:
combined['genres'] = combined['genres'].apply(convert_list_or_string)

In [19]:
combined['keywords'] = combined['keywords'].apply(convert_json)

In [20]:
combined['cast'] = combined['cast'].apply(convert_list_or_string)

In [21]:
combined['cast'] = combined['cast'].apply(get_top_3)

In [22]:
combined['crew'] = combined['crew'].apply(get_director)

In [23]:
combined["overview"] = combined["overview"].apply(lambda x: x.split() if isinstance(x, str) else [])

In [24]:
for feature in ['genres', 'keywords', 'cast', 'crew']:
    combined[feature] = combined[feature].apply(collapse)

In [25]:
combined["tags"] = combined["overview"] + combined["genres"] + combined["keywords"] + combined["cast"] + combined["crew"]
combined["tags"] = combined["tags"].apply(lambda x: " ".join(x).lower())

In [26]:
final_df = combined[["movie_id", "title", "tags", "overview", "cast", "crew", "poster_url", "origin"]].copy()

In [27]:
ps = PorterStemmer()

In [28]:
final_df["tags"] = final_df["tags"].apply(lambda x: " ".join([ps.stem(i) for i in x.split()]))
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
vectors = tfidf.fit_transform(final_df["tags"]).toarray()
similarity = cosine_similarity(vectors)

In [29]:
pickle.dump(final_df.to_dict(), open("movie_list.pkl", "wb"))
pickle.dump(similarity, open("similarity.pkl", "wb"))