In [2]:
import ast
import os
from pathlib import Path

import pandas as pd

# Load data
# The dataset folder defaults to the original machine-specific location but can
# be redirected by setting the MOVIES_DATA_DIR environment variable, so the
# notebook no longer needs editing to run on another machine.
DATA_DIR = Path(os.environ.get("MOVIES_DATA_DIR", r"C:\Users\Global\Documents\MoviesData"))
# Each CSV lives inside a folder that carries the same name as the file.
movies = pd.read_csv(DATA_DIR / "tmdb_5000_movies.csv" / "tmdb_5000_movies.csv")
credits = pd.read_csv(DATA_DIR / "tmdb_5000_credits.csv" / "tmdb_5000_credits.csv")

# Merge datasets on title
# NOTE(review): a title-based join pairs every `movies` row with every
# `credits` row sharing that title, so repeated titles would produce extra,
# mismatched rows -- confirm titles are unique or join on an id column instead.
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Handle NaN in overview, then split each overview into a list of words so it
# can later be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].fillna('').apply(lambda x: x.split())

# Parse JSON-like fields safely
def extract_names(obj):
    """Return the ``name`` of every entry in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text such as ``"[{'id': 28, 'name': 'Action'}]"`` that
        ``ast.literal_eval`` can turn into a list of dictionaries.

    Returns
    -------
    list
        The ``name`` values in their original order, or ``[]`` when ``obj``
        cannot be parsed or does not have the expected structure.
    """
    try:
        return [entry['name'] for entry in ast.literal_eval(obj)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Best-effort parsing: malformed or missing cells (e.g. NaN) yield no
        # names. Only parse/structure errors are caught so that unrelated
        # failures such as KeyboardInterrupt are not silently swallowed.
        return []

def get_director(obj):
    """Return the first crew member whose ``job`` is ``'Director'``.

    Parameters
    ----------
    obj : str
        Text that ``ast.literal_eval`` can turn into a list of dictionaries,
        each carrying at least ``job`` and ``name`` keys.

    Returns
    -------
    list
        A one-element list holding the director's name, or ``[]`` when no
        entry has the job ``'Director'`` or ``obj`` cannot be parsed.
    """
    try:
        for member in ast.literal_eval(obj):
            if member['job'] == 'Director':
                # Only the first director is kept; the result stays a list so
                # it can be concatenated with the other list-valued columns.
                return [member['name']]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Best-effort parsing: only parse/structure errors are treated as
        # "no director"; anything else propagates instead of being hidden.
        return []
    return []

def get_cast(obj, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Text that ``ast.literal_eval`` can turn into a list of dictionaries,
        each carrying a ``name`` key.
    limit : int or None, optional
        Maximum number of names to keep (default 3, the value previously
        hard-coded). ``None`` keeps every name.

    Returns
    -------
    list
        Up to ``limit`` names in their original order, or ``[]`` when ``obj``
        cannot be parsed or does not have the expected structure.
    """
    try:
        names = [member['name'] for member in ast.literal_eval(obj)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Best-effort parsing: only parse/structure errors map to "no cast";
        # unrelated failures are no longer swallowed by a bare except.
        return []
    return names[:limit]

# Replace each raw text column with the list of names parsed out of it.
# 'director' is a new column derived from 'crew'; the rest are parsed in place.
movies['director'] = movies['crew'].map(get_director)
movies['cast'] = movies['cast'].map(get_cast)
movies['keywords'] = movies['keywords'].map(extract_names)
movies['genres'] = movies['genres'].map(extract_names)

# Combine all into one tag column
# Every operand holds Python lists, so `+` concatenates them row by row.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['director']
# Collapse each token list into a single lower-cased, space-separated string.
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x).lower())

# Final dataframe
# .copy() makes new_df independent of `movies`, so later writes to new_df
# cannot be mistaken for writes into a slice (SettingWithCopyWarning).
new_df = movies[['movie_id', 'title', 'tags']].copy()


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: cap the vocabulary at 5000 features and drop English stop words
cv = CountVectorizer(stop_words='english', max_features=5000)

# Fit on the tag strings, then expand the result into a dense array
vectors = cv.fit_transform(new_df['tags'])
vectors = vectors.toarray()


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of `vectors`; recommend() reads
# row i as the similarity of movie i to every movie in the dataset.
similarity = cosine_similarity(vectors)


In [5]:
def recommend(movie):
    """Print the five movies whose tags are most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up; matched case-insensitively against
        ``new_df['title']``. The first matching row is used.

    Returns
    -------
    None
        Results are printed. When no title matches, the message
        "Movie not found in database." is printed instead.
    """
    wanted = movie.lower()
    # Lower-case the title column once and reuse it for both the existence
    # check and the lookup.
    hits = (new_df['title'].str.lower() == wanted).to_numpy().nonzero()[0]
    if hits.size == 0:
        print("Movie not found in database.")
        return

    # Use the row *position*: `similarity` and `.iloc` are positional, so this
    # stays correct even if new_df's index labels are not 0..n-1.
    pos = int(hits[0])
    ranked = sorted(enumerate(similarity[pos]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # with tied scores (e.g. identical tags) it may not be at position 0.
    top_five = [i for i, _ in ranked if i != pos][:5]

    print(f"Top 5 movies similar to '{new_df['title'].iloc[pos]}':\n")
    for i in top_five:
        print(new_df['title'].iloc[i])


In [6]:
# Example query: print the closest matches for "Avatar".
recommend("Avatar")

Top 5 movies similar to 'Avatar':

Aliens
Moonraker
Alien
Alien³
Silent Running


In [None]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance shared by every stem() call
ps = PorterStemmer()

def stem(text):
    """Stem each whitespace-separated word of ``text`` and re-join them with single spaces."""
    stemmed_words = (ps.stem(token) for token in text.split())
    return " ".join(stemmed_words)

# Stem the tags on a fresh frame rather than writing into new_df in place,
# which was created as a column slice of `movies`.
new_df = new_df.assign(tags=new_df['tags'].apply(stem))

# `vectors` and `similarity` were computed earlier from the unstemmed tags.
# Rebuild them here so the stemming actually influences the recommendations
# and the pickled similarity matrix matches the pickled tags.
vectors = cv.fit_transform(new_df['tags']).toarray()
similarity = cosine_similarity(vectors)



In [None]:
import pickle

# Persist the dataframe and the similarity matrix. The `with` blocks close
# each file even if pickling fails, instead of leaking the open handle.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)


In [None]:
import nltk


In [None]:
def recommend(movie):
    """Return the titles of the five movies most similar to ``movie``.

    This redefines the printing ``recommend`` declared earlier in the
    notebook: it returns the titles instead of printing them and computes
    the similarities on demand from ``vectors``.

    Parameters
    ----------
    movie : str
        Title to look up; matched case-insensitively against
        ``new_df['title']``. The first matching row is used.

    Returns
    -------
    list
        Up to five titles ordered from most to least similar, or ``[]`` when
        no title matches.
    """
    wanted = movie.lower()
    # Lower-case the title column once and reuse it for both the existence
    # check and the lookup.
    hits = (new_df['title'].str.lower() == wanted).to_numpy().nonzero()[0]
    if hits.size == 0:
        return []

    # Use the row *position*: `vectors` and `.iloc` are positional, so this
    # stays correct even if new_df's index labels are not 0..n-1.
    pos = int(hits[0])
    distances = cosine_similarity([vectors[pos]], vectors).flatten()
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # with tied scores (e.g. identical tags) it may not be at position 0.
    top_five = [i for i, _ in ranked if i != pos][:5]

    return [new_df['title'].iloc[i] for i in top_five]


In [None]:
# Example query against the list-returning recommend() defined above.
recommend('Inception')


In [None]:
import pickle

# Save the processed dataframe (new_df). This writes the same 'movies.pkl'
# file as the earlier pickle cell in this notebook.
new_df.to_pickle('movies.pkl')

# Save the similarity matrix. The `with` block closes the file even if
# pickling fails, instead of leaking the open handle.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
