##  Dataset & Jupyter notebook setup

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [None]:
# Load the two CSV files; both paths are relative to the working directory.
# NOTE: the name `credits` shadows the interactive built-in of the same name.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
movies.head(2)

In [None]:
movies.shape

In [None]:
credits.head()

In [None]:
# Join the two frames on the shared 'title' column (merge defaults to an inner join).
# NOTE(review): titles are presumably not unique; rows sharing a title would be
# cross-matched by this join — confirm, or merge on an id key instead.
movies = movies.merge(credits,on='title')

In [None]:
movies.head()

In [None]:
# Keep movie_id and title plus the five columns that are later combined into 'tags'.
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [None]:
movies.head()

## Data Preprocessing

In [None]:
import ast

In [None]:
def convert(text):
    """Return the ``'name'`` value of every entry in a stringified list of dicts.

    ``text`` is parsed with ``ast.literal_eval``; each parsed entry must
    support ``entry['name']``. The names are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
# Drop every row that has a missing value in any of the remaining columns.
# The index is not reset here, so after this cell index labels may no longer
# equal row positions.
movies.dropna(inplace=True)

In [None]:
# Replace each stringified genre list with a plain list of the genre names.
movies['genres'] = movies['genres'].apply(convert)
movies.head()

In [None]:
# Same conversion for keywords: stringified list of dicts -> list of names.
movies['keywords'] = movies['keywords'].apply(convert)
movies.head()

In [None]:
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [None]:
def convert3(text, limit=3):
    """Return the ``'name'`` of at most ``limit`` leading entries of ``text``.

    Args:
        text: String holding a Python/JSON-style list of dicts, parsed with
            ``ast.literal_eval``.
        limit: Maximum number of names to return. Defaults to 3, which is the
            value that used to be hard-coded. Zero or a negative value yields
            an empty list.

    Returns:
        A list with the names of the first ``limit`` entries, in order; shorter
        if ``text`` holds fewer entries.
    """
    names = []
    for entry in ast.literal_eval(text):
        # Stop as soon as enough names are collected instead of walking the
        # rest of the list with a counter.
        if len(names) >= limit:
            break
        names.append(entry['name'])
    return names

In [None]:
# Extract the name of every cast entry; the list is cut down to its first
# three names in a later cell. (convert3, defined above, is not used here.)
movies['cast'] = movies['cast'].apply(convert)
movies.head()

In [None]:
movies['crew'][0]

In [None]:
# Keep only the first three cast names (fewer when the list is shorter).
movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

In [None]:
def fetch_director(text):
    """Return the names of all crew entries whose ``'job'`` is ``'Director'``.

    ``text`` is a stringified list of dicts parsed with ``ast.literal_eval``.
    Every entry is expected to have a ``'job'`` key, and matching entries a
    ``'name'`` key. The result is a list because several entries may match;
    it is empty when none does.
    """
    crew_members = ast.literal_eval(text)
    return [
        member['name']
        for member in crew_members
        if member['job'] == 'Director'
    ]

In [None]:
# Replace each stringified crew list with the list of its directors' names.
movies['crew'] = movies['crew'].apply(fetch_director)

In [None]:
movies['overview'][0] # string->  list

In [None]:
# Split each overview on whitespace so it becomes a list of words, like the
# list-valued genres/keywords/cast/crew columns it is concatenated with later.
movies['overview'] = movies['overview'].apply(lambda x:x.split())  # string -> list of words
# movies.sample(5)
movies.head()

In [None]:
def collapse(L):
    """Return a new list with every space character removed from each string in ``L``.

    For example ``"Sam Worthington"`` becomes ``"SamWorthington"``. The input
    list is not modified.
    """
    return [item.replace(" ", "") for item in L]

In [None]:
# Remove the spaces inside every name/term of the four list-valued columns,
# e.g. "Sam Worthington" -> "SamWorthington".
# Presumably done so that a multi-word name is later treated as one token.
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [None]:
movies.head()

In [None]:
# prompt: Using dataframe movies: a output.csv file with movie_id title and genre

# Write movie_id, title and the (list-valued) genres column to output.csv,
# without the DataFrame index column.
movies[['movie_id', 'title', 'genres']].to_csv('output.csv', index=False)


In [None]:
# All five columns hold lists at this point, so `+` concatenates them row by
# row into one combined token list per movie.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
# `new` keeps movie_id, title and tags; drop() returns a new frame, so
# `movies` itself still has the five source columns.
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new.head()

In [None]:
# Join each token list into one space-separated string.
new['tags'] = new['tags'].apply(lambda x: " ".join(x))  # list to string
new.head()

In [None]:
new['tags'][0]

In [None]:
new['tags'] = new['tags'].apply(lambda x:x.lower())  # lowercase the tag string
new.head()

In [None]:
new['tags'][0]

In [None]:
new['tags'][1]

## Text Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 terms, with scikit-learn's
# built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# Learn the vocabulary from the tag strings and turn the resulting sparse count
# matrix into a dense NumPy array (one row per row of `new`).
vector = cv.fit_transform(new['tags']).toarray()

In [None]:
vector.shape

In [None]:
vector

In [None]:
vector[0]

In [None]:
# len(cv.get_feature_names_out())
cv.get_feature_names_out()

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
# Eg
# Illustration only (these two expressions have no effect): stemming maps the
# inflected forms in the first list to the common stems in the second.
['loved','loving','love']
['love','love','love']

In [None]:
# ps.stem('loved')

In [None]:
def stem(text):
    """Stem every whitespace-separated word of ``text`` and re-join with single spaces.

    Each word is passed through ``ps.stem``, where ``ps`` is the
    ``PorterStemmer`` instance created in an earlier cell. Because the words
    come from ``str.split()``, any run of whitespace in the input collapses to
    one space in the result.
    """
    stemmed_words = (ps.stem(word) for word in text.split())
    return " ".join(stemmed_words)


In [None]:
stem('in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron')

In [None]:
# Replace every tag string with its stemmed version (see stem above).
new['tags'] = new['tags'].apply(stem)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fresh vectorizer with the same settings as the earlier one; it is fit in the
# next cell on the stemmed tags.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# Refit on the stemmed tags; this overwrites the `vector` computed before stemming.
vector = cv.fit_transform(new['tags']).toarray()

In [None]:
vector

In [None]:
cv.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity  # 1-> more similarity   0-> less similarity

In [None]:
# Pairwise cosine similarity between all rows of `vector`: entry [i, j]
# compares row i with row j, so the result is a square matrix.
similarity = cosine_similarity(vector)

In [None]:
similarity.shape

In [None]:
similarity[0]

In [None]:
similarity[1]

In [None]:
similarity

In [None]:
# `.index[0]` is the index *label* of the first matching row. The index was not
# reset after the earlier dropna, so this is not necessarily the row's position.
new[new['title'] == 'The Lego Movie'].index[0]

In [None]:
def recommend(movie):
    """Print the titles of the five movies most similar to ``movie``.

    Reads the notebook-level ``new`` DataFrame and ``similarity`` matrix.
    When several rows share the title, the first one is used as the query.

    Args:
        movie: Exact title to look up in ``new['title']``.

    Raises:
        IndexError: If no row of ``new`` has that title.
    """
    # Look up the row *position*, not the index label. ``similarity`` was built
    # from ``new['tags']`` in row order and is indexed by position, while the
    # index labels of ``new`` can differ from positions because rows were
    # dropped earlier without resetting the index.
    matches = (new['title'] == movie).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in new['title']")
    position = int(matches[0])

    ranked = sorted(
        enumerate(similarity[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query movie by position rather than assuming it sorts first:
    # ties (e.g. rows with identical tags) could otherwise put another row at
    # the top and leave the query itself in the output.
    top_positions = [pos for pos, _ in ranked if pos != position][:5]
    for pos in top_positions:
        print(new['title'].iloc[pos])



In [None]:
recommend('Spider-Man')

In [None]:
import pickle

In [None]:
# Persist the movie table and the similarity matrix for later use. Each file
# is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open('movie_list.pkl', 'wb') as movie_file:
    pickle.dump(new, movie_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)