#### Importing Useful Libraries

In [1]:
import numpy as np
import pandas as pd
import ast

#### Loading and Cleaning our Datasets

In [2]:
credits_df = pd.read_csv("credits.csv")
movies_df = pd.read_csv("movies.csv")

# Setting the print option to print the full dataset
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Merging the Movies & Credits dataframe on the title column
movies_df = movies_df.merge(credits_df, on='title')
movies_df = movies_df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Addressing Null or missing values
movies_df.isnull().sum() # returns number of missing values in each columns
movies_df = movies_df.dropna() # dropping those missing entries
movies_df.duplicated().sum() # checking for duplicates
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4805 entries, 0 to 4807
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4805 non-null   int64 
 1   title     4805 non-null   object
 2   overview  4805 non-null   object
 3   genres    4805 non-null   object
 4   keywords  4805 non-null   object
 5   cast      4805 non-null   object
 6   crew      4805 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.3+ KB


In [3]:
movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
# Getting a specific row
movies_df.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

#### Cleaning up the Data

In [5]:
def convert(obj):
  '''Converts string literal to valid python object'''
  return [i['name'] for i in ast.literal_eval(obj)]

def convert3(obj):
  '''Converts string literal to valid python object 
    and returns only the first 3 items'''
  result = []
  counter = 0
  for i in ast.literal_eval(obj):
    if counter != 3:
      result.append(i['name'])
      counter += 1
    else:
      break
  return result

def fetch_director(obj):
  '''Fetches the directors name from the list of crew'''
  return [i['name']  for i in ast.literal_eval(obj) if i["job"]=="Director"]

movies_df['genres'] = movies_df['genres'].apply(convert)
movies_df['keywords'] = movies_df['keywords'].apply(convert)
movies_df['cast'] = movies_df['cast'].apply(convert3)
movies_df['crew'] = movies_df['crew'].apply(fetch_director)

movies_df['overview'] = movies_df['overview'].apply(lambda x:x.split())
movies_df['genres'] = movies_df['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies_df['cast'] = movies_df['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies_df['crew'] = movies_df['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

# Creating new column with all data
movies_df['tags'] = movies_df['overview']+movies_df['genres']+movies_df['keywords']+movies_df['cast']+movies_df['crew']
new_df = movies_df[['movie_id','title','tags']] # creating new data frame with necessary values
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x)) # changing from list back to string
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


#### Stemming the Data

In [6]:
import nltk
from nltk.stem.porter import PorterStemmer 

ps = PorterStemmer()

def stem(text): 
  '''reucing a word into it's stem/ root word'''
  return " ".join([ps.stem(i) for i in text.split()])

new_df['tags'] = new_df['tags'].apply(stem) # stemming our dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


#### Performing Text Embeddings using the CountVectorizer & Similarity

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer creates a dictionary that maps each token to a position in the output metrics
cv = CountVectorizer(max_features=5000, stop_words='english') # max # of words in each sentence (each feature)
vectors = cv.fit_transform(new_df['tags']).toarray()
# print(cv.get_feature_names())

# cosine similarity uses the cosine to get the similarity of two vectors
similarity = cosine_similarity(vectors) # returns a square matrix comparing 
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[0:3] # printing top 3 similarity with respect to movie[0]

[(0, 1.0000000000000002),
 (1216, 0.28676966733820225),
 (2409, 0.26901379342448517)]

In [8]:
def recomend(movie, n=11):
  '''prints the top n movies related to the Movie chosen'''
  movie_index = new_df[new_df['title']==movie].index[0]
  similarity_row = similarity[movie_index]
  movie_list = sorted(list(enumerate(similarity_row)), reverse=True, key=lambda x:x[1])[1:n]
  for i in movie_list:
    print(new_df['title'][i[0]])


In [9]:
chosen_movie = input("Enter Movie Name: ")
recomend(chosen_movie)

Enter Movie Name: The Dark Knight
The Dark Knight Rises
Batman Begins
Batman Returns
Batman Forever
Batman
Batman
Batman & Robin
Lesbian Vampire Killers
Batman v Superman: Dawn of Justice
The Incredibly True Adventure of Two Girls In Love
