In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

In [2]:
movies = pd.read_csv('/content/drive/MyDrive/Data Science/Portfolio Projects/Recommender System - Content Based/top10K-TMDB-movies.csv')

In [3]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


In [5]:
# Important Columns are id, title, imdb_id, overview, genres, keywords

usable_columns = ['id','title', 'overview', 'genre']
df = movies[usable_columns]

In [6]:
df.head()

Unnamed: 0,id,title,overview,genre
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime"
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance"
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War"
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime"


In [7]:
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
overview,13
genre,3


In [8]:
#removing only title feature's null values because other features will get combined and will have some value eventually
df.dropna(subset=['overview','genre'], inplace=True)

In [9]:
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
overview,0
genre,0


In [10]:
df.shape

(9985, 4)

In [51]:
df['tags'] = df.genre + ',' + df.overview # df.overview not combining as system is crashing

In [52]:
# change movie title to lower

def title_lower(obj):
  return obj.strip().lower()

df['movie_title'] = df['title'].apply(title_lower)


In [53]:
df.head()

Unnamed: 0,id,title,overview,genre,tags,movie_title
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime","Drama,Crime,Framed in the 1940s for the double...",the shawshank redemption
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance","Comedy,Drama,Romance,Raj is a rich, carefree, ...",dilwale dulhania le jayenge
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime","Drama,Crime,Spanning the years 1945 to 1955, a...",the godfather
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War","Drama,History,War,The true story of how busine...",schindler's list
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime","Drama,Crime,In the continuing saga of the Corl...",the godfather: part ii


In [13]:
# df['title_lowercase'] = df[df['title']].lower()

In [14]:
porter_stemmer = PorterStemmer()

stemmed_tags = [porter_stemmer.stem(word) for word in df['tags'].values]

In [15]:
# nltk.download('stopwords')
# english_stopwords = stopwords.words('english')

In [16]:
# vectorize without stopwords - TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

In [18]:
vectorized_tags = vectorizer.fit_transform(stemmed_tags).toarray()

In [19]:
# pca = PCA(n_components = 100)
# vector_tags_reduced = pca.fit_transform(vector_tags)

In [20]:
similarity = cosine_similarity(vectorized_tags)

In [64]:
def recomendation(name):
  name = name.lower()
  recommended_movies_list = []

  if name in df['movie_title'].values:
    movie_index = df[df['movie_title'] == name].index[0]
    recomendation = sorted(list(enumerate(similarity[movie_index])), reverse = True, key = lambda x:x[1])[1:6]

    for i in recomendation:
      recommended_movies_list.append(df.iloc[i[0]].title)
    return recommended_movies_list
  else:
    print('Movie Not Found')


In [67]:
recomendation('batman')

['Batman Returns',
 'The Batman vs. Dracula',
 'Batman Beyond: Return of the Joker',
 'The Dark Knight',
 'Batman Begins']