#Data preprocessing 

In [18]:
# ! pip install jaro-winkler
# ! pip install langdetect

In [2]:
#Imports

import string
import pandas as pd
import numpy as np
from langdetect import detect as detect_lang

import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
pd.set_option('display.max_colwidth', 500)
import jaro

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from google.colab import drive
drive.mount('/content/gdrive/')

%cd '/content/gdrive/MyDrive/ML_projects'

Mounted at /content/gdrive/
/content/gdrive/MyDrive/ML_projects


In [4]:
#Let's read the Movies Dataset from Kaggle and choose three columns with text data.

df_meta = pd.read_csv('/content/gdrive/MyDrive/ML_projects/movies_metadata.csv')
df_content = df_meta[['title', 'imdb_id', 'tagline', 'overview']]
df_content.head(10)

Unnamed: 0,title,imdb_id,tagline,overview
0,Toy Story,tt0114709,,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."
1,Jumanji,tt0113497,Roll the dice and unleash the excitement!,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."
2,Grumpier Old Men,tt0113228,Still Yelling. Still Fighting. Still Ready for Love.,"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max."
3,Waiting to Exhale,tt0114885,Friends are the people who let you be yourself... and never let you forget it.,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe."
4,Father of the Bride Part II,tt0113041,Just When His World Is Back To Normal... He's In For The Surprise Of His Life!,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own."
5,Heat,tt0113277,A Los Angeles Crime Saga,"Obsessive master thief, Neil McCauley leads a top-notch crew on various insane heists throughout Los Angeles while a mentally unstable detective, Vincent Hanna pursues him without rest. Each man recognizes and respects the ability and the dedication of the other even though they are aware their cat-and-mouse game may end in violence."
6,Sabrina,tt0114319,You are cordially invited to the most surprising merger of the year.,"An ugly duckling having undergone a remarkable change, still harbors feelings for her crush: a carefree playboy, but not before his business-focused brother has something to say about it."
7,Tom and Huck,tt0112302,The Original Bad Boys.,"A mischievous young boy, Tom Sawyer, witnesses a murder by the deadly Injun Joe. Tom becomes friends with Huckleberry Finn, a boy with no future and no family. Tom has to choose between honoring a friendship or honoring an oath because the town alcoholic is accused of the murder. Tom and Huck go through several adventures trying to retrieve evidence."
8,Sudden Death,tt0114576,Terror goes into overtime.,"International action superstar Jean Claude Van Damme teams with Powers Boothe in a Tension-packed, suspense thriller, set against the back-drop of a Stanley Cup game.Van Damme portrays a father whose daughter is suddenly taken during a championship hockey game. With the captors demanding a billion dollars by game's end, Van Damme frantically sets a plan in motion to rescue his daughter and abort an impending explosion before the final buzzer..."
9,GoldenEye,tt0113189,No limits. No fears. No substitutes.,James Bond must unmask the mysterious head of the Janus Syndicate and prevent the leader from utilizing the GoldenEye weapons system to inflict devastating revenge on Britain.


In [5]:
#Let's check the missing values. 

df_content.isnull().sum()

title           6
imdb_id        17
tagline     25054
overview      954
dtype: int64

In [6]:
#Let's get rid of the missing values.

filtered_df = df_content[df_content[['overview', 'title', 'imdb_id']].notna().all(1)]
filtered_df.loc[:, 'tagline'].replace(np.nan, '', inplace = True)
filtered_df.isnull().sum()

title       0
imdb_id     0
tagline     0
overview    0
dtype: int64

In [7]:
#Also, let's drop some duplicates.

print(filtered_df.shape)
filtered_df.drop_duplicates(inplace=True)
print(filtered_df.shape)

(44491, 4)
(44461, 4)


In [8]:
#Let's drop all the movies whose overviews are not in English.

def detect_lang_with_excep(input):
    try:
        return detect_lang(input)
    except:
        return "none"

filtered_df['lang'] = filtered_df['overview'].apply(detect_lang_with_excep)
print(filtered_df['lang'].value_counts())

en      44257
fr         44
it         18
nl         15
sv         13
es         13
pl         12
af         10
ru         10
de          9
tr          9
fi          7
da          7
pt          6
none        6
ro          5
cs          4
ca          3
no          3
cy          2
tl          2
el          2
hu          1
sl          1
so          1
sk          1
Name: lang, dtype: int64


In [9]:
print(filtered_df.shape)
filtered_df.drop(filtered_df[filtered_df.lang != 'en'].index, inplace=True)
print(filtered_df.shape)

(44461, 5)
(44257, 5)


In [10]:
#Now let's create a final dataset with three columns ('title', 'imdb_id',  and 'text').

filtered_df['text'] =  filtered_df['tagline'] + ' ' + filtered_df['overview']
df_final = pd.concat([filtered_df[['title', 'imdb_id']], filtered_df['text']], axis = 1)
df_final.reset_index(inplace = True, drop = True)
df_final

Unnamed: 0,title,imdb_id,text
0,Toy Story,tt0114709,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."
1,Jumanji,tt0113497,"Roll the dice and unleash the excitement! When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."
2,Grumpier Old Men,tt0113228,"Still Yelling. Still Fighting. Still Ready for Love. A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max."
3,Waiting to Exhale,tt0114885,"Friends are the people who let you be yourself... and never let you forget it. Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe."
4,Father of the Bride Part II,tt0113041,"Just When His World Is Back To Normal... He's In For The Surprise Of His Life! Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own."
...,...,...,...
44252,Subdue,tt6209470,Rising and falling between a man and woman Rising and falling between a man and woman.
44253,Century of Birthing,tt2028550,An artist struggles to finish his work while a storyline about a cult plays in his head.
44254,Betrayal,tt0303758,"A deadly game of wits. When one of her hits goes wrong, a professional assassin ends up with a suitcase full of a million dollars belonging to a mob boss ..."
44255,Satan Triumphant,tt0008536,"In a small town live two brothers, one a minister and the other one a hunchback painter of the chapel who lives with his wife. One dreadful and stormy night, a stranger knocks at the door asking for shelter. The stranger talks about all the good things of the earthly life the minister is missing because of his puritanical faith. The minister comes to accept the stranger's viewpoint but it is others who will pay the consequences because the minister will discover the human pleasures thanks t..."


#Text preprocessing and vectorization

In [11]:
#Let's make some text preprocessing.

def text_preprocessing(text_data):
    tknzd_text = word_tokenize(text_data.lower())
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_text = [wordnet_lemmatizer.lemmatize(word) for word in tknzd_text]
    
    no_punct = [i for i in lemma_text if i not in list(string.punctuation)]
    stop_words = stopwords.words('english')
    final = [i for i in no_punct if i not in stop_words]
    final = ' '.join(final)
    return final

df_final['text'] = df_final['text'].apply(lambda x: text_preprocessing(x))
df_final

Unnamed: 0,title,imdb_id,text
0,Toy Story,tt0114709,led woody andy 's toy live happily room andy 's birthday brings buzz lightyear onto scene afraid losing place andy 's heart woody plot buzz circumstance separate buzz woody owner duo eventually learns put aside difference
1,Jumanji,tt0113497,roll dice unleash excitement sibling judy peter discover enchanted board game open door magical world unwittingly invite alan -- adult 's trapped inside game 26 year -- living room alan 's hope freedom finish game prof risky three find running giant rhinoceros evil monkey terrifying creature
2,Grumpier Old Men,tt0113228,still yelling still fighting still ready love family wedding reignites ancient feud next-door neighbor fishing buddy john max meanwhile sultry italian divorcée open restaurant local bait shop alarming local worry 'll scare fish away 's le interested seafood cooking hot time max
3,Waiting to Exhale,tt0114885,friend people let ... never let forget cheated mistreated stepped woman holding breath waiting elusive `` good man '' break string less-than-stellar lover friend confidant vannah bernie glo robin talk determined find better way breathe
4,Father of the Bride Part II,tt0113041,world back normal ... 's surprise life george bank ha recovered daughter 's wedding receives news 's pregnant ... george 's wife nina expecting wa planning selling home 's plan -- like george -- change arrival grandchild kid
...,...,...,...
44252,Subdue,tt6209470,rising falling man woman rising falling man woman
44253,Century of Birthing,tt2028550,artist struggle finish work storyline cult play head
44254,Betrayal,tt0303758,deadly game wit one hit go wrong professional assassin end suitcase full million dollar belonging mob bos ...
44255,Satan Triumphant,tt0008536,small town live two brother one minister one hunchback painter chapel life wife one dreadful stormy night stranger knock door asking shelter stranger talk good thing earthly life minister missing puritanical faith minister come accept stranger 's viewpoint others pay consequence minister discover human pleasure thanks ehem sister- -law… tormented minister cuckolded brother die strange accident chapel later infant born minister 's adulterous relationship


In [12]:
#Now we need to vectorize our text data and find out cosine similarity.

count_vect = CountVectorizer()
films_vect = count_vect.fit_transform(df_final['text'])
cosine_sim = cosine_similarity(films_vect, films_vect)

#Getting recommendations

In [13]:
#Let's create a recommender function.

def get_recommendations(title, cosine_sim, dataset):
    idx = dataset.index[dataset['title']== title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    recom_dict = {}
    for m in movie_indices:
       recom_dict.update({dataset['title'].iloc[m]: dataset['imdb_id'].iloc[m]})
    return recom_dict

In [16]:
#Now let's make a dataset with recommendations based on cosine similarity.

recom_dataset = pd.concat([df_final['title'], df_final['title'].apply(lambda x: get_recommendations(x, cosine_sim, df_final))], axis = 1)
recom_dataset.columns.values[1] = "recommendations"
recom_dataset.head(10)

Unnamed: 0,title,recommendations
0,Toy Story,"{'Toy Story 2': 'tt0120363', 'Toy Story 3': 'tt0435761', 'The 40 Year Old Virgin': 'tt0405422', 'The Champ': 'tt0021730', 'Small Fry': 'tt2033372', 'Andy Hardy's Blonde Trouble': 'tt0036602', 'Andy Kaufman Plays Carnegie Hall': 'tt0254091', 'Man on the Moon': 'tt0125664', 'Superstar: The Life and Times of Andy Warhol': 'tt0103008', 'Andy Peters: Exclamation Mark Question Point': 'tt4604154'}"
1,Jumanji,"{'Table No. 21': 'tt2229842', 'Indie Game: The Movie': 'tt1942884', 'Liar Game: Reborn': 'tt2078667', 'Beta Test': 'tt4244162', 'Word Wars': 'tt0390632', 'Pixels': 'tt2120120', 'Brainscan': 'tt0109327', 'How to Make a Monster': 'tt0281919', 'Stay Alive': 'tt0441796', 'The Ouija Exorcism': 'tt4859032'}"
2,Grumpier Old Men,"{'Still Mine': 'tt2073086', 'Party Girl': 'tt3660370', 'RED': 'tt1245526', 'Mommy': 'tt3612616', 'The Flying Man': 'tt3018700', 'Austin Powers in Goldmember': 'tt0295178', 'Madagascar: Escape 2 Africa': 'tt0479952', 'Tap': 'tt0098442', 'Innocence': 'tt0251141', 'High Society': 'tt0049314'}"
3,Waiting to Exhale,"{'Pet': 'tt1183374', 'The Divorce of Lady X': 'tt0030063', 'A Cry in the Night': 'tt0049110', 'Good Luck Chuck': 'tt0452625', 'Kalamity': 'tt1109594', 'Gods of the Plague': 'tt0065808', 'Between Us': 'tt1109574', 'Let's Talk About Sex': 'tt0165857', 'On tour': 'tt0100699', 'Woman of the Lake': 'tt0060787'}"
4,Father of the Bride Part II,"{'Father of the Bride': 'tt0101862', 'Babbitt': 'tt0024851', 'I Start Counting': 'tt0064462', 'It's a Wonderful Life': 'tt0038650', 'You're Killing Me': 'tt4490654', 'Visioneers': 'tt0833557', 'Phenomenon II': 'tt0313116', 'Hedda Gabler': 'tt0057135', 'North to Alaska': 'tt0054127', 'A One-Way Trip to Antibes': 'tt2023473'}"
5,Heat,"{'Mulholland Falls': 'tt0117107', 'The Lodger': 'tt0851530', 'Sunset Strip': 'tt1542485', 'All Things To All Men': 'tt2095568', 'Street Knight': 'tt0108234', 'Some Girl': 'tt0125766', 'Out of Time': 'tt0095802', 'The Decline of Western Civilization Part III': 'tt0138393', 'God Respects Us When We Work, but Loves Us When We Dance': 'tt0063016', '10.0 Earthquake': 'tt3488056'}"
6,Sabrina,"{'Snow Beast': 'tt1623765', 'Every Other Week': 'tt0483195', 'Dark Blue Almost Black': 'tt0452971', 'Madalena': 'tt0055119', 'Parents': 'tt0997062', '1920': 'tt1301698', 'Carnival Night': 'tt0049397', 'The Private Life of Don Juan': 'tt0025681', 'Whatever': 'tt0140688', 'Miss Kicki': 'tt1322346'}"
7,Tom and Huck,"{'Tom Sawyer & Huckleberry Finn': 'tt1977087', 'Tom Sawyer': 'tt0070814', 'The Mouse Comes to Dinner': 'tt0037928', 'Treacle Jr.': 'tt1542486', 'Striking Distance': 'tt0108238', 'The Love of Siam': 'tt1152282', 'Massacre Time': 'tt0061074', 'Hard Choices': 'tt0091170', 'Mystery Date': 'tt0102500', 'The Pallbearer': 'tt0117283'}"
8,Sudden Death,"{'Liar Game: Reborn': 'tt2078667', 'Table No. 21': 'tt2229842', 'Indie Game: The Movie': 'tt1942884', 'The Indecent Woman': 'tt0102597', 'Quiz': 'tt1891892', 'Wake of Death': 'tt0367478', 'Pixels': 'tt2120120', 'JCVD': 'tt1130988', 'How to Make a Monster': 'tt0281919', 'Paintball': 'tt1205071'}"
9,GoldenEye,"{'Live and Let Die': 'tt0070328', 'Casino Royale': 'tt0061452', 'Never Say Never Again': 'tt0086006', 'Octopussy': 'tt0086034', 'Licence to Kill': 'tt0097742', 'A View to a Kill': 'tt0090264', 'Bunraku': 'tt1181795', 'Dr. No': 'tt0055928', 'One Man's Justice': 'tt0113999', 'The Hyperboloid of Engineer Garin': 'tt0314105'}"


In [17]:
#Finally, let's save this dataset into .csv file.

recom_dataset.to_csv('movie_recommendations.csv')