!kaggle datasets list -s "MPST: Movie Plot Synopses with Tags"

!kaggle datasets download -d "cryptexcode/mpst-movie-plot-synopses-with-tags"

from zipfile import ZipFile
with ZipFile("mpst-movie-plot-synopses-with-tags.zip",'r') as zipfile:
    zipfile.extractall("data")

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


.\movie-recommender.ipynb
.\mpst-movie-plot-synopses-with-tags.zip
.\.idea\.gitignore
.\.idea\content based recommender.iml
.\.idea\misc.xml
.\.idea\modules.xml
.\.idea\workspace.xml
.\.idea\inspectionProfiles\profiles_settings.xml
.\.idea\inspectionProfiles\Project_Default.xml
.\.ipynb_checkpoints\movie-recommender-checkpoint.ipynb
.\data\mpst_full_data.csv
.\data\partition.json


In [2]:
data = pd.read_csv('data/mpst_full_data.csv')

In [3]:
data = data.drop_duplicates(subset='title').reset_index(drop=True)

In [4]:
print(data.shape,'\n')
print(data.head(),'\n\n')
print(data.describe())

(13757, 6) 

     imdb_id                                          title  \
0  tt0057603                        I tre volti della paura   
1  tt1733125  Dungeons & Dragons: The Book of Vile Darkness   
2  tt0033045                     The Shop Around the Corner   
3  tt0113862                             Mr. Holland's Opus   
4  tt0086250                                       Scarface   

                                       plot_synopsis  \
0  Note: this synopsis is for the orginal Italian...   
1  Two thousand years ago, Nhagruul the Foul, a s...   
2  Matuschek's, a gift store in Budapest, is the ...   
3  Glenn Holland, not a morning person by anyone'...   
4  In May 1980, a Cuban man named Tony Montana (A...   

                                                tags  split synopsis_source  
0          cult, horror, gothic, murder, atmospheric  train            imdb  
1                                           violence  train            imdb  
2                                    

In [5]:
ind_dict = dict(zip(data['title'],data.index))

In [6]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


import nltk
nltk.download("stopwords")
nltk.download('punkt')

In [7]:
class CustomTokenizer:
    def __init__(self):
        self.stemmer = PorterStemmer()
    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in word_tokenize(doc) if token.isalpha()]
custom_tokenizer = CustomTokenizer()
stopwords = custom_tokenizer(' '.join(stopwords.words('english')))

In [8]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
vect = TfidfVectorizer(stop_words=stopwords,max_features=500,token_pattern=r'\b[A-Za-z][A-Za-z]+\b', tokenizer=custom_tokenizer)
bag = vect.fit_transform(data['plot_synopsis']).toarray()
movie_df = pd.DataFrame(bag,columns=vect.get_feature_names_out())



In [10]:
lk = linear_kernel(bag,bag)

In [27]:
def recommend_movie(movie_name):
    idx = ind_dict[movie_name]
    tags = data.loc[idx,'tags'].split(', ')
    lk_w_idx = list(enumerate(lk[idx]))
    found = [tupl[0] for tupl in sorted(lk_w_idx,reverse=True, key=lambda x:x[1])]
    movies  = data.loc[found,['title']][1:100]
    movies['score'] = range(99,0,-1)
    for idx in movies.index.to_list():
        movie_tags = data.loc[idx, 'tags'].split(", ")
        for tag in movie_tags:
            if tag in tags:
                movies.loc[idx,'score'] += 10
            else:
                movies.loc[idx,'score'] -= 2
        return movies.sort_values(by='score',ascending=False)[:10]

In [43]:
recommend_movie('The Shawshank Redemption')

Unnamed: 0,title,score
2135,Hoodwinked!,117
8739,Raw! Raw! Rooster!,98
8593,Dick Figures: The Movie,97
2633,Batman: Under the Red Hood,96
191,A Perfect World,95
8427,Boiling Point,94
8518,3-4 x jûgatsu,93
9062,Cheyenne Wildcat,92
6320,Little Red Riding Rabbit,91
4321,The Hunt for Red October,90


In [28]:
recommend_movie('The Godfather')

Unnamed: 0,title,score
2482,The Godfather: Part II,123
13677,The Godfather II,98
1999,The Godfather: Part III,97
10986,Grand Theft Auto V,96
2305,Halloween,95
1641,Arlington Road,94
12488,The Lady from Shanghai,93
595,Fireflies in the Garden,92
1786,Tootsie,91
3372,Halloween II,90


In [29]:
recommend_movie('The Dark Knight')

Unnamed: 0,title,score
4695,The Dark Knight Rises,193
5684,Machete,98
329,Batman Begins,97
4230,The Big Lebowski,96
1305,Licence to Kill,95
1855,The International,94
5217,Point of No Return,93
1491,Daredevil,92
4436,From Paris with Love,91
137,The Bourne Supremacy,90


In [32]:
recommend_movie('12 Angry Men')

Unnamed: 0,title,score
5322,Changeling,99
1857,The Maze Runner,98
8594,Yaadon Ki Baaraat,97
5223,Inserts,96
4078,Primal Fear,95
10480,Amusement,94
1551,Fatal Vision,93
9479,Where the River Runs Black,92
7502,Seduced by Madness: The Diane Borchardt Story,91
8896,Frenzy,90


In [33]:
recommend_movie('Schindler\'s List')

Unnamed: 0,title,score
3236,Hart's War,107
2712,The Hallelujah Trail,98
4072,The Killing Fields,97
1597,Zulu Dawn,96
2374,Mulan,95
8989,Fire Emblem: Radiant Dawn,94
9231,La tregua,93
157,Nine to Five,92
739,Buck Privates,91
7124,Shoah,90


In [34]:
recommend_movie('The Lord of the Rings: The Return of the King')

Unnamed: 0,title,score
306,The Lord of the Rings: The Two Towers,205
8501,Transformers: The Game,98
3059,Tron,97
5278,Transformers,96
1810,Moon,95
5886,Buccaneer Bunny,94
2000,Transformers: Dark of the Moon,93
4054,If Only,92
13387,Horse Hare,91
13570,Burn Notice: The Fall of Sam Axe,90


In [35]:
recommend_movie('Pulp Fiction')

Unnamed: 0,title,score
4230,The Big Lebowski,193
5217,Point of No Return,98
351,Trance,97
4436,From Paris with Love,96
5706,Flickan som lekte med elden,95
3561,The Raid 2: Berandal,94
11559,Brigada,93
2138,Per qualche dollaro in più,92
2944,Mulholland Dr.,91
4542,The Dark Knight,90


In [39]:
recommend_movie('Forrest Gump')

Unnamed: 0,title,score
9238,A Shine of Rainbows,109
2737,The Boat That Rocked,98
4848,Piranha 3D,97
1696,Underground,96
857,Hong Xi Guan,95
12024,The Crater Lake Monster,94
1001,Mechanic: Resurrection,93
5035,Jaws,92
5039,Act of Valor,91
8303,Nicholas Nickleby,90


In [40]:
recommend_movie('Fight Club')

Unnamed: 0,title,score
4542,The Dark Knight,193
4832,The Adjustment Bureau,98
329,Batman Begins,97
5684,Machete,96
4147,Watchmen,95
1743,The Karate Kid Part III,94
812,Joy,93
3004,Derailed,92
5217,Point of No Return,91
3287,American Hustle,90


In [41]:
recommend_movie('Inception')

Unnamed: 0,title,score
3692,Enchanted,101
13503,A Life Less Ordinary,98
10448,Curse of the Puppet Master,97
2774,The Omen,96
9488,Papurika,95
13220,Paprika,94
4884,I Am Legend,93
3805,Chi sei?,92
4155,The Incredibles,91
8489,The Sleeping Car,90
