# Imports

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Setup

In [2]:
pd.set_option("max.colwidth", 0)

# Data Sourcing

In [3]:
films = pd.read_csv("data/16k_movies.csv")

In [4]:
films.head()

Unnamed: 0.1,Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres
0,0,Dekalog (1988),"Mar 22, 1996","This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply personal and universally human. Its ten hour-long films, drawing from the Ten Commandments for thematic inspiration and an overarching structure, grapple deftly with complex moral and existential questions concerning life, death, love, hate, truth, and the passage of time. Shot by nine different cinematographers, with stirring music by Zbigniew Preisner and compelling performances from established and unknown actors alike, Dekalog arrestingly explores the unknowable forces that shape our lives. Also available are the longer theatrical versions of the series’ fifth and sixth films: A Short Film About Killing and A Short Film About Love. [Janus Films]",7.4,118,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz",9 h 32 m,Drama
1,1,Three Colors: Red,"Nov 23, 1994","Krzysztof Kieslowski closes his Three Colors trilogy in grand fashion, with an incandescent meditation on fate and chance, starring Irène Jacob as a sweet-souled yet somber runway model in Geneva whose life dramatically intersects with that of a bitter retired judge, played by Jean-Louis Trintignant. Meanwhile, just down the street, a seemingly unrelated story of jealousy and betrayal unfolds. Red is an intimate look at forged connections and a splendid final statement from a remarkable filmmaker at the height of his powers. [Criterion]",8.3,241,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz, Agnieszka Holland, Edward Zebrowski, Edward Klosinski, Marcin Latallo",1 h 39 m,"Drama,Mystery,Romance"
2,2,The Conformist,"Oct 22, 1970","Set in Rome in the 1930s, this re-release of Bernardo Bertolucci's 1970 breakthrough feature stars Jean-Louis Trintignant as a Mussolini operative sent to Paris to locate and eliminate an old professor who fled Italy when the fascists came to power.",7.3,106,Bernardo Bertolucci,"Alberto Moravia, Bernardo Bertolucci",1 h 47 m,Drama
3,3,Tokyo Story,"Mar 13, 1972","Yasujiro Ozu’s Tokyo Story follows an aging couple, Tomi and Sukichi, on their journey from their rural village to visit their two married children in bustling, postwar Tokyo. Their reception is disappointing: too busy to entertain them, their children send them off to a health spa. After Tomi falls ill she and Sukichi return home, while the children, grief-stricken, hasten to be with her. From a simple tale unfolds one of the greatest of all Japanese films. Starring Ozu regulars Chishu Ryu and Setsuko Hara, the film reprises one of the director’s favorite themes—that of generational conflict—in a way that is quintessentially Japanese and yet so universal in its appeal that it continues to resonate as one of cinema’s greatest masterpieces. [Janus Films]",8.1,147,Yasujirô Ozu,"Kôgo Noda, Yasujirô Ozu",2 h 16 m,Drama
4,4,The Leopard (re-release),"Aug 13, 2004","Set in Sicily in 1860, Luchino Visconti's spectacular 1963 adaptation of Giuseppe di Lampedusa's international bestseller is one of the cinema's greatest evocations of the past, achingly depicting the passing of an ancient order. (Film Forum)",7.8,85,Luchino Visconti,"Giuseppe Tomasi di Lampedusa, Suso Cecchi D'Amico, Pasquale Festa Campanile, Enrico Medioli, Massimo Franciosa, Luchino Visconti",3 h 7 m,"Drama,History"


In [5]:
# Bad columns
films = films.drop(columns = ["Unnamed: 0"])
#Unwanted columns
films = films.drop(columns = ["Directed by", "Written by"])

In [6]:
films = films.dropna()

In [7]:
films["Release Date"] = pd.to_datetime(films["Release Date"])

In [8]:
duration_df = films["Duration"].str.extract(r"(\d+) h (\d+) m", expand=False)

duration_df = duration_df.fillna(0)

duration_df["total_duration"] = duration_df.apply(
    lambda x: int(x[0]) * 60 + int(x[1]), axis=1)

In [9]:
films["Duration"] = duration_df["total_duration"]

films = films[films["Duration"] > 60]

In [10]:
films.head()

Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Duration,Genres
0,Dekalog (1988),1996-03-22,"This masterwork by Krzysztof Kieślowski is one of the twentieth century’s greatest achievements in visual storytelling. Originally made for Polish television, Dekalog focuses on the residents of a housing complex in late-Communist Poland, whose lives become subtly intertwined as they face emotional dilemmas that are at once deeply personal and universally human. Its ten hour-long films, drawing from the Ten Commandments for thematic inspiration and an overarching structure, grapple deftly with complex moral and existential questions concerning life, death, love, hate, truth, and the passage of time. Shot by nine different cinematographers, with stirring music by Zbigniew Preisner and compelling performances from established and unknown actors alike, Dekalog arrestingly explores the unknowable forces that shape our lives. Also available are the longer theatrical versions of the series’ fifth and sixth films: A Short Film About Killing and A Short Film About Love. [Janus Films]",7.4,118,572,Drama
1,Three Colors: Red,1994-11-23,"Krzysztof Kieslowski closes his Three Colors trilogy in grand fashion, with an incandescent meditation on fate and chance, starring Irène Jacob as a sweet-souled yet somber runway model in Geneva whose life dramatically intersects with that of a bitter retired judge, played by Jean-Louis Trintignant. Meanwhile, just down the street, a seemingly unrelated story of jealousy and betrayal unfolds. Red is an intimate look at forged connections and a splendid final statement from a remarkable filmmaker at the height of his powers. [Criterion]",8.3,241,99,"Drama,Mystery,Romance"
2,The Conformist,1970-10-22,"Set in Rome in the 1930s, this re-release of Bernardo Bertolucci's 1970 breakthrough feature stars Jean-Louis Trintignant as a Mussolini operative sent to Paris to locate and eliminate an old professor who fled Italy when the fascists came to power.",7.3,106,107,Drama
3,Tokyo Story,1972-03-13,"Yasujiro Ozu’s Tokyo Story follows an aging couple, Tomi and Sukichi, on their journey from their rural village to visit their two married children in bustling, postwar Tokyo. Their reception is disappointing: too busy to entertain them, their children send them off to a health spa. After Tomi falls ill she and Sukichi return home, while the children, grief-stricken, hasten to be with her. From a simple tale unfolds one of the greatest of all Japanese films. Starring Ozu regulars Chishu Ryu and Setsuko Hara, the film reprises one of the director’s favorite themes—that of generational conflict—in a way that is quintessentially Japanese and yet so universal in its appeal that it continues to resonate as one of cinema’s greatest masterpieces. [Janus Films]",8.1,147,136,Drama
4,The Leopard (re-release),2004-08-13,"Set in Sicily in 1860, Luchino Visconti's spectacular 1963 adaptation of Giuseppe di Lampedusa's international bestseller is one of the cinema's greatest evocations of the past, achingly depicting the passing of an ancient order. (Film Forum)",7.8,85,187,"Drama,History"


In [11]:
films.dtypes

Title                  object        
Release Date           datetime64[ns]
Description            object        
Rating                 float64       
No of Persons Voted    object        
Duration               int64         
Genres                 object        
dtype: object

In [12]:
films["No of Persons Voted"]

0        118
1        241
2        106
3        147
4        85 
         .. 
16277    21 
16278    302
16279    248
16280    485
16282    10 
Name: No of Persons Voted, Length: 12505, dtype: object

In [13]:
films["No of Persons Voted"] = films["No of Persons Voted"].apply(lambda x: int("".join(l for l in x if l.isdigit())))

In [14]:
films["No of Persons Voted"]

0        118
1        241
2        106
3        147
4        85 
         .. 
16277    21 
16278    302
16279    248
16280    485
16282    10 
Name: No of Persons Voted, Length: 12505, dtype: int64

In [15]:
films.dtypes

Title                  object        
Release Date           datetime64[ns]
Description            object        
Rating                 float64       
No of Persons Voted    int64         
Duration               int64         
Genres                 object        
dtype: object

In [16]:
films.columns = films.columns.str.lower().str.replace(" ", "_")
films = films.rename(columns = {"no_of_person_voted" : "votes"})

## Text cleaning

In [17]:
keywords = films["description"]

In [18]:
# Remove stopwords (grammatically but not semantically valuable)
# bigrams
# misspelling
# synonyms/homonyms/multiple definitions
# cases
# plurals punctuation

In [19]:
keywords = keywords.str.lower()

In [20]:
stop_words = stopwords.words("english")
stop_words.extend(["'ve", "nt", "re-release", "starring", "directed", "award", "adaptation"])

In [21]:
keyword_tokens = keywords.apply(word_tokenize)

In [22]:
def remove_unwanted_words(tokens: list[str]) -> list[str]:
    """Returns a list of tokens filtered for undesirables."""
    return [t for t in tokens if t not in stop_words and len(t) >= 3 and not t.isdigit()]

keyword_tokens = keyword_tokens.apply(remove_unwanted_words)

In [23]:
#c_vec = CountVectorizer(max_features=10000)
t_vec = TfidfVectorizer(max_features=5000)

# boolean - is the word there
# count - how many times is the word there
# how much is the word there relative to how common the word is

In [24]:
keyword_strings = keyword_tokens.apply(" ".join)
keyword_vectors = t_vec.fit_transform(keyword_strings)

In [25]:
keyword_vectors.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(12505, 5000))

In [26]:
#c_vec.get_feature_names_out()

# Similarity 

In [27]:
cosine_similarity([[0, 0, 1]], [[1, 1, 0], [0, 0, 1], [1, 0, 1]])

array([[0.        , 1.        , 0.70710678]])

In [28]:
films.iloc[5734]

title                  Café Society                                                                                                                                                                                                                
release_date           2016-07-15 00:00:00                                                                                                                                                                                                         
description            A young man (Jesse Eisenberg) arrives in Hollywood during the 1930s hoping to work in the film industry. There, he falls in love, and finds himself swept up in the vibrant café society that defined the spirit of the age.
rating                 6.6                                                                                                                                                                                                                         
no_of_persons_voted    1

In [29]:
keyword_vectors.shape

(12505, 5000)

In [30]:
#keyword_vectors[5734].toarray()

In [31]:
joana_film = keyword_vectors[5734]

In [32]:
joana_film_similarities = cosine_similarity(joana_film, keyword_vectors)

In [33]:
films["j_score"] = joana_film_similarities[0]

In [34]:
films.sort_values("j_score", ascending=False)[["title", "j_score"]].head(10)

Unnamed: 0,title,j_score
7564,Café Society,1.0
658,Los Angeles Plays Itself,0.182663
3978,Anaïs in Love,0.171778
6245,1984,0.1706
8942,Celeste and Jesse Forever,0.163814
10542,Liberal Arts,0.156539
1960,Mank,0.156092
4481,Mulan,0.15224
4482,Mulan,0.15224
13910,At First Sight,0.150272


In [35]:
ben_vector = t_vec.transform(["magic dragons wizards fire space"])

In [36]:
ben_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (1, 5000)>

In [37]:
films["b_score"] = cosine_similarity(ben_vector, keyword_vectors)[0]

In [38]:
films[["title", "b_score"]].sort_values("b_score", ascending=False).head(10)

Unnamed: 0,title,b_score
2756,How to Train Your Dragon 2,0.30802
14180,SpaceCamp,0.302184
16062,Wing Commander: Space Will Never Be the Same,0.235717
13065,Fire and Ice,0.233324
3493,Raya and the Last Dragon,0.23298
15351,Your Highness,0.226204
4067,Fire Will Come,0.212788
2080,Quest for Fire,0.208947
10180,Return to Space,0.20812
13457,Firehouse Dog,0.198351


In [39]:
keyword_tokens[12588]

['larry',
 'daley',
 'ben',
 'stiller',
 'heads',
 'london',
 'revitalize',
 'magic',
 'life-giving',
 'tablet']