### Merge with IMDB ratings dataset

In [1]:
# Import Criterion dataset
import pandas as pd
data = pd.read_csv('data\Criterion.csv')

In [2]:
# Convert year to string
data['Year'] = data['Year'].astype(str)

In [3]:
# Combine title and year for matching
data['titleYear'] = data['Title'] + ' ' + data['Year']

In [4]:
len(data)

1620

In [5]:
data.head()

Unnamed: 0,Title,Director,Country,Year,Decade,Duration,Description,Total Hours,Url,titleYear
0,2 or 3 Things I Know About Her,Jean-Luc Godard,France,1967,1960s,1:27,In 2 OR 3 THINGS I KNOW ABOUT HER (2 OU 3 CHOS...,1.45,https://www.criterionchannel.com/2-or-3-things...,2 or 3 Things I Know About Her 1967
1,3 Faces,Jafar Panahi,Iran,2018,2010s,1:40,Iranian master Jafar Panahi’s fourth feature s...,1.67,https://www.criterionchannel.com/3-faces,3 Faces 2018
2,"4 Months, 3 Weeks and 2 Days",Cristian Mungiu,Romania,2007,2000s,1:53,Romanian filmmaker Cristian Mungiu shot to int...,1.88,https://www.criterionchannel.com/4-months-3-we...,"4 Months, 3 Weeks and 2 Days 2007"
3,"The VI Olympic Winter Games, Oslo 1952",Tankred Ibsen,Norway,1952,1950s,1:43,Director Tancred Ibsen's penchant for depictin...,1.72,https://www.criterionchannel.com/the-vi-olympi...,"The VI Olympic Winter Games, Oslo 1952 1952"
4,8½,Federico Fellini,Italy,1963,1960s,2:19,"Marcello Mastroianni plays Guido Anselmi, a di...",2.32,https://www.criterionchannel.com/81-2,8½ 1963


In [11]:
# Import IMDB dataset
ratings = pd.read_csv('title.ratings.tsv', sep = '\t')
basics = pd.read_csv('title.basics.tsv', sep = '\t', low_memory = False)
basics = basics.drop(['titleType', 'isAdult', 'endYear', 'runtimeMinutes', 'genres'], axis = 1)

In [12]:
# Merge to obtain ratings + titles
imdb_merged = ratings.merge(basics, left_on = 'tconst', right_on = 'tconst')

In [13]:
# Combine title and year for matching
imdb_merged['titleYear'] = imdb_merged['primaryTitle'] + ' ' + imdb_merged['startYear']

In [14]:
len(imdb_merged)

1134796

In [15]:
# With duplicate titleYear, keep the one with the most numVotes
imdb_merged = imdb_merged.sort_values(['titleYear', 'numVotes'], ascending = False)
imdb_merged['isDuplicated'] = imdb_merged.duplicated('titleYear', keep = 'first')
imdb_merged = imdb_merged[imdb_merged['isDuplicated'] == False]

In [16]:
len(imdb_merged)

1061173

In [17]:
imdb_merged.head()

Unnamed: 0,tconst,averageRating,numVotes,primaryTitle,originalTitle,startYear,titleYear,isDuplicated
596512,tt1381887,7.1,83,Солодкі мрії,Sweet Dreams,2008,Солодкі мрії 2008,False
735985,tt2234575,6.8,32,ö,ö,2012,ö 2012,False
577900,tt13273532,8.2,8,êmîcêtôcêt: Many Bloodlines,êmîcêtôcêt: Many Bloodlines,2020,êmîcêtôcêt: Many Bloodlines 2020,False
184529,tt0317414,5.6,15,él,él,2001,él 2001,False
809206,tt3207532,5.7,17,éX-Driver the Movie,éX-Driver the Movie,2002,éX-Driver the Movie 2002,False


In [18]:
# Sanity check
imdb_merged[imdb_merged['titleYear'] == 'Weekend 2011']

Unnamed: 0,tconst,averageRating,numVotes,primaryTitle,originalTitle,startYear,titleYear,isDuplicated
660678,tt1714210,7.6,28314,Weekend,Weekend,2011,Weekend 2011,False


### TD-IDF Cosine Similarity String Matching
https://github.com/Bergvca/string_grouper

In [19]:
# Import string_grouper
from string_grouper import match_strings, \
match_most_similar, group_similar_strings, \
compute_pairwise_similarities, StringGrouper

To do: Find a way to match on both primaryTitle and originalTitle

In [20]:
# Match on titleYear
matches = match_most_similar(imdb_merged['titleYear'], data['titleYear'])

In [21]:
matches

Unnamed: 0,most_similar_index,most_similar_titleYear
0,38571.0,2 or 3 Things I Know About Her 1967
1,1079513.0,3 Faces 2018
2,454250.0,"4 Months, 3 Weeks and 2 Days 2007"
3,137372.0,"The VI Olympic Winter Games, Oslo 1952 1952"
4,35517.0,8½ 1963
...,...,...
1615,149722.0,Y Tu Mamá También 2001
1616,42766.0,Z 1969
1617,33489.0,Zazie dans le Métro 1960
1618,141852.0,Zero Focus 1961


In [22]:
# Drop NAs (no match)
nas = matches[matches['most_similar_index'].isna()]
nas = nas['most_similar_titleYear']
matches = matches.dropna(axis = 0)

In [23]:
# Check NAs
nas

10                                 21 Days 1940
15                    THE 47 RONIN: Part 1 1941
19                                  Abouna 2002
22                      An Actor’s Revenge 1963
24                 Adventures of a Dentist 1965
                         ...                   
1593               WORLD ON A WIRE: Part 1 1973
1598                               Xiao Wu 1997
1600                                Yeelen 1987
1611                         Youth in Fury 1960
1613    You Were Like a Wild Chrysanthemum 1955
Name: most_similar_titleYear, Length: 186, dtype: object

In [24]:
matches.head()

Unnamed: 0,most_similar_index,most_similar_titleYear
0,38571.0,2 or 3 Things I Know About Her 1967
1,1079513.0,3 Faces 2018
2,454250.0,"4 Months, 3 Weeks and 2 Days 2007"
3,137372.0,"The VI Olympic Winter Games, Oslo 1952 1952"
4,35517.0,8½ 1963


In [25]:
# Remove NAs from data
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')
data = data[~data['titleYear'].str.contains('|'.join(nas))]

In [26]:
# Rename Criterion titles based on IMDB titles
pd.options.mode.chained_assignment = None
data['newTitleYear'] = ''
for i in range(0, data.index[-1] + 1):
    try:
        index = int(matches.loc[i, 'most_similar_index'])
    except:
        pass
    data.loc[i, 'newTitleYear'] = imdb_merged.loc[index, 'titleYear']

In [27]:
len(data)

1620

In [28]:
data = data.dropna(axis = 0)

In [29]:
len(data)

1436

In [30]:
data

Unnamed: 0,Title,Director,Country,Year,Decade,Duration,Description,Total Hours,Url,titleYear,newTitleYear
0,2 or 3 Things I Know About Her,Jean-Luc Godard,France,1967,1960s,1:27,In 2 OR 3 THINGS I KNOW ABOUT HER (2 OU 3 CHOS...,1.45,https://www.criterionchannel.com/2-or-3-things...,2 or 3 Things I Know About Her 1967,2 or 3 Things I Know About Her 1967
1,3 Faces,Jafar Panahi,Iran,2018,2010s,1:40,Iranian master Jafar Panahi’s fourth feature s...,1.67,https://www.criterionchannel.com/3-faces,3 Faces 2018,3 Faces 2018
2,"4 Months, 3 Weeks and 2 Days",Cristian Mungiu,Romania,2007,2000s,1:53,Romanian filmmaker Cristian Mungiu shot to int...,1.88,https://www.criterionchannel.com/4-months-3-we...,"4 Months, 3 Weeks and 2 Days 2007","4 Months, 3 Weeks and 2 Days 2007"
3,"The VI Olympic Winter Games, Oslo 1952",Tankred Ibsen,Norway,1952,1950s,1:43,Director Tancred Ibsen's penchant for depictin...,1.72,https://www.criterionchannel.com/the-vi-olympi...,"The VI Olympic Winter Games, Oslo 1952 1952","The VI Olympic Winter Games, Oslo 1952 1952"
4,8½,Federico Fellini,Italy,1963,1960s,2:19,"Marcello Mastroianni plays Guido Anselmi, a di...",2.32,https://www.criterionchannel.com/81-2,8½ 1963,8½ 1963
...,...,...,...,...,...,...,...,...,...,...,...
1615,Y tu mamá también,Alfonso Cuarón,Mexico,2001,2000s,1:45,This smash road comedy from Oscar-winning dire...,1.75,https://www.criterionchannel.com/y-tu-mama-tam...,Y tu mamá también 2001,Y Tu Mamá También 2001
1616,Z,Costa-Gavras,Greece,1969,1960s,2:07,"A pulse-pounding political thriller, Greek exp...",2.12,https://www.criterionchannel.com/z-1,Z 1969,Z 1969
1617,Zazie dans le métro,Louis Malle,France,1960,1960s,1:32,A brash and precocious ten-year-old (Catherine...,1.53,https://www.criterionchannel.com/zazie-dans-le...,Zazie dans le métro 1960,Zazie dans le Métro 1960
1618,Zero Focus,Yoshitaro Nomura,Japan,1961,1960s,1:35,After her husband disappears on a business tri...,1.58,https://www.criterionchannel.com/zero-focus,Zero Focus 1961,Zero Focus 1961


In [31]:
criterion_merged = data.merge(imdb_merged, left_on = 'newTitleYear', right_on = 'titleYear')

In [32]:
len(criterion_merged)

1436

In [33]:
criterion_merged.sort_values('averageRating', ascending = False)

Unnamed: 0,Title,Director,Country,Year,Decade,Duration,Description,Total Hours,Url,titleYear_x,newTitleYear,tconst,averageRating,numVotes,primaryTitle,originalTitle,startYear,titleYear_y,isDuplicated
722,Look Back in Anger,Tony Richardson,United Kingdom,1958,1950s,1:40,Jimmy Porter (Richard Burton) is a university ...,1.67,https://www.criterionchannel.com/look-back-in-...,Look Back in Anger 1958,Look Back in Anger 1958,tt0272009,9.5,8,Look Back in Anger,Blick zurück im Zorn,1958,Look Back in Anger 1958,False
942,Pather Panchali,Satyajit Ray,India,1955,1950s,2:05,With the release in 1955 of Satyajit Ray’s deb...,2.08,https://www.criterionchannel.com/pather-panchali,Pather Panchali 1955,Pather Panchali 1955,tt0048473,8.6,23962,Pather Panchali,Pather Panchali,1955,Pather Panchali 1955,False
520,Harakiri,Masaki Kobayashi,Japan,1962,1960s,2:12,"Following the collapse of his clan, an unemplo...",2.20,https://www.criterionchannel.com/harakiri,Harakiri 1962,Hara-Kiri 1962,tt0056058,8.6,43601,Hara-Kiri,Seppuku,1962,Hara-Kiri 1962,False
1110,Seven Samurai,Akira Kurosawa,Japan,1954,1950s,3:27,One of the most thrilling movie epics of all t...,3.45,https://www.criterionchannel.com/seven-samurai,Seven Samurai 1954,Seven Samurai 1954,tt0047478,8.6,318830,Seven Samurai,Shichinin no samurai,1954,Seven Samurai 1954,False
1304,Le trou,Jacques Becker,France,1960,1960s,2:11,"In a Paris prison cell, five inmates use every...",2.18,https://www.criterionchannel.com/le-trou-1,Le trou 1960,Le Trou 1960,tt0054407,8.5,15675,Le Trou,Le trou,1960,Le Trou 1960,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,I Was a Teenage Zombie,John Elias Michalakis,United States,1987,1980s,1:31,A group of teens looking to score some weed un...,1.52,https://www.criterionchannel.com/i-was-a-teena...,I Was a Teenage Zombie 1987,I Was a Teenage Zombie 1987,tt0093238,4.5,729,I Was a Teenage Zombie,I Was a Teenage Zombie,1987,I Was a Teenage Zombie 1987,False
333,Dont Look Back,D. A. Pennebaker,United States,1967,1960s,1:36,Bob Dylan is captured on-screen as he never wo...,1.60,https://www.criterionchannel.com/dont-look-back,Dont Look Back 1967,Dont Look Back 2018,tt9095892,4.2,24,Dont Look Back,Dont Look Back,2018,Dont Look Back 2018,False
98,Beware! The Blob,Larry Hagman,United States,1972,1970s,1:27,The Blob returns—and is more outrageous than e...,1.45,https://www.criterionchannel.com/beware-the-blob,Beware! The Blob 1972,Beware! The Blob 1972,tt0068271,4.1,1841,Beware! The Blob,Beware! The Blob,1972,Beware! The Blob 1972,False
25,All Monsters Attack,Ishiro Honda,Japan,1969,1960s,1:09,Director Ishiro Honda returned again for the f...,1.15,https://www.criterionchannel.com/all-monsters-...,All Monsters Attack 1969,All Monsters Attack 1969,tt0064373,3.9,4047,All Monsters Attack,Gojira-Minira-Gabara: Oru kaijû daishingeki,1969,All Monsters Attack 1969,False


In [34]:
# Save to csv
criterion_merged.to_csv('data\Merged.csv', index = False)