In [1]:
import re

import pandas as pd
from Levenshtein import distance

In [2]:
# Movie metadata, one JSON record per line. Fields:
#   title, year, popularity, directedBy, starring, avgRating, imdbId, item_id
# imdbId is read as text: values such as "tt0133093" are identifiers, not numbers.
movies = pd.read_json(
    'data/movies.json',
    lines=True,
    dtype={'imdbId': str},
)
movies.head()

Unnamed: 0,title,year,popularity,directedBy,starring,avgRating,imdbId,item_id
0,The Matrix (1999),1999,28693,"Andy Wachowski, Larry Wachowski","Laurence Fishburne, Keanu Reeves, Hugo Weaving...",4.15952,tt0133093,2571
1,The Shawshank Redemption (1994),1994,27201,Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",4.41985,tt0111161,318
2,Inception (2010),2010,26153,Christopher Nolan,"Leonardo DiCaprio, Ken Watanabe, Joseph Gordon...",4.17404,tt1375666,79132
3,Forrest Gump (1994),1994,24417,Robert Zemeckis,"Tom Hanks, Gary Sinise, Mykelti Williamson, Ro...",4.06633,tt0109830,356
4,The Lord of the Rings: The Return of the King ...,2003,23997,Peter Jackson,"Sean Astin, Ian McKellen, Viggo Mortensen, Eli...",4.10393,tt0167260,7153


In [3]:
# Plot summaries table. Original column names:
#   Release Year, Title, Origin/Ethnicity, Director, Cast, Genre, Wiki Page, Plot
plots = pd.read_csv(filepath_or_buffer='original-data/wiki_movie_plots_deduped.csv')
plots.tail()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...
34885,2017,İstanbul Kırmızısı,Turkish,Ferzan Özpetek,"Halit Ergenç, Tuba Büyüküstün, Mehmet Günsür, ...",romantic,https://en.wikipedia.org/wiki/%C4%B0stanbul_K%...,The writer Orhan Şahin returns to İstanbul aft...


In [4]:
# Give the plots columns the same names the movies dataframe uses (where a counterpart
# exists) and camelCase names for the remaining ones.
plots = plots.rename(
    columns={
        'Release Year': 'year',
        'Title': 'title',
        'Origin/Ethnicity': 'origin',
        'Director': 'directedBy',
        'Cast': 'starring',
        'Genre': 'genre',
        'Wiki Page': 'wikiPage',
        'Plot': 'plot',
    }
)
plots.tail()

Unnamed: 0,year,title,origin,directedBy,starring,genre,wikiPage,plot
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...
34885,2017,İstanbul Kırmızısı,Turkish,Ferzan Özpetek,"Halit Ergenç, Tuba Büyüküstün, Mehmet Günsür, ...",romantic,https://en.wikipedia.org/wiki/%C4%B0stanbul_K%...,The writer Orhan Şahin returns to İstanbul aft...


In [5]:
# Bucket the movie rows by release year: {year: [row, row, ...]}.
# Candidates for a plot can then be fetched with one dictionary lookup
# instead of scanning the whole movies dataframe.
movies_dict = {}
for row_label, movie_row in movies.iterrows():
    movies_dict.setdefault(movie_row['year'], []).append(movie_row)


In [6]:
def titles_distance(plot_title, movie_title):
    """Return the smallest Levenshtein distance between any two names of the titles.

    A title may carry alternative names: ``"Name (a.k.a. Other)"`` on the movie side
    and ``"Name (aka Other)"`` on the plot side. Every name of the plot title is
    compared with every name of the movie title and the minimum distance is returned.

    Parameters
    ----------
    plot_title : str
        Title taken from the plots data.
    movie_title : str
        Title taken from the movies data (without the year suffix).

    Returns
    -------
    int
        Minimum edit distance over all name combinations.
    """

    def split_variants(title, separator):
        # The first piece is the primary name, every further piece is an alias.
        primary, *aliases = title.split(separator)
        variants = [primary]
        for alias in aliases:
            # Drop the parenthesis closing "(aka ...)" only when it is really there,
            # so an unterminated alias does not lose its last character, and do it
            # for every alias rather than just the first one.
            variants.append(alias[:-1] if alias.endswith(')') else alias)
        return variants

    movie_variants = split_variants(movie_title, " (a.k.a. ")
    plot_variants = split_variants(plot_title, " (aka ")

    return min(
        distance(plot_variant, movie_variant)
        for movie_variant in movie_variants
        for plot_variant in plot_variants
    )
    

def get_best_match(plot_row, movies_dict, ideal_distance=0):
    """Find the movie released in the plot's year whose title is closest to the plot's.

    Parameters
    ----------
    plot_row : mapping
        Row of the plots data; its ``'title'`` and ``'year'`` entries are read.
    movies_dict : dict
        Maps a release year to the list of movie rows of that year. Each movie row
        must provide a ``'title'`` entry.
    ideal_distance : int, default 0
        Largest title distance (as computed by ``titles_distance``) that still
        counts as a match.

    Returns
    -------
    The movie row with the smallest distance not exceeding ``ideal_distance`` (the
    earliest one in the list on ties), or ``None`` when the year is unknown or no
    candidate is close enough.
    """
    plot_title, year = plot_row['title'], plot_row['year']
    candidates = movies_dict.get(year)
    if not candidates:
        return None

    # Movie titles are expected to end with the release year, e.g. "Inception (2010)".
    # Strip that suffix only when it is actually present (tolerating stray whitespace)
    # instead of blindly cutting the last seven characters.
    year_suffix = r'\s*\(\d{4}\)\s*$'

    best_movie, best_dist = None, None
    for movie in candidates:
        movie_title = re.sub(year_suffix, '', movie['title'])
        dist = titles_distance(plot_title, movie_title)
        if dist > ideal_distance:
            continue
        if dist == 0:
            # Cannot be beaten, stop searching
            return movie
        if best_dist is None or dist < best_dist:
            best_movie, best_dist = movie, dist

    return best_movie
    
def match_titles(plots, movies_dict, max_distance=3):
    """Add an ``item_id`` column to ``plots`` with the id of the matching movie.

    Matching runs in passes of increasing tolerance: first only titles at distance 0
    are accepted, then distance 1, and so on up to ``max_distance``. A movie that has
    been assigned to a plot is no longer offered to other plots.

    Parameters
    ----------
    plots : pandas.DataFrame
        Plots data with ``'title'`` and ``'year'`` columns. The ``item_id`` column is
        (re)created on this very frame, i.e. the frame is modified in place.
    movies_dict : dict
        Maps a release year to the list of movie rows of that year. Each movie row
        must provide ``'title'``, ``'year'`` and ``'item_id'`` entries. It is not
        modified, so the function can be called again with the same dictionary.
    max_distance : int, default 3
        Largest title distance accepted in the final pass.

    Returns
    -------
    pandas.DataFrame
        The same ``plots`` frame; unmatched rows keep a missing ``item_id``.
    """
    plots['item_id'] = None

    # Consume movies from private per-year lists so the caller's dictionary stays
    # intact; otherwise a second run would match against an already depleted pool.
    remaining = {year: list(candidates) for year, candidates in movies_dict.items()}

    # Track matched rows explicitly instead of testing the column for None: the
    # check then keeps working even if pandas stores the missing ids as NaN.
    matched = set()

    for ideal_distance in range(max_distance + 1):
        for index, row in plots.iterrows():
            if index in matched:
                continue

            movie = get_best_match(row, remaining, ideal_distance)
            if movie is None:
                continue

            plots.at[index, 'item_id'] = movie['item_id']
            matched.add(index)

            # Each movie may be assigned to one plot only
            year = movie['year']
            remaining[year] = [m for m in remaining[year] if m['item_id'] != movie['item_id']]

    return plots

# Attach the matched item_id to every plot, then preview the last rows
plots = match_titles(plots=plots, movies_dict=movies_dict)
plots.tail()

Unnamed: 0,year,title,origin,directedBy,starring,genre,wikiPage,plot,item_id
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ...",117871.0
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the...",
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i...",
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...,
34885,2017,İstanbul Kırmızısı,Turkish,Ferzan Özpetek,"Halit Ergenç, Tuba Büyüküstün, Mehmet Günsür, ...",romantic,https://en.wikipedia.org/wiki/%C4%B0stanbul_K%...,The writer Orhan Şahin returns to İstanbul aft...,


In [7]:
# Move item_id to the front; every other column keeps its relative order
cols = ['item_id', *(name for name in plots.columns if name != 'item_id')]
plots = plots[cols]
plots.tail()

Unnamed: 0,item_id,year,title,origin,directedBy,starring,genre,wikiPage,plot
34881,117871.0,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...
34885,,2017,İstanbul Kırmızısı,Turkish,Ferzan Özpetek,"Halit Ergenç, Tuba Büyüküstün, Mehmet Günsür, ...",romantic,https://en.wikipedia.org/wiki/%C4%B0stanbul_K%...,The writer Orhan Şahin returns to İstanbul aft...


In [8]:
# Number of plots that received an item_id (count() skips missing values)
print(plots['item_id'].count())

9147


In [9]:
# Keep only the plots that were matched to a movie. With the missing values gone,
# item_id can be stored as an integer, so the ids no longer show up (or get
# exported) in float form such as "117871.0".
plots = plots[plots['item_id'].notnull()].astype({'item_id': 'int64'})

In [10]:
# Order the matched plots by ascending item_id
plots = plots.sort_values('item_id')

In [11]:
# Write the matched plots to disk as CSV, leaving out the dataframe index
plots.to_csv(path_or_buf='data/movie_plots.csv', index=False)