In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fuzzywuzzy import fuzz
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import ast
import json



In [2]:
# Read in data
# The .tsv/.txt inputs are tab-separated and are read with header=None, so the
# resulting frames start out with integer column labels.
character_metadata = pd.read_csv('./character.metadata.tsv', sep='\t', header=None)
movie_metadata = pd.read_csv('./movie.metadata.tsv', sep='\t', header=None)
name_clusters = pd.read_csv('./name.clusters.txt', sep='\t', header=None)
plot_summaries = pd.read_csv('./plot_summaries.txt', sep='\t', header=None)
tvtropes_orig = pd.read_csv('./tvtropes.clusters.txt', sep='\t', header=None)
# Load the tropes JSON inside a context manager so the file handle is closed
# as soon as parsing finishes instead of staying open for the kernel's lifetime.
with open('./tvtropes_20200302.json') as tropes_file:
    tvtropes_external = json.load(tropes_file)

In [3]:
# Give the header-less tables descriptive column names.
columns_movie_metadata = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]
columns_character_metadata = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_release_date',
    'Character_name',
    'Actor_DOB',
    'Actor_Gender',
    'Actor_Height',
    'Actor_Ethnicity',
    'Actor_Name',
    'Actor_Age_at_Movie_Release',
    'Freebase_Character_Actor_Map_ID',
    'Freebase_Character_ID',
    'Freebase_Actor_ID',
]

# Assigning to .columns relabels each frame in place; the list length must
# match the number of columns that were read.
plot_summaries.columns = ['Wikipedia_movie_ID', 'Plot_summary']
character_metadata.columns = columns_character_metadata
movie_metadata.columns = columns_movie_metadata

In [4]:
# Display the first rows of the movie table with the new column names.
movie_metadata.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [5]:
# Print the (rows, columns) size of the movie table.
print(movie_metadata.shape)

(81741, 9)


In [6]:
# Display the first rows of the character table with the new column names.
character_metadata.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Character_name,Actor_DOB,Actor_Gender,Actor_Height,Actor_Ethnicity,Actor_Name,Actor_Age_at_Movie_Release,Freebase_Character_Actor_Map_ID,Freebase_Character_ID,Freebase_Actor_ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [7]:
# The JSON maps each movie name to its list of tropes; keep the names on their own
# because later cells iterate over them.
movie_names_tropes = [*tvtropes_external]

# Flatten the mapping into (movie name, trope list) pairs ...
items = [(name, trope_list) for name, trope_list in tvtropes_external.items()]

# ... and turn those pairs into a two-column DataFrame, one row per movie.
movie_tropes_df = pd.DataFrame(data=items, columns=['Movie_name', 'Tropes'])

# Report the (rows, columns) size of the new frame.
print(movie_tropes_df.shape)

(12567, 2)


In [33]:
# Collect every distinct trope from the JSON file, in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order, which replaces the
# former `if trope not in tropes` list scan (quadratic in the number of tropes)
# with a single linear pass and yields the same list.
tropes = list(dict.fromkeys(
    trope
    for movie in movie_names_tropes
    for trope in tvtropes_external[movie]
))
print(len(tropes))

37317


In [8]:
# Display the first rows of the movie -> tropes table.
movie_tropes_df.head()

Unnamed: 0,Movie_name,Tropes
0,ABBATheMovie,"[ActuallyPrettyFunny, Adorkable, AlmostKiss, A..."
1,ABCsOfDeath2,"[AbusiveParents, AirVentPassageway, AllWomenAr..."
2,ABNKKBSNPLAko,"[ComingOfAgeStory, FilmsOf20102014, GayBestFri..."
3,ABeautifulDayInTheNeighborhood,"[AbusiveParents, ActorAllusion, AdultFear, Art..."
4,ABeautifulMind,"[AbsentMindedProfessor, AbsentmindedProfessor,..."


In [9]:
# Work on a copy so movie_metadata keeps its original titles. Spaces are removed
# from the copied titles; the names in movie_tropes_df are written without spaces.
movie_metadata_copy = movie_metadata.copy()
names_without_spaces = movie_metadata_copy['Movie_name'].str.replace(' ', '')
movie_metadata_copy['Movie_name'] = names_without_spaces

# Flag every row whose space-stripped title occurs more than once
# (keep=False marks all occurrences, not only the repeats).
is_repeated_name = movie_metadata_copy.duplicated(subset=['Movie_name'], keep=False)
duplicate_movie_names = movie_metadata_copy.loc[is_repeated_name]
duplicate_movie_names.head()


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
5,13696889,/m/03cfc81,TheGangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen..."
6,18998739,/m/04jcqvw,TheSorcerer'sApprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant..."
13,171005,/m/016ywb,HenryV,1989-11-08,10161099.0,137.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa..."
24,31983669,/m/0g4_n3m,RoadtoLife,1931-09-30,,104.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/05vz3zq"": ""Soviet Union""}","{""/m/07s9rl0"": ""Drama""}"
43,26878691,/m/0f400r,MysteriousIsland,1982,,100.0,"{""/m/0653m"": ""Standard Mandarin""}","{""/m/03h64"": ""Hong Kong""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/08322"":..."
...,...,...,...,...,...,...,...,...,...
81716,25919941,/m/0b6lqy1,TheKreutzerSonata,2008,,99.0,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/068d7h"": ""Romantic drama"", ""/m/02l7c8"": ""..."
81719,20244619,/m/04_0j2b,Mirage,1972,,82.0,"{""/m/06nm1"": ""Spanish Language""}","{""/m/016wzw"": ""Peru""}","{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World ci..."
81727,24209227,/m/07k5mlk,"TheTime,thePlaceandtheGirl",1946,,105.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/05p553"": ""Comedy film"", ""/m/02l7c8"": ""Rom..."
81733,23851782,/m/06_vb43,TheGhostTrain,1941-05-03,,82.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th..."


In [10]:
# Count how many space-stripped titles in movie_metadata_copy also occur as a
# movie name in the tvtropes table (True = present in both).
tropes_movie_names = movie_tropes_df['Movie_name']
test = movie_metadata_copy['Movie_name'].isin(tropes_movie_names)
test.value_counts()

False    74893
True      6848
Name: Movie_name, dtype: int64

In [16]:
# Remove ambiguous titles: keep=False drops *every* row whose space-stripped
# title occurs more than once, so only titles that are unique remain.
movie_no_dupl = movie_metadata_copy.drop_duplicates(subset=['Movie_name'], keep=False)
# NOTE(review): movie_metadata_copy is rebound to the same de-duplicated frame, so
# re-running an earlier cell that reads it (e.g. the overlap test) after this one
# gives different numbers. Kept as-is in case later cells rely on the rebinding.
movie_metadata_copy = movie_no_dupl

In [17]:
# Repeat the overlap test on the de-duplicated movies: how many of the remaining
# titles also occur as a movie name in the tvtropes table?
tropes_name_column = movie_tropes_df['Movie_name']
test2 = movie_no_dupl['Movie_name'].isin(tropes_name_column)
test2.value_counts()

False    66645
True      4429
Name: Movie_name, dtype: int64

In [18]:
# Attach the trope lists to the de-duplicated movies. The inner join on the
# space-stripped title keeps only movies that are present in both tables.
movie_metadata_tropes = movie_no_dupl.merge(movie_tropes_df, how='inner', on='Movie_name')
movie_metadata_tropes.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Tropes
0,77856,/m/0kcn7,MaryPoppins,1964-08-27,102272727.0,139.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...","[ADamWeSting, ADamWestInG, ANYTHINGBUTTHAT, AS..."
1,156558,/m/014k4y,BabyBoy,2001-06-27,29381649.0,123.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...","[AfricanAmericanMedia, AintTooProudToBeg, Amer..."
2,261237,/m/01mrrd,TheGodsMustBeCrazy,1980,34331783.0,109.0,"{""/m/0x82"": ""Afrikaans Language"", ""/m/02h40lc""...","{""/m/0hzlz"": ""South Africa""}","{""/m/03k9fj"": ""Adventure"", ""/m/03btsm8"": ""Acti...","[AcciDentalPerVert, AcciDentalPervert, Acciden..."
3,7364373,/m/0kv612,TheCoveredWagon,1923,3500000.0,23.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/0253g1"": ""Epic...","[AmericanFilms, BirthDeathJuxtaposition, CallT..."
4,12788657,/m/02x4zpv,TheBostonStrangler,1968,17810894.0,116.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/02n4kr"": ""My...","[FilmsOfThe1960s, SerialRapist]"


In [11]:
# Goal: check how many of the characters appear in each movie's plot summary.

# Step 1: pair every character row with the plot summary of its movie (inner join
# on 'Wikipedia_movie_ID'), then discard rows that lack a character name or a summary.
character_summaries = (
    pd.merge(character_metadata, plot_summaries, how='inner', on='Wikipedia_movie_ID')
    .dropna(subset=['Character_name', 'Plot_summary'])
)

In [35]:
# Collect every distinct trope that occurs in movie_metadata_tropes, in first-seen
# order. dict.fromkeys de-duplicates while preserving insertion order, replacing the
# former `if t not in tropes_in_movies` list scan (quadratic) with one linear pass
# that produces the same list.
tropes_in_movies = list(dict.fromkeys(
    t
    for trope in movie_metadata_tropes['Tropes']
    for t in trope
))


In [36]:
# Print the number of distinct tropes found in the merged movies.
print(len(tropes_in_movies))

32232


In [38]:
# Find every ordered pair of distinct movies that share two or more tropes.
# The trope sets and movie names are materialised once up front; previously both
# sets were rebuilt and the Series were label-indexed for every (i, j) pair.
# Rows are taken positionally, which matches the former label lookup on the
# default 0..n-1 index produced by the merge.
trope_sets = [set(movie_tropes) for movie_tropes in movie_metadata_tropes['Tropes']]
movie_names = movie_metadata_tropes['Movie_name'].tolist()

movies_with_common_tropes = []
for i, tropes_i in enumerate(trope_sets):
    for j, tropes_j in enumerate(trope_sets):
        # Both (A, B) and (B, A) are recorded, in the same order as before.
        if i != j and len(tropes_i & tropes_j) >= 2:
            movies_with_common_tropes.append((movie_names[i], movie_names[j]))


In [41]:
# Turn the list of (movie, movie) name pairs into a two-column DataFrame,
# one row per ordered pair.
movies_with_common_tropes_df = pd.DataFrame(movies_with_common_tropes, columns=['Movie1', 'Movie2'])

In [44]:
# Show (rows, columns); the row count is the number of ordered movie pairs found.
movies_with_common_tropes_df.shape

(8102832, 2)

In [45]:
# Find the 5 tropes that appear in the most movies.
# Start every known trope at 0 (keys keep the order of tropes_in_movies), then make
# a single pass over the movies. This replaces the former scan of every movie's
# list for every trope and produces the same counts.
tropes_count = dict.fromkeys(tropes_in_movies, 0)
for movie in movie_metadata_tropes['Tropes']:
    # set() so that a trope listed twice for one movie still counts that movie once.
    for trope in set(movie):
        # Only tropes listed in tropes_in_movies are counted, as before.
        if trope in tropes_count:
            tropes_count[trope] += 1

# sorted() is stable, so tropes with equal counts stay in first-seen order.
tropes_count_sorted = sorted(tropes_count.items(), key=lambda x: x[1], reverse=True)
tropes_count_sorted[:5]


[('AmericanFilms', 2805),
 ('ShoutOut', 1388),
 ('BoxOfficeBomb', 907),
 ('BigBad', 893),
 ('ChekhovsGun', 734)]

In [49]:
# Find the tropes that occur in exactly one movie, and the movies that contain them.
unique_tropes = [trope for trope in tropes_in_movies if tropes_count[trope] == 1]

# A set gives constant-time membership tests. `not isdisjoint(x)` is True exactly
# when the movie's trope list x contains at least one unique trope. The former
# `any(item for item in unique_tropes if item in x)` tested the truthiness of the
# trope name itself (so an empty-string trope would be missed) and rescanned the
# whole unique list for every movie.
unique_trope_set = set(unique_tropes)
has_unique_trope = movie_metadata_tropes['Tropes'].apply(
    lambda x: not unique_trope_set.isdisjoint(x)
)
unique_tropes_movies = movie_metadata_tropes[has_unique_trope]
unique_tropes_movies.head()


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Tropes
0,77856,/m/0kcn7,MaryPoppins,1964-08-27,102272727.0,139.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...","[ADamWeSting, ADamWestInG, ANYTHINGBUTTHAT, AS..."
2,261237,/m/01mrrd,TheGodsMustBeCrazy,1980,34331783.0,109.0,"{""/m/0x82"": ""Afrikaans Language"", ""/m/02h40lc""...","{""/m/0hzlz"": ""South Africa""}","{""/m/03k9fj"": ""Adventure"", ""/m/03btsm8"": ""Acti...","[AcciDentalPerVert, AcciDentalPervert, Acciden..."
7,336846,/m/01xlqd,Grease,1978-06-16,394589888.0,225.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hn10"": ""L...","[ADateWithRosiePalms, AbridgedForChildren, Acc..."
8,3033993,/m/08m1s2,BecomingJane,2007-03-02,37311672.0,112.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/017fp"": ""Biography"", ""/m/04xvh5"": ""Costum...","[BritishFilms, DidNotGetTheGirl, DoubleMeaning..."
10,1940449,/m/067p6m,RoboCop3,1993-11-05,10600000.0,105.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th...","[ATeamFIring, ATeamFiring, AbsurdlySharpBlade,..."


In [50]:
# Show (rows, columns); the row count is the number of movies with at least one unique trope.
unique_tropes_movies.shape

(1880, 10)