# Matching IMDB to Wikipedia

In [1]:
import pandas as pd
import re
import ast
import numpy as np
from rapidfuzz import process, fuzz

In [29]:
wiki_movies = pd.read_csv("clean_data//clean_wikipedia_franchises.csv", index_col = 0)
imdb_movies = pd.read_csv("raw_data//imdb_movies_db.csv", index_col = 0)

We need to change the year for imdb_movies so it's an integer. We'll first remove any without a value (signified by \n) and then convert to int

In [47]:
imdb_movies = imdb_movies[~(imdb_movies["startYear"] == "\\N")]
imdb_movies["startYear"] = imdb_movies["startYear"].apply(lambda x: int(x))

Convert the akas from a string to a list:

In [48]:
def extract_akas(row):
    # convert string in list format to list
    if type(row) == str:
        akas = ast.literal_eval(row)
        return akas

In [49]:
imdb_movies["akas"] = imdb_movies["akas"].apply(lambda x : extract_akas(x))

In [50]:
imdb_movies[imdb_movies["primaryTitle"] == imdb_movies["originalTitle"]]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance,5.3,204.0,,"[Fräulein Jerry, Miss Jerry]"
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport",5.3,469.0,,"[Бой Корбетта и Фитцсиммонса, The Corbett-Fitz..."
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N,4.1,15.0,,[Bohemios]
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",6.0,826.0,,"[Die Geschichte der Kelly Bande, Priča o Kelij..."
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,4.3,24.0,,[Robbery Under Arms]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9849562,tt9916160,movie,Drømmeland,Drømmeland,0,2019,\N,72,Documentary,6.3,51.0,,[Drømmeland]
9849575,tt9916190,movie,Safeguard,Safeguard,0,2020,\N,95,"Action,Adventure,Thriller",3.7,242.0,,[Safeguard]
9849614,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,\N,84,Thriller,5.8,1418.0,,"[Il talento del calabrone, Il calabrone, The t..."
9849743,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,\N,123,Drama,8.6,7.0,,[Kuambil Lagi Hatiku]


Combine all the names into a new column

In [51]:
def combine_names(row):
    tempList = []
    tempList.append(row["primaryTitle"])
    tempList.append(row["originalTitle"])
    if isinstance(row["akas"], list):
        for x in row["akas"]:
            tempList.append(x)
    return tempList

imdb_movies["all_names"] = imdb_movies.apply(combine_names, axis=1)


Remove any duplicates in the all_names column

In [52]:
imdb_movies["all_names"] = imdb_movies["all_names"].apply(lambda x: list(set(x)))

In [53]:
imdb_movies = imdb_movies.explode("all_names")

### Merge the dataframes

Now we're going to combine the dataframes in stages. Matching based on year and name first, then any with an exact name, not in the same year. We'll gradually reduce the dataframes in size by filtering to ensure we aren't matching ones that have already been matched.

These filters will ensure we're only keeping the columns we need

In [59]:
wiki_filter = ["tconst", "titleType", "primaryTitle","originalTitle", "isAdult","startYear","endYear","runtimeMinutes","genres","averageRating","numVotes","akas","_merge", "all_names"]
imdb_filter = ["franchise_name", "franchise_id", "movie_name", "release_year","_merge"]

# Adam, I need you to explain what's happening here. While I get it, I'm really struggling to formulate it into words in a way that makes sense

In [60]:
def filter_df(df, merge, cols):
    remainder = df[df["_merge"]== merge]
    remainder = remainder.drop(cols, axis = 1)
    return remainder

Combines the 

In [61]:
all_merged = pd.merge(wiki_movies, imdb_movies, left_on=["movie_name", "release_year"], right_on=["all_names", "startYear"], how="outer", indicator = True)
wiki_remainder = filter_df(all_merged, "left_only", wiki_filter)
imdb_remainder = filter_df(all_merged, "right_only", imdb_filter)
all_merged[all_merged["_merge"]=="both"]

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge
0,f0,The Aldrich Family,What a Life,1939.0,tt0032123,movie,What a Life,What a Life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,94.0,,What a Life,both
1,f0,The Aldrich Family,Life with Henry,1940.0,tt0033834,movie,Life with Henry,Life with Henry,0.0,1940.0,\N,80,"Comedy,Family,Music",6.0,49.0,,Life with Henry,both
2,f0,The Aldrich Family,Henry Aldrich for President,1941.0,tt0033708,movie,Henry Aldrich for President,Henry Aldrich for President,0.0,1941.0,\N,75,"Comedy,Family",6.6,146.0,,Henry Aldrich for President,both
3,f0,The Aldrich Family,"Henry Aldrich, Editor",1942.0,tt0034842,movie,"Henry Aldrich, Editor","Henry Aldrich, Editor",0.0,1942.0,\N,72,"Comedy,Drama,Family",6.4,151.0,,"Henry Aldrich, Editor",both
4,f0,The Aldrich Family,Henry and Dizzy,1942.0,tt0034844,movie,Henry and Dizzy,Henry and Dizzy,0.0,1942.0,\N,71,"Comedy,Family",7.2,58.0,,Henry and Dizzy,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8152,f1344,Super Sentai,Kamen Rider × Super Sentai × Space Sheriff: Su...,2013.0,tt2632184,movie,Kamen Rider × Super Sentai × Space Sheriff: Su...,Kamen Raidâ × Sûpâ Sentai × Uchû Keiji: Supâ H...,0.0,2013.0,\N,92,"Action,Adventure,Comedy",6.3,129.0,,Kamen Rider × Super Sentai × Space Sheriff: Su...,both
8157,f1344,Super Sentai,Ressha Sentai ToQger vs. Kyoryuger: The Movie,2015.0,tt4152148,movie,Ressha Sentai ToQger vs. Kyoryuger: The Movie,Ressha Sentai Tokkyûjâ tai Kyôryûjâ Za Mûbî,0.0,2015.0,\N,64,Action,7.5,32.0,,Ressha Sentai ToQger vs. Kyoryuger: The Movie,both
8158,f1344,Super Sentai,Super Hero Taisen GP: Kamen Rider 3,2015.0,tt4282466,movie,Super Hero Taisen GP: Kamen Rider 3,Super Hero Taisen GP: Kamen Rider 3,0.0,2015.0,\N,95,Action,6.5,86.0,,Super Hero Taisen GP: Kamen Rider 3,both
8171,f1344,Super Sentai,Kishiryu Sentai Ryusoulger Special Chapter: Me...,2021.0,tt13681618,movie,Kishiryu Sentai Ryusoulger Special Chapter: Me...,Kishiryuu Sentai Ryuusoujâ Tokubetsuhen: Memor...,0.0,2021.0,\N,15,"Action,Fantasy",6.4,6.0,,Kishiryu Sentai Ryusoulger Special Chapter: Me...,both


In [67]:
all_merged_both = all_merged[all_merged["_merge"]=="both"]
all_merged_both = all_merged_both.drop_duplicates(subset = "tconst",keep = "first")
all_merged_both

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge
0,f0,The Aldrich Family,What a Life,1939.0,tt0032123,movie,What a Life,What a Life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,94.0,,What a Life,both
1,f0,The Aldrich Family,Life with Henry,1940.0,tt0033834,movie,Life with Henry,Life with Henry,0.0,1940.0,\N,80,"Comedy,Family,Music",6.0,49.0,,Life with Henry,both
2,f0,The Aldrich Family,Henry Aldrich for President,1941.0,tt0033708,movie,Henry Aldrich for President,Henry Aldrich for President,0.0,1941.0,\N,75,"Comedy,Family",6.6,146.0,,Henry Aldrich for President,both
3,f0,The Aldrich Family,"Henry Aldrich, Editor",1942.0,tt0034842,movie,"Henry Aldrich, Editor","Henry Aldrich, Editor",0.0,1942.0,\N,72,"Comedy,Drama,Family",6.4,151.0,,"Henry Aldrich, Editor",both
4,f0,The Aldrich Family,Henry and Dizzy,1942.0,tt0034844,movie,Henry and Dizzy,Henry and Dizzy,0.0,1942.0,\N,71,"Comedy,Family",7.2,58.0,,Henry and Dizzy,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8152,f1344,Super Sentai,Kamen Rider × Super Sentai × Space Sheriff: Su...,2013.0,tt2632184,movie,Kamen Rider × Super Sentai × Space Sheriff: Su...,Kamen Raidâ × Sûpâ Sentai × Uchû Keiji: Supâ H...,0.0,2013.0,\N,92,"Action,Adventure,Comedy",6.3,129.0,,Kamen Rider × Super Sentai × Space Sheriff: Su...,both
8157,f1344,Super Sentai,Ressha Sentai ToQger vs. Kyoryuger: The Movie,2015.0,tt4152148,movie,Ressha Sentai ToQger vs. Kyoryuger: The Movie,Ressha Sentai Tokkyûjâ tai Kyôryûjâ Za Mûbî,0.0,2015.0,\N,64,Action,7.5,32.0,,Ressha Sentai ToQger vs. Kyoryuger: The Movie,both
8158,f1344,Super Sentai,Super Hero Taisen GP: Kamen Rider 3,2015.0,tt4282466,movie,Super Hero Taisen GP: Kamen Rider 3,Super Hero Taisen GP: Kamen Rider 3,0.0,2015.0,\N,95,Action,6.5,86.0,,Super Hero Taisen GP: Kamen Rider 3,both
8171,f1344,Super Sentai,Kishiryu Sentai Ryusoulger Special Chapter: Me...,2021.0,tt13681618,movie,Kishiryu Sentai Ryusoulger Special Chapter: Me...,Kishiryuu Sentai Ryuusoujâ Tokubetsuhen: Memor...,0.0,2021.0,\N,15,"Action,Fantasy",6.4,6.0,,Kishiryu Sentai Ryusoulger Special Chapter: Me...,both


In [None]:
Get any 

In [68]:
imdb_remainder = imdb_remainder[~imdb_remainder["tconst"].isin(all_merged_both["tconst"])]

In [69]:
no_year = pd.merge(wiki_remainder, imdb_remainder, left_on=["movie_name"], right_on=["all_names"], how="outer", indicator = True)
wiki_remainder = filter_df(all_merged, "left_only", wiki_filter)
imdb_remainder = filter_df(all_merged, "right_only", imdb_filter)
no_year

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge
0,f1,Coffin Joe,At Midnight I'll Take Your Soul,1963.0,,,,,,,,,,,,,,left_only
1,f1,Coffin Joe,This Night I'll Possess Your Corpse,1967.0,,,,,,,,,,,,,,left_only
2,f1,Coffin Joe,The End of Man,1970.0,,,,,,,,,,,,,,left_only
3,f1,Coffin Joe,The Bloody Exorcism of Coffin Joe,1974.0,,,,,,,,,,,,,,left_only
4,f1,Coffin Joe,Hellish Flesh,1977.0,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352999,,,,,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0.0,2020.0,\N,84,Thriller,5.8,1418.0,,Il talento del calabrone,right_only
353000,,,,,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0.0,2019.0,\N,\N,"Adventure,History,War",3.8,14.0,,Hong xing zhao yao Zhong guo,right_only
353001,,,,,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0.0,2019.0,\N,\N,"Adventure,History,War",3.8,14.0,,The Secret of China,right_only
353002,,,,,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0.0,2019.0,\N,123,Drama,8.6,7.0,,Kuambil Lagi Hatiku,right_only


In [71]:
no_year_both = no_year[no_year["_merge"]=="both"]
no_year_both = no_year_both.drop_duplicates(subset = "tconst",keep = "first")
no_year_both["tconst"].nunique()

674

In [72]:
no_year_both['difference_in_years'] = no_year_both.apply(lambda x: abs(x['startYear'] - x['release_year']), axis=1)
no_year_both = no_year_both[no_year_both["difference_in_years"] <= 10]
no_year_both["tconst"].nunique()

346

In [73]:
no_year_both.head(50)

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge,difference_in_years
77,f16,Young and Dangerous,Those Were the Days,2000.0,tt0114146,movie,Those Were the Days,Le plus bel âge...,0.0,1995.0,\N,85,Drama,6.5,209.0,,Those Were the Days,both,5.0
78,f16,Young and Dangerous,Those Were the Days,2000.0,tt0118165,movie,Those Were the Days,Wong Gok dik tin hung 2: Nam siu yee,0.0,1996.0,\N,89,"Action,Crime,Drama",5.1,17.0,,Those Were the Days,both,4.0
79,f16,Young and Dangerous,Those Were the Days,2000.0,tt0285244,movie,Those Were the Days,Jing zhuang nan xiong nan di,0.0,1997.0,\N,103,Comedy,6.5,142.0,,Those Were the Days,both,3.0
119,f20,Gamera,Gamera the Brave,2006.0,tt0467923,movie,Gamera the Brave,Chiisaki yûsha-tachi: Gamera,0.0,2005.0,\N,96,"Action,Adventure,Family",6.6,1305.0,,Gamera the Brave,both,1.0
132,f23,"L.E.T.H.A.L. Ladies (a.k.a. Triple-B, Bullets,...",Hard Hunted,1992.0,tt0104391,movie,Hard Hunted,Hard Hunted,0.0,1993.0,\N,97,"Action,Adventure,Crime",4.1,1256.0,,Hard Hunted,both,1.0
152,f27,"Signed, Sealed, Delivered",From Paris with Love,2015.0,tt1179034,movie,From Paris with Love,From Paris with Love,0.0,2010.0,\N,92,"Action,Crime,Thriller",6.4,119231.0,,From Paris with Love,both,5.0
154,f27,"Signed, Sealed, Delivered",Truth Be Told,2015.0,tt2190116,movie,Truth Be Told,Truth Be Told,0.0,2012.0,\N,105,"Biography,Documentary,Drama",8.0,25.0,,Truth Be Told,both,3.0
155,f27,"Signed, Sealed, Delivered",The Impossible Dream,2015.0,tt12194382,movie,The Impossible Dream,The Impossible Dream,0.0,2019.0,\N,90,Documentary,8.7,10.0,,The Impossible Dream,both,4.0
159,f27,"Signed, Sealed, Delivered",One in a Million,2016.0,tt13086344,movie,One in a Million,One in a Million,0.0,2022.0,\N,84,Documentary,7.5,12.0,,One in a Million,both,6.0
163,f27,"Signed, Sealed, Delivered",Higher Ground,2017.0,tt1562568,movie,Higher Ground,Higher Ground,0.0,2011.0,\N,109,Drama,6.2,3205.0,,Higher Ground,both,6.0


In [77]:
matched = pd.concat([all_merged_both, no_year_both])
matched = matched[matched["_merge"]=="both"]
matched.sort_values("numVotes", ascending = False).drop_duplicates(subset="franchise_id", keep="first")
matched

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge,difference_in_years
0,f0,The Aldrich Family,What a Life,1939.0,tt0032123,movie,What a Life,What a Life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,94.0,,What a Life,both,
1,f0,The Aldrich Family,Life with Henry,1940.0,tt0033834,movie,Life with Henry,Life with Henry,0.0,1940.0,\N,80,"Comedy,Family,Music",6.0,49.0,,Life with Henry,both,
2,f0,The Aldrich Family,Henry Aldrich for President,1941.0,tt0033708,movie,Henry Aldrich for President,Henry Aldrich for President,0.0,1941.0,\N,75,"Comedy,Family",6.6,146.0,,Henry Aldrich for President,both,
3,f0,The Aldrich Family,"Henry Aldrich, Editor",1942.0,tt0034842,movie,"Henry Aldrich, Editor","Henry Aldrich, Editor",0.0,1942.0,\N,72,"Comedy,Drama,Family",6.4,151.0,,"Henry Aldrich, Editor",both,
4,f0,The Aldrich Family,Henry and Dizzy,1942.0,tt0034844,movie,Henry and Dizzy,Henry and Dizzy,0.0,1942.0,\N,71,"Comedy,Family",7.2,58.0,,Henry and Dizzy,both,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4097,f1343,Hopalong Cassidy (American-Western),Sunset Trail,1939.0,tt0023539,movie,Sunset Trail,Sunset Trail,0.0,1932.0,\N,62,"Action,Music,Romance",6.2,42.0,,Sunset Trail,both,7.0
4098,f1343,Hopalong Cassidy (American-Western),Sunset Trail,1939.0,tt0030812,movie,Sunset Trail,Sunset Trail,0.0,1938.0,\N,69,"Drama,Western",6.9,182.0,,Sunset Trail,both,1.0
4100,f1343,Hopalong Cassidy (American-Western),Lost Canyon,1943.0,tt0034996,movie,Lost Canyon,Lost Canyon,0.0,1942.0,\N,61,"Drama,Western",6.2,163.0,,Lost Canyon,both,1.0
4102,f1343,Hopalong Cassidy (American-Western),Fool's Gold,1947.0,tt0038532,movie,Fool's Gold,Fool's Gold,0.0,1946.0,\N,63,"Drama,Western",6.1,140.0,,Fool's Gold,both,1.0


In [78]:
imdb_remainder = imdb_remainder[~imdb_remainder["tconst"].isin(all_merged_both["tconst"])]

In [82]:
def get_top_match(row):
    return process.extractOne(row.movie_name, imdb_movie_list)

In [80]:
imdb_movie_list = imdb_remainder.all_names.to_list()

In [83]:
wiki_remainder['best_match'] = wiki_remainder.apply(lambda row: get_top_match(row), axis = 1)

In [85]:
matched = matched[["franchise_id", "franchise_name", "movie_name", "tconst", "primaryTitle", "isAdult", "release_year", "runtimeMinutes", "genres", "averageRating", "numVotes"]]

In [86]:
matched

Unnamed: 0,franchise_id,franchise_name,movie_name,tconst,primaryTitle,isAdult,release_year,runtimeMinutes,genres,averageRating,numVotes
0,f0,The Aldrich Family,What a Life,tt0032123,What a Life,0.0,1939.0,75,"Comedy,Drama",6.9,94.0
1,f0,The Aldrich Family,Life with Henry,tt0033834,Life with Henry,0.0,1940.0,80,"Comedy,Family,Music",6.0,49.0
2,f0,The Aldrich Family,Henry Aldrich for President,tt0033708,Henry Aldrich for President,0.0,1941.0,75,"Comedy,Family",6.6,146.0
3,f0,The Aldrich Family,"Henry Aldrich, Editor",tt0034842,"Henry Aldrich, Editor",0.0,1942.0,72,"Comedy,Drama,Family",6.4,151.0
4,f0,The Aldrich Family,Henry and Dizzy,tt0034844,Henry and Dizzy,0.0,1942.0,71,"Comedy,Family",7.2,58.0
...,...,...,...,...,...,...,...,...,...,...,...
4097,f1343,Hopalong Cassidy (American-Western),Sunset Trail,tt0023539,Sunset Trail,0.0,1939.0,62,"Action,Music,Romance",6.2,42.0
4098,f1343,Hopalong Cassidy (American-Western),Sunset Trail,tt0030812,Sunset Trail,0.0,1939.0,69,"Drama,Western",6.9,182.0
4100,f1343,Hopalong Cassidy (American-Western),Lost Canyon,tt0034996,Lost Canyon,0.0,1943.0,61,"Drama,Western",6.2,163.0
4102,f1343,Hopalong Cassidy (American-Western),Fool's Gold,tt0038532,Fool's Gold,0.0,1947.0,63,"Drama,Western",6.1,140.0


In [87]:
wiki_remainder

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year,best_match
11,f1,Coffin Joe,At Midnight I'll Take Your Soul,1963.0,"(Midnight, 90.0, 7465)"
12,f1,Coffin Joe,This Night I'll Possess Your Corpse,1967.0,"(Posse, 90.0, 46642)"
15,f1,Coffin Joe,The End of Man,1970.0,"(The End of Men, 92.85714285714286, 244599)"
16,f1,Coffin Joe,The Bloody Exorcism of Coffin Joe,1974.0,"(Blood, 90.0, 44667)"
18,f1,Coffin Joe,Hellish Flesh,1977.0,"(Flesh, 90.0, 5828)"
...,...,...,...,...,...
8175,f1344,Super Sentai,Kikai Sentai Zenkaiger vs. Kiramager vs. Senpa...,2022.0,(Kikai Sentai Zenkaiger vs Kiramager vs Senpai...
8176,f1344,Super Sentai,Avataro Sentai Donbrothers The Movie: New Firs...,2022.0,"(Brothers, 90.0, 4072)"
8177,f1344,Super Sentai,Ninpu Sentai Hurricaneger Degozaru! Shushuuto ...,2023.0,"(Hurricane, 90.0, 4688)"
8178,f1344,Super Sentai,Avataro Sentai Donbrothers vs. Zenkaiger,2023.0,"(Brothers, 90.0, 4072)"


In [None]:
matched.columns = matched.columns.str.replace('release_year', 'startYear')

In [None]:
imdb_final = pd.merge(matched, imdb_movies, on = "tconst", how = "outer")
imdb_final

In [None]:
imdb_final = imdb_final[["index", "franchise_id", "franchise_name", "movie_name", "tconst", "primaryTitle_y", "isAdult_y", "startYear_y", "runtimeMinutes_y", "genres_y", "averageRating_y", "numVotes_y"]]
imdb_final

In [None]:
imdb_final.columns = imdb_final.columns.str.replace('primaryTitle_y', 'primaryTitle')
imdb_final.columns = imdb_final.columns.str.replace('isAdult_y', 'isAdult')
imdb_final.columns = imdb_final.columns.str.replace('startYear_y', 'startYear')
imdb_final.columns = imdb_final.columns.str.replace('runtimeMinutes_y', 'runtimeMinutes')
imdb_final.columns = imdb_final.columns.str.replace('genres_y', 'genres')
imdb_final.columns = imdb_final.columns.str.replace('averageRating_y', 'averageRating')
imdb_final.columns = imdb_final.columns.str.replace('numVotes_y', 'numVotes')

In [None]:
wiki_remainder["franchise_name"].unique()

In [None]:
imdb_final.to_csv("movies_with_franchises.csv")

In [None]:
imdb_final["franchise_id"].nunique()

# Test Code

In [None]:
imdb_final[imdb_final["primaryTitle"] == "iron man"]

In [None]:
wiki_remainder[wiki_remainder["franchise_name"] == "DC Extended Universe"]

In [None]:
test = imdb_remainder.sort_values("numVotes", ascending = False).drop_duplicates(subset=["tconst"], keep="first").sort_values(["tconst"])
test[test["numVotes"] >292]

In [None]:
imdb_final[imdb_final["franchise_id"].isnull]

In [None]:
test = pd.isnull(imdb_final["franchise_id"])
imdb_final[test]["numVotes"].describe()

In [None]:
test = pd.notnull(imdb_final["franchise_id"])
imdb_final[test]["numVotes"].describe()

# Unused code

In [None]:
# apply this mask function at each step of the matching process and see how many found rows there are
list1 = list(matched["index"])
found_mask = []
for i in range(0, len(list(wiki_movies["index"]))):
    if i in list1:
        found_mask.append(True)
    else:
        found_mask.append(False)
    
#print(list1)
print(wiki_movies[found_mask])

In [None]:
both_test = matched[matched["_merge"]=="both"].sort_values(by = "index")

In [None]:
# look into moving all names into one list, then searching with date
# With remainder, search without date and see what happens

In [None]:
# losing some values in both data frames for unknown reason when using merge.
# starts with 6405 rows in wiki and 290239 rows in imdb
# 3307 
#              wiki  |  imdb 
# start     |  6405  |  290239
# found     |  3088  |    3088
# remaining |  3307  |  287199
# total     |  6395  |  290287
# variance  |   -10  |     +48