In [1]:
import pandas as pd
import re
import ast
import numpy as np

from rapidfuzz import process, fuzz

In [2]:
wiki_movies = pd.read_csv("wikipedia_movie_franchises.csv", index_col = 0)
imdb_movies = pd.read_csv("imdb_movies_db.csv", index_col = 0)

In [3]:
wiki_movies = wiki_movies.dropna(axis=0, how='any')
wiki_movies.reset_index(inplace = True)
wiki_movies["release_year"] = wiki_movies["release_year"].apply(lambda x : int(x))

In [4]:
wiki_movies["movie_name"] = wiki_movies["movie_name"].apply(lambda x : x.strip())
wiki_movies["franchise_name"] = wiki_movies["franchise_name"].apply(lambda x : x.strip())

In [5]:
wiki_movies["movie_name"] = wiki_movies["movie_name"].apply(lambda x : x.replace("–", "-"))
wiki_movies["movie_name"] = wiki_movies["movie_name"].apply(lambda x : x.lower())

In [6]:
def lower_akas(row):
    if type(row) == str:
        row = row.lower()
        return row

In [7]:
imdb_movies["primaryTitle"] = imdb_movies["primaryTitle"].apply(lambda x : x.lower())
imdb_movies["originalTitle"] = imdb_movies["originalTitle"].apply(lambda x : x.lower())
imdb_movies["akas"] = imdb_movies["akas"].apply(lambda x : lower_akas(x))
imdb_movies["startYear"] = imdb_movies["startYear"].apply(lambda x : int(x) if x != "\\N" else np.nan)

In [8]:
wiki_filter = ["tconst", "titleType", "primaryTitle","originalTitle", "isAdult","startYear","endYear","runtimeMinutes","genres","averageRating","numVotes","akas","_merge", "all_names"]
imdb_filter = ["index", "franchise_name", "franchise_id", "movie_name", "release_year","_merge"]

In [9]:
imdb_copy = imdb_movies.copy()

In [10]:
imdb_copy

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
8,tt0000009,movie,miss jerry,miss jerry,0,1894.0,\N,45,Romance,5.3,205.0,"['miss jerry', 'fräulein jerry']"
144,tt0000147,movie,the corbett-fitzsimmons fight,the corbett-fitzsimmons fight,0,1897.0,\N,100,"Documentary,News,Sport",5.3,469.0,"['the corbett-fitzsimmons fight', 'бой корбетт..."
498,tt0000502,movie,bohemios,bohemios,0,1905.0,\N,100,\N,4.1,15.0,['bohemios']
570,tt0000574,movie,the story of the kelly gang,the story of the kelly gang,0,1906.0,\N,70,"Action,Adventure,Biography",6.0,823.0,"['the story of the kelly gang', 'kelly bandájá..."
587,tt0000591,movie,the prodigal son,l'enfant prodigue,0,1907.0,\N,90,Drama,4.4,20.0,"[""l'enfant prodigue"", 'the prodigal son']"
...,...,...,...,...,...,...,...,...,...,...,...,...
9724870,tt9916270,movie,il talento del calabrone,il talento del calabrone,0,2020.0,\N,84,Thriller,5.8,1414.0,"['il talento del calabrone', 'the talent of th..."
9724915,tt9916362,movie,coven,akelarre,0,2020.0,\N,92,"Drama,History",6.4,5293.0,"['boszorkánygyülekezet', 'coven', ""les sorcièr..."
9724947,tt9916428,movie,the secret of china,hong xing zhao yao zhong guo,0,2019.0,\N,\N,"Adventure,History,War",3.8,14.0,"['the secret of china', 'hong xing zhao yao zh..."
9724999,tt9916538,movie,kuambil lagi hatiku,kuambil lagi hatiku,0,2019.0,\N,123,Drama,8.6,7.0,['kuambil lagi hatiku']


In [11]:
def filter_df(df, merge, cols):
    remainder = df[df["_merge"]== merge]
    remainder = remainder.drop(cols, axis = 1)
    return remainder

In [12]:
def extract_akas(row):
    # convert string in list format to list
    if type(row) == str:
        akas = ast.literal_eval(row)
        return akas

In [13]:
imdb_copy["akas"] = imdb_copy["akas"].apply(lambda x : extract_akas(x))

In [14]:
imdb_copy['all_names'] = np.empty((len(imdb_copy), 0)).tolist()

In [15]:
imdb_copy = imdb_copy.reset_index(drop=True)

In [16]:
for i in range(0, len(list(imdb_copy["primaryTitle"]))):
    imdb_copy['all_names'][i].append(imdb_copy['primaryTitle'][i])
    imdb_copy['all_names'][i].append(imdb_copy['originalTitle'][i])
    
    if type(imdb_copy['akas'][i]) == list:
        for x in imdb_copy['akas'][i]:
            imdb_copy['all_names'][i].append(x)
        
        
imdb_copy

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names
0,tt0000009,movie,miss jerry,miss jerry,0,1894.0,\N,45,Romance,5.3,205.0,"[miss jerry, fräulein jerry]","[miss jerry, miss jerry, miss jerry, fräulein ..."
1,tt0000147,movie,the corbett-fitzsimmons fight,the corbett-fitzsimmons fight,0,1897.0,\N,100,"Documentary,News,Sport",5.3,469.0,"[the corbett-fitzsimmons fight, бой корбетта и...","[the corbett-fitzsimmons fight, the corbett-fi..."
2,tt0000502,movie,bohemios,bohemios,0,1905.0,\N,100,\N,4.1,15.0,[bohemios],"[bohemios, bohemios, bohemios]"
3,tt0000574,movie,the story of the kelly gang,the story of the kelly gang,0,1906.0,\N,70,"Action,Adventure,Biography",6.0,823.0,"[the story of the kelly gang, kelly bandájának...","[the story of the kelly gang, the story of the..."
4,tt0000591,movie,the prodigal son,l'enfant prodigue,0,1907.0,\N,90,Drama,4.4,20.0,"[l'enfant prodigue, the prodigal son]","[the prodigal son, l'enfant prodigue, l'enfant..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290234,tt9916270,movie,il talento del calabrone,il talento del calabrone,0,2020.0,\N,84,Thriller,5.8,1414.0,"[il talento del calabrone, the talent of the h...","[il talento del calabrone, il talento del cala..."
290235,tt9916362,movie,coven,akelarre,0,2020.0,\N,92,"Drama,History",6.4,5293.0,"[boszorkánygyülekezet, coven, les sorcières d'...","[coven, akelarre, boszorkánygyülekezet, coven,..."
290236,tt9916428,movie,the secret of china,hong xing zhao yao zhong guo,0,2019.0,\N,\N,"Adventure,History,War",3.8,14.0,"[the secret of china, hong xing zhao yao zhong...","[the secret of china, hong xing zhao yao zhong..."
290237,tt9916538,movie,kuambil lagi hatiku,kuambil lagi hatiku,0,2019.0,\N,123,Drama,8.6,7.0,[kuambil lagi hatiku],"[kuambil lagi hatiku, kuambil lagi hatiku, kua..."


In [17]:
imdb_copy = imdb_copy.explode("all_names")

In [18]:
all_merged = pd.merge(wiki_movies, imdb_copy, left_on=["movie_name", "release_year"], right_on=["all_names", "startYear"], how="outer", indicator = True)
wiki_remainder = filter_df(all_merged, "left_only", wiki_filter)
imdb_remainder = filter_df(all_merged, "right_only", imdb_filter)
all_merged[all_merged["_merge"]=="both"]

Unnamed: 0,index,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge
0,0.0,f0,The Aldrich Family,what a life,1939.0,tt0032123,movie,what a life,what a life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,93.0,"[what a life, a vida começa aos 14]",what a life,both
1,0.0,f0,The Aldrich Family,what a life,1939.0,tt0032123,movie,what a life,what a life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,93.0,"[what a life, a vida começa aos 14]",what a life,both
2,0.0,f0,The Aldrich Family,what a life,1939.0,tt0032123,movie,what a life,what a life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,93.0,"[what a life, a vida começa aos 14]",what a life,both
3,1.0,f0,The Aldrich Family,life with henry,1940.0,tt0033834,movie,life with henry,life with henry,0.0,1940.0,\N,80,"Comedy,Family,Music",6.0,49.0,"[life with henry, henry quería ir a alaska, he...",life with henry,both
4,1.0,f0,The Aldrich Family,life with henry,1940.0,tt0033834,movie,life with henry,life with henry,0.0,1940.0,\N,80,"Comedy,Family,Music",6.0,49.0,"[life with henry, henry quería ir a alaska, he...",life with henry,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16309,8163.0,f1343,Super Sentai,super hero taisen gp: kamen rider 3,2015.0,tt4282466,movie,super hero taisen gp: kamen rider 3,super hero taisen gp: kamen rider 3,0.0,2015.0,\N,95,Action,6.5,85.0,"[super hero taisen gp: kamen rider 3, スーパーヒーロー...",super hero taisen gp: kamen rider 3,both
16322,8176.0,f1343,Super Sentai,kishiryu sentai ryusoulger special chapter: me...,2021.0,tt13681618,movie,kishiryu sentai ryusoulger special chapter: me...,kishiryuu sentai ryuusoujâ tokubetsuhen: memor...,0.0,2021.0,\N,15,"Action,Fantasy",6.4,6.0,[kishiryuu sentai ryuusoujâ tokubetsuhen: memo...,kishiryu sentai ryusoulger special chapter: me...,both
16323,8176.0,f1343,Super Sentai,kishiryu sentai ryusoulger special chapter: me...,2021.0,tt13681618,movie,kishiryu sentai ryusoulger special chapter: me...,kishiryuu sentai ryuusoujâ tokubetsuhen: memor...,0.0,2021.0,\N,15,"Action,Fantasy",6.4,6.0,[kishiryuu sentai ryuusoujâ tokubetsuhen: memo...,kishiryu sentai ryusoulger special chapter: me...,both
16326,8179.0,f1343,Super Sentai,kaizoku sentai: ten gokaiger,2021.0,tt14879560,movie,kaizoku sentai: ten gokaiger,kaizoku sentai ten gôkaijâ,0.0,2021.0,\N,61,"Action,Adventure,Comedy",8.6,73.0,"[kaizoku sentai ten gokaiger, kaizoku sentai t...",kaizoku sentai: ten gokaiger,both


In [19]:
all_merged_both = all_merged[all_merged["_merge"]=="both"]
all_merged_both = all_merged_both.drop_duplicates(subset = ["index", "tconst"],keep = "first")
all_merged_both

Unnamed: 0,index,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge
0,0.0,f0,The Aldrich Family,what a life,1939.0,tt0032123,movie,what a life,what a life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,93.0,"[what a life, a vida começa aos 14]",what a life,both
3,1.0,f0,The Aldrich Family,life with henry,1940.0,tt0033834,movie,life with henry,life with henry,0.0,1940.0,\N,80,"Comedy,Family,Music",6.0,49.0,"[life with henry, henry quería ir a alaska, he...",life with henry,both
6,2.0,f0,The Aldrich Family,henry aldrich for president,1941.0,tt0033708,movie,henry aldrich for president,henry aldrich for president,0.0,1941.0,\N,75,"Comedy,Family",6.6,146.0,"[henry aldrich para presidente, henry aldrich ...",henry aldrich for president,both
9,3.0,f0,The Aldrich Family,"henry aldrich, editor",1942.0,tt0034842,movie,"henry aldrich, editor","henry aldrich, editor",0.0,1942.0,\N,72,"Comedy,Drama,Family",6.4,150.0,"[henry periodista, henry aldrich, editor]","henry aldrich, editor",both
12,4.0,f0,The Aldrich Family,henry and dizzy,1942.0,tt0034844,movie,henry and dizzy,henry and dizzy,0.0,1942.0,\N,71,"Comedy,Family",7.2,58.0,[henry and dizzy],henry and dizzy,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16299,8157.0,f1343,Super Sentai,kamen rider × super sentai × space sheriff: su...,2013.0,tt2632184,movie,kamen rider × super sentai × space sheriff: su...,kamen raidâ × sûpâ sentai × uchû keiji: supâ h...,0.0,2013.0,\N,92,"Action,Adventure,Comedy",6.3,129.0,[kamen raidâ × sûpâ sentai × uchû keiji: supâ ...,kamen rider × super sentai × space sheriff: su...,both
16305,8162.0,f1343,Super Sentai,ressha sentai toqger vs. kyoryuger: the movie,2015.0,tt4152148,movie,ressha sentai toqger vs. kyoryuger: the movie,ressha sentai tokkyûjâ tai kyôryûjâ za mûbî,0.0,2015.0,\N,64,Action,7.5,32.0,"[ressha sentai tokkyûjâ tai kyôryûjâ za mûbî, ...",ressha sentai toqger vs. kyoryuger: the movie,both
16307,8163.0,f1343,Super Sentai,super hero taisen gp: kamen rider 3,2015.0,tt4282466,movie,super hero taisen gp: kamen rider 3,super hero taisen gp: kamen rider 3,0.0,2015.0,\N,95,Action,6.5,85.0,"[super hero taisen gp: kamen rider 3, スーパーヒーロー...",super hero taisen gp: kamen rider 3,both
16322,8176.0,f1343,Super Sentai,kishiryu sentai ryusoulger special chapter: me...,2021.0,tt13681618,movie,kishiryu sentai ryusoulger special chapter: me...,kishiryuu sentai ryuusoujâ tokubetsuhen: memor...,0.0,2021.0,\N,15,"Action,Fantasy",6.4,6.0,[kishiryuu sentai ryuusoujâ tokubetsuhen: memo...,kishiryu sentai ryusoulger special chapter: me...,both


In [20]:
imdb_remainder = imdb_remainder[~imdb_remainder["tconst"].isin(all_merged_both["tconst"])]

In [21]:
no_year = pd.merge(wiki_remainder, imdb_remainder, left_on=["movie_name"], right_on=["all_names"], how="outer", indicator = True)
wiki_remainder = filter_df(all_merged, "left_only", wiki_filter)
imdb_remainder = filter_df(all_merged, "right_only", imdb_filter)
no_year

Unnamed: 0,index,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge
0,11.0,f1,Coffin Joe,at midnight i'll take your soul,1963.0,,,,,,,,,,,,,,left_only
1,15.0,f1,Coffin Joe,the end of man,1970.0,tt0067099,movie,finis hominis,finis hominis,0.0,1971.0,\N,79,"Comedy,Drama,Mystery",5.7,373.0,"[end of man, the end of man, finis hominis, en...",the end of man,both
2,26.0,f2,The Crime Club,the last express,1938.0,tt0037776,movie,the hidden eye,the hidden eye,0.0,1945.0,\N,69,"Action,Crime,Mystery",6.2,397.0,"[the hidden eye, perfume do oriente, l'oeil ca...",the last express,both
3,43.0,f3,Fast & Furious,fast x,2023.0,,,,,,,,,,,,,,left_only
4,48.0,f4,Gingerdead Man vs. Evil Bong,evil bong 3d: the wrath of bong,2011.0,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1842399,,,,,,tt9916538,movie,kuambil lagi hatiku,kuambil lagi hatiku,0.0,2019.0,\N,123,Drama,8.6,7.0,[kuambil lagi hatiku],kuambil lagi hatiku,right_only
1842400,,,,,,tt9916730,movie,6 gunn,6 gunn,0.0,2017.0,\N,116,\N,8.3,10.0,"[६ गुण, 6 gunn]",6 gunn,right_only
1842401,,,,,,tt9916730,movie,6 gunn,6 gunn,0.0,2017.0,\N,116,\N,8.3,10.0,"[६ गुण, 6 gunn]",6 gunn,right_only
1842402,,,,,,tt9916730,movie,6 gunn,6 gunn,0.0,2017.0,\N,116,\N,8.3,10.0,"[६ गुण, 6 gunn]",6 gunn,right_only


In [22]:
no_year_both = no_year[no_year["_merge"]=="both"]
no_year_both = no_year_both.drop_duplicates(subset = ["index", "tconst"],keep = "first")
no_year_both["tconst"].nunique()

927

In [23]:
no_year_both['difference_in_years'] = no_year_both.apply(lambda x: abs(x['startYear'] - x['release_year']), axis=1)
no_year_both = no_year_both[no_year_both["difference_in_years"] <= 10]
no_year_both["tconst"].nunique()

465

In [24]:
no_year_both.head(50)

Unnamed: 0,index,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge,difference_in_years
1,15.0,f1,Coffin Joe,the end of man,1970.0,tt0067099,movie,finis hominis,finis hominis,0.0,1971.0,\N,79,"Comedy,Drama,Mystery",5.7,373.0,"[end of man, the end of man, finis hominis, en...",the end of man,both,1.0
2,26.0,f2,The Crime Club,the last express,1938.0,tt0037776,movie,the hidden eye,the hidden eye,0.0,1945.0,\N,69,"Action,Crime,Mystery",6.2,397.0,"[the hidden eye, perfume do oriente, l'oeil ca...",the last express,both,7.0
73,185.0,f16,Young and Dangerous,those were the days,2000.0,tt0114146,movie,those were the days,le plus bel âge...,0.0,1995.0,\N,85,Drama,6.5,208.0,"[a mais bela idade, those were the days, najpi...",those were the days,both,5.0
75,185.0,f16,Young and Dangerous,those were the days,2000.0,tt0118165,movie,those were the days,wong gok dik tin hung 2: nam siu yee,0.0,1996.0,\N,89,"Action,Crime,Drama",5.1,17.0,"[those were the days, wong gok dik tin hung 2:...",those were the days,both,4.0
77,185.0,f16,Young and Dangerous,those were the days,2000.0,tt0186543,movie,si ge 32a he yi ge xiang jiao shao nian,si ge 32a he yi ge xiang jiao shao nian,0.0,1996.0,\N,101,Drama,6.6,32.0,"[si ge 32a he yi ge xiang jiao shao nian, thos...",those were the days,both,4.0
78,185.0,f16,Young and Dangerous,those were the days,2000.0,tt0285244,movie,those were the days,jing zhuang nan xiong nan di,0.0,1997.0,\N,103,Comedy,6.5,140.0,"[those were the days, 精裝難兄難弟, 精装难兄难弟, jing zhu...",those were the days,both,3.0
119,234.0,f20,Gamera,gamera the brave,2006.0,tt0467923,movie,gamera the brave,chiisaki yûsha-tachi: gamera,0.0,2005.0,\N,96,"Action,Adventure,Family",6.6,1298.0,"[гамера: маленькие герои, gamera: o genaios, 小...",gamera the brave,both,1.0
121,237.0,f21,The Hombre Lobo Series (a.k.a. The Waldemar Da...,los monstruos del terror,1969.0,tt0064687,movie,assignment terror,los monstruos del terror,0.0,1970.0,\N,87,"Horror,Sci-Fi",4.1,950.0,"[operación terror, dracula jagt frankenstein, ...",los monstruos del terror,both,1.0
123,238.0,f21,The Hombre Lobo Series (a.k.a. The Waldemar Da...,la furia del hombre lobo,1970.0,tt0065750,movie,fury of the wolfman,la furia del hombre lobo,0.0,1972.0,\N,86,Horror,3.7,875.0,"[varulven, ярость оборотня, fury of the wolfma...",la furia del hombre lobo,both,2.0
126,239.0,f21,The Hombre Lobo Series (a.k.a. The Waldemar Da...,la noche de walpurgis,1970.0,tt0066160,movie,the werewolf versus the vampire woman,la noche de walpurgis,0.0,1971.0,\N,86,Horror,5.3,1876.0,"[satan vs. the wolf man, la noche de walpurgis...",la noche de walpurgis,both,1.0


In [25]:
matched = pd.concat([all_merged_both, no_year_both])
matched = matched[matched["_merge"]=="both"]
matched.sort_values("numVotes", ascending = False).drop_duplicates(subset=["index", "franchise_id"], keep="first").sort_values(["index"])
matched

Unnamed: 0,index,franchise_id,franchise_name,movie_name,release_year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas,all_names,_merge,difference_in_years
0,0.0,f0,The Aldrich Family,what a life,1939.0,tt0032123,movie,what a life,what a life,0.0,1939.0,\N,75,"Comedy,Drama",6.9,93.0,"[what a life, a vida começa aos 14]",what a life,both,
3,1.0,f0,The Aldrich Family,life with henry,1940.0,tt0033834,movie,life with henry,life with henry,0.0,1940.0,\N,80,"Comedy,Family,Music",6.0,49.0,"[life with henry, henry quería ir a alaska, he...",life with henry,both,
6,2.0,f0,The Aldrich Family,henry aldrich for president,1941.0,tt0033708,movie,henry aldrich for president,henry aldrich for president,0.0,1941.0,\N,75,"Comedy,Family",6.6,146.0,"[henry aldrich para presidente, henry aldrich ...",henry aldrich for president,both,
9,3.0,f0,The Aldrich Family,"henry aldrich, editor",1942.0,tt0034842,movie,"henry aldrich, editor","henry aldrich, editor",0.0,1942.0,\N,72,"Comedy,Drama,Family",6.4,150.0,"[henry periodista, henry aldrich, editor]","henry aldrich, editor",both,
12,4.0,f0,The Aldrich Family,henry and dizzy,1942.0,tt0034844,movie,henry and dizzy,henry and dizzy,0.0,1942.0,\N,71,"Comedy,Family",7.2,58.0,[henry and dizzy],henry and dizzy,both,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4941,8069.0,f1342,Hopalong Cassidy (American-Western),sunset trail,1939.0,tt0023539,movie,sunset trail,sunset trail,0.0,1932.0,\N,62,"Action,Music,Romance",6.2,42.0,"[la fattoria maledetta, to monopati tou thanat...",sunset trail,both,7.0
4944,8069.0,f1342,Hopalong Cassidy (American-Western),sunset trail,1939.0,tt0030812,movie,sunset trail,sunset trail,0.0,1938.0,\N,69,Western,6.9,180.0,"[gentleman-cowboy, äventyret i silver city, ro...",sunset trail,both,1.0
4948,8093.0,f1342,Hopalong Cassidy (American-Western),lost canyon,1943.0,tt0034996,movie,lost canyon,lost canyon,0.0,1942.0,\N,61,Western,6.3,161.0,"[striden i dödsdalen, den sorte rytter, desfil...",lost canyon,both,1.0
4955,8103.0,f1342,Hopalong Cassidy (American-Western),fool's gold,1947.0,tt0038532,movie,fool's gold,fool's gold,0.0,1946.0,\N,63,"Drama,Western",6.1,140.0,"[överfallet på guldtransporten, twin buttes, r...",fool's gold,both,1.0


In [26]:
imdb_remainder = imdb_remainder[~imdb_remainder["tconst"].isin(all_merged_both["tconst"])]

In [27]:
def get_top_matches(row):
    return process.extract(row.movie_name, imdb_movie_list, limit = 3)

In [28]:
imdb_movie_list = imdb_remainder.all_names.to_list()

In [29]:
wiki_remainder['best_match'] = wiki_remainder.apply(lambda row: get_top_matches(row), axis = 1)

KeyboardInterrupt: 

In [None]:
matched = matched[["index", "franchise_id", "franchise_name", "movie_name", "tconst", "primaryTitle", "isAdult", "release_year", "runtimeMinutes", "genres", "averageRating", "numVotes"]]

In [None]:
matched.columns = matched.columns.str.replace('release_year', 'startYear')

In [None]:
matched["startYear"] = matched["startYear"].apply(lambda x : int(x))

In [None]:
imdb_final = pd.merge(matched, imdb_movies, on = "tconst", how = "outer")
imdb_final

In [None]:
imdb_final = imdb_final[["index", "franchise_id", "franchise_name", "movie_name", "tconst", "primaryTitle_y", "isAdult_y", "startYear_y", "runtimeMinutes_y", "genres_y", "averageRating_y", "numVotes_y"]]
imdb_final

In [None]:
imdb_final.columns = imdb_final.columns.str.replace('primaryTitle_y', 'primaryTitle')
imdb_final.columns = imdb_final.columns.str.replace('isAdult_y', 'isAdult')
imdb_final.columns = imdb_final.columns.str.replace('startYear_y', 'startYear')
imdb_final.columns = imdb_final.columns.str.replace('runtimeMinutes_y', 'runtimeMinutes')
imdb_final.columns = imdb_final.columns.str.replace('genres_y', 'genres')
imdb_final.columns = imdb_final.columns.str.replace('averageRating_y', 'averageRating')
imdb_final.columns = imdb_final.columns.str.replace('numVotes_y', 'numVotes')

In [None]:
imdb_final[imdb_final["primaryTitle"] == "iron man"]

In [None]:
wiki_remainder["franchise_name"].unique()

In [None]:
test = imdb_copy[imdb_copy["primaryTitle"].str.contains("harry potter and the deathly", na=False, flags=re.IGNORECASE, regex=True)]
test

In [None]:
"Wizarding World" 

In [None]:
wiki_remainder[wiki_remainder["franchise_name"] == "DC Extended Universe"]

In [None]:
imdb_final.to_csv("movies_with_franchises.csv")

# Unused code

In [None]:
list1 = list(matched["index"])
found_mask = []
for i in range(0, len(list(wiki_movies["index"]))):
    if i in list1:
        found_mask.append(True)
    else:
        found_mask.append(False)
    
#print(list1)
print(wiki_movies[found_mask])

In [None]:
# apply this mask function at each step of the matching process and see how many found rows there are

In [None]:
3450 + 4749

In [None]:
matched["index"].value_counts()

In [None]:
both_test = matched[matched["_merge"]=="both"].sort_values(by = "index")

In [None]:
dict(both_test["franchise_name"].value_counts())

In [None]:
# look into moving all names into one list, then searching with date
# With remainder, search without date and see what happens

In [None]:
# losing some values in both data frames for unknown reason when using merge.
# starts with 6405 rows in wiki and 290239 rows in imdb
# 3307 
#              wiki  |  imdb 
# start     |  6405  |  290239
# found     |  3088  |    3088
# remaining |  3307  |  287199
# total     |  6395  |  290287
# variance  |   -10  |     +48

### Checks for movies in dataframes

In [None]:
test = both_test[both_test["franchise_name"].str.contains("Edgar Wallace Mysteries", na=False, flags=re.IGNORECASE, regex=True)]
test.head(50)

In [None]:
test = wiki_movies[wiki_movies["franchise_name"].str.contains("Edgar Wallace Mysteries", na=False, flags=re.IGNORECASE, regex=True)]
test

In [None]:
test = imdb_movies[imdb_movies["primaryTitle"].str.contains("star wars", na=False, flags=re.IGNORECASE, regex=True)]
test

In [None]:
test = matched[matched["index"] == 5191.0]
test

In [None]:
test = imdb_copy[imdb_copy["tconst"].str.contains("tt5433140", na=False, flags=re.IGNORECASE, regex=True)]
test.head()


In [None]:
imdb_movies['titles'] = imdb_movies.apply(', '.join(imdb_movies.columns["primaryTitle", "originalTitle", "akas"]), axis = 1)
imdb_movies

In [None]:
def fuzzy_match(wiki, imdb, threshold):
    # iterate through each row in wiki
    # 
    # 
    # 
    # 
    # 
    # 
    # 

In [None]:
fuzzy_match(wiki_movies, imdb_movies, 95)

In [None]:
wiki_movies[wiki_movies["franchise_id"] == "f1312"]

In [None]:
imdb_movies[imdb_movies["tconst"] == "tt3262112"]