Import the necessary libraries

In [1]:
import pandas as pd
import ast # used to convert strings to lists

First, read in the imdb data and the giantbomb data as dataframes and get a count

In [2]:
imdb_games = pd.read_csv("imdb_games_db.csv", index_col = 0)
giantbomb_games = pd.read_csv("clean_giantbomb_games_db.csv", index_col = 0)

In [4]:
print("Total imdb_games: {} \nTotal giantbomb_games: {}".format(imdb_games.shape[0], giantbomb_games.shape[0]))

Total imdb_games: 14435 
Total giantbomb_games: 35117


We'll remove any exact duplicates from both dataframes

In [5]:
imdb_games = imdb_games.drop_duplicates()
giantbomb_games = giantbomb_games.drop_duplicates()
print("Total imdb_games: {} \nTotal giantbomb_games: {}".format(imdb_games.shape[0], giantbomb_games.shape[0]))

Total imdb_games: 14435 
Total giantbomb_games: 35116


Have a look at what our dataframes contain:

In [7]:
imdb_games.sample()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
2991931,tt1369557,videoGame,Race,Race,0,2006,\N,\N,Sport,7.0,5.0,"['Race: Official WTCC Game', 'Race']"


In [8]:
giantbomb_games.sample()

Unnamed: 0,franchise_id,game_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
32511,3025-783,15342,Xanadu: Dragon Slayer II,,Xanadu: Dragon Slayer 2,['Nihon Falcom Corp.'],"['Role-Playing', 'Platformer', 'Action-Adventu...","['MSX', 'PC', 'NEC PC-8801', 'NEC PC-9801', 'S...",['Nihon Falcom Corp.'],


### Remove imdb duplicates

We want to remove any duplicates from our imdb data that contain the same name and year. We save the one with the highest number of votes

In [11]:
imdb_games = imdb_games.sort_values(by=["primaryTitle", "startYear", "numVotes"])
imdb_games = imdb_games.drop_duplicates(subset=["primaryTitle", "startYear"], keep="last")
imdb_games.tconst.count()

14384

In [12]:
imdb_games.primaryTitle[imdb_games.duplicated(["primaryTitle", "startYear"], keep=False)].count()

0

### Remove giantbomb duplicates

As each game_id is unique, we want to group all the franchises for each game into a single result for each game_id. First though we check whether any of the the rows have the same game_id but a different name or release_year

In [13]:
giantbomb_games.count()

franchise_id    35116
game_id         35116
name            35116
release_year     5435
aliases          8088
developers      32746
genres          33660
platforms       34802
publishers      33604
rating          14184
dtype: int64

In [14]:
print(giantbomb_games.groupby("game_id").filter(lambda x: x["name"].nunique() > 1 or x["release_year"].nunique() > 1)["game_id"].count())

0


The next thing will be to remove any franchises which only contain 1 or 2 entries. Though they may technically count as a franchise, they do nothing to help us with analysis on the basis of longevity so we'll treat them as if they don't belong in a franchise

In [15]:
giantbomb_games.game_id.count()

35116

In [16]:
franchise_counts = giantbomb_games.groupby('franchise_id').size()
franchise_dict = {k:v for k,v in franchise_counts.items() if v < 3}
franchise_ids_to_drop = list(franchise_dict.keys())

mask = giantbomb_games['franchise_id'].isin(franchise_ids_to_drop)
giantbomb_games = giantbomb_games.drop(index=giantbomb_games[mask].index)

In [17]:
giantbomb_games.count()

franchise_id    31529
game_id         31529
name            31529
release_year     4668
aliases          7423
developers      29402
genres          30205
platforms       31251
publishers      30282
rating          13149
dtype: int64

The first thing to do is to combine all the games that have an identical franchise_id, release_year and name. With this criteria we will assume they are all the same game released on different platforms for example. We want to consider the release_year even when it is a nan value, so we convert all nan values to 0 accommodate this. Once we've done this, we group by game_id to combine the franchises for each individual game

In [18]:
giantbomb_games = giantbomb_games.groupby(["name", "franchise_id", giantbomb_games["release_year"].fillna(0)]).agg({
    "aliases": lambda x: list(set(x)),
    "developers": lambda x: list(set(x)),
    "genres": lambda x: list(set(x)),
    "platforms": lambda x: list(set(x)),
    "publishers": lambda x: list(set(x)),
    "rating": lambda x: list(set(x)),
    "game_id": "first"
}).reset_index().drop_duplicates(["game_id", "name", "franchise_id", "release_year"], keep="first")

giantbomb_games = giantbomb_games.groupby('game_id').agg({
    'franchise_id': lambda x: list(x),
    'name': 'first',
    'release_year': 'first',
    'aliases': 'first',
    'developers': 'first',
    'genres': 'first',
    "platforms": "first",
    "publishers" : "first",
    "rating": "first"
})

# Change year to string to match imdb database
giantbomb_games["release_year"] = giantbomb_games["release_year"].apply(lambda x : str(x).split('.')[0])

In [19]:
giantbomb_games.sort_values("game_id")

Unnamed: 0_level_0,franchise_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,"[3025-143, 3025-2573]",Desert Strike: Return to the Gulf,1992,[Desert Strike Advance],"[['Electronic Arts', 'Visual Concepts', 'Budca...","[['Action', ""Shoot 'Em Up""]]","[['Amiga', 'Game Boy', 'Game Boy Advance', 'Ga...","[['Electronic Arts', 'Domark Software', 'Malib...","[['ESRB: K-A', 'ESRB: E']]"
3,[3025-2060],Hyperballoid Deluxe: Survival Pack,0,[nan],[['Kernel Kaput']],"[['Action', 'Block-Breaking']]",[['PC']],"[['Alawar Entertainment, Inc.']]",[nan]
4,[3025-1110],The Chessmaster 2000,1986,[nan],"[['The Software Toolworks', 'Software Country'...","[['Strategy', 'Trivia/Board Game']]","[['Amiga', 'Amstrad CPC', 'Apple II', 'Atari S...","[['The Software Toolworks', 'Software Country'...",[nan]
6,"[3025-128, 3025-130, 3025-1372]",WWE SmackDown! vs. RAW 2007,0,[SVR 2007],"[[""Yuke's Co. Ltd."", 'Digital Hearts Co., Ltd....","[['Action', 'Sports', 'Wrestling']]","[['PlayStation Portable', 'PlayStation 2', 'Xb...",[['THQ']],"[['ESRB: T', 'PEGI: 16+', 'CERO: C']]"
8,[3025-335],Super Spy Hunter,0,[Battle Formula],[['Tokai Engineering']],[['Vehicular Combat']],[['Nintendo Entertainment System']],[['Sunsoft']],[nan]
...,...,...,...,...,...,...,...,...,...
88822,[3025-2291],Winning Post 10,0,[nan],[nan],[['Driving/Racing']],"[['PC', 'PlayStation 4', 'Nintendo Switch', 'P...",[['Koei Tecmo']],[['CERO: A']]
88824,[3025-5696],Wan Nyan Dōbutsu Byōin,0,[nan],[nan],[['Simulation']],[['Game Boy Advance']],[['TDK Core']],[['CERO: All Ages']]
88831,[3025-5699],Shogi Saikyou: Pro ni Manabu,0,[nan],[['Magical Company']],[['Trivia/Board Game']],[['PlayStation']],[['Magical Company']],[nan]
88834,[3025-383],The Murder of Sonic the Hedgehog,0,[nan],[['Sega']],[['Adventure']],"[['Mac', 'PC']]",[['Sega']],[nan]


We can now get a record of the number of duplicate names in the data

In [20]:
giantbomb_games.name[giantbomb_games.duplicated(["name"], keep=False)].count()

560

We can add the criteria to distinguish by both name and release year, which returns the following:

In [21]:
giantbomb_games.name[giantbomb_games.duplicated(["name", "release_year"], keep=False)].count()

72

We'll combine any duplicates in giantbomb which have the same name and release year

In [41]:
# group by name and release year, aggregate the franchise_id lists
giantbomb_games = giantbomb_games.groupby(['name', 'release_year'], as_index=False).agg({
    'franchise_id': lambda x: list(set([i for l in x for i in l])),
    'name': 'first',
    'release_year': 'first',
    "aliases": 'first',
    "developers": lambda x: list(set([i for l in x for i in l])),
    "genres": lambda x: list(set([i for l in x for i in l])),
    "platforms": lambda x: list(set([i for l in x for i in l])),
    "publishers" : lambda x: list(set([i for l in x for i in l])),
    "rating": lambda x: list(set([i for l in x for i in l]))
})

The release year needs to be converted to a string so we can compare it to the imdb database. We also need to tmove trailing decimal points.

In [43]:
giantbomb_games["release_year"] = giantbomb_games["release_year"].apply(lambda x : str(x).split('.')[0])

Reset the index so we have access to the game_ids

In [44]:
giantbomb_games = giantbomb_games.reset_index()

Unfortunately, at this stage, this is the furthest we can go for dealing with duplicates. The imdb datasets do not provide any additional information that can be corroborated across the two dataframes. As such there is no way to distinguish between a duplicate in the imdb dataset if the name and year are not unique in the giantbomb dataset. From this point, we now need to remove any duplicates in the imdb dataset, where there is not a clear distinction within the giantbomb dataset. The first thing I will do is isolate the duplicates. Then drop from the duplicates dataframe anywhere we can clearly identify which version of a game that duplicate is. Once we've finished we'll be left with a small dataframe of duplicates which we have tried all methods to identify the original game and failed. Every record left in the small dataframe will be dropped from our full imdb dataframe

In [45]:
imdb_games

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
7212109,tt4354918,videoGame,#IDARB (It Draws a Red Box),#IDARB (It Draws a Red Box),0,2015,\N,\N,Action,5.8,25.0,['#IDARB (It Draws a Red Box)']
5075818,tt1968978,videoGame,'88 Games,Hyper Sports Special,0,1988,\N,\N,Sport,6.0,14.0,"[""Track & Field '88"", 'Hyper Sports Special', ..."
367403,tt0383279,videoGame,"'Goodbye, Galaxy!' Episode IV: Secret of the O...","'Goodbye, Galaxy!' Episode IV: Secret of the O...",0,1991,\N,\N,"Action,Adventure,Sci-Fi",7.7,102.0,"[""'Goodbye, Galaxy!' Episode IV: Secret of the..."
367404,tt0383280,videoGame,"'Goodbye, Galaxy!' Episode V: The Armageddon M...","'Goodbye, Galaxy!' Episode V: The Armageddon M...",0,1991,\N,\N,"Action,Adventure,Sci-Fi",7.4,82.0,"[""'Goodbye, Galaxy!' Episode V: The Armageddon..."
4983813,tt1918633,videoGame,.detuned,.detuned,0,2009,\N,\N,Music,2.3,28.0,"['.detuned', 'detuned: Gumi senpai no fushigi ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
8014707,tt6169512,videoGame,osu!,osu!,0,2007,\N,\N,"Music,Musical",7.4,107.0,['osu!']
295707,tt0308989,videoGame,ssn,ssn,0,1996,\N,\N,\N,7.9,13.0,['ssn']
770432,tt0795512,videoGame,Æon Flux,Æon Flux,0,2005,\N,\N,"Action,Adventure,Sci-Fi",5.6,265.0,['Æon Flux']
2185219,tt1219283,videoGame,Îhatôvo monogatari,Îhatôvo monogatari,0,1993,\N,\N,Adventure,7.0,6.0,"['Ihatovo Story', 'Îhatôvo monogatari']"


In [49]:
imdb_duplicates = imdb_games[imdb_games.duplicated(["primaryTitle"], keep=False)]

In [51]:
imdb_duplicates

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
5813970,tt2273075,videoGame,Action 52,Action 52,0,1991,\N,\N,"Action,Family,Fantasy",1.3,141.0,['Action 52']
7523132,tt5058272,videoGame,Action 52,Action 52,0,1993,\N,\N,\N,3.1,28.0,['Action 52']
8286665,tt6777472,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 1, The Search for the ...",0,2009,\N,\N,\N,5.8,15.0,"[""Adam's Venture: Episode 1, The Search for th..."
8302190,tt6813690,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 2, Solomon's Secret",0,2011,\N,\N,\N,7.5,9.0,"[""Adam's Venture: Episode 2, Solomon's Secret""..."
8302192,tt6813694,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 3, Revelations",0,2012,\N,\N,\N,5.6,9.0,"[""Adam's Venture: Origins"", ""Adam's Venture: E..."
...,...,...,...,...,...,...,...,...,...,...,...,...
5475671,tt2132358,videoGame,Zombi,ZombiU,0,2012,\N,\N,"Action,Adventure,Horror",6.3,259.0,"['ZombiU', 'Zombi', 'Killer Freaks from Outer ..."
249553,tt0260590,videoGame,Zoo Keeper,Zoo Keeper,0,1983,\N,\N,Family,7.0,21.0,"['Zoo Keeper', 'King Crab']"
436868,tt0454991,videoGame,Zoo Keeper,Zoo Keeper,0,2004,\N,\N,Action,6.1,22.0,['Zoo Keeper']
301459,tt0314957,videoGame,Zoo Tycoon,Zoo Tycoon,0,2001,\N,\N,Action,7.4,279.0,['Zoo Tycoon']


Remove any duplicate names we have a direct match for name and year.

In [48]:
matching_imdb_dups = pd.merge(imdb_games, giantbomb_games, left_on=['primaryTitle', 'startYear'], right_on=['name', 'release_year'], how='left')
matching_imdb_dups

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,averageRating,...,index,franchise_id,name,release_year,aliases,developers,genres_y,platforms,publishers,rating
0,tt4354918,videoGame,#IDARB (It Draws a Red Box),#IDARB (It Draws a Red Box),0,2015,\N,\N,Action,5.8,...,,,,,,,,,,
1,tt1968978,videoGame,'88 Games,Hyper Sports Special,0,1988,\N,\N,Sport,6.0,...,26.0,[3025-1053],'88 Games,1988,[Konami '88\r\nHyper Sports Special],[['Konami']],[['Track & Field']],[['Arcade']],[['Konami']],[nan]
2,tt0383279,videoGame,"'Goodbye, Galaxy!' Episode IV: Secret of the O...","'Goodbye, Galaxy!' Episode IV: Secret of the O...",0,1991,\N,\N,"Action,Adventure,Sci-Fi",7.7,...,,,,,,,,,,
3,tt0383280,videoGame,"'Goodbye, Galaxy!' Episode V: The Armageddon M...","'Goodbye, Galaxy!' Episode V: The Armageddon M...",0,1991,\N,\N,"Action,Adventure,Sci-Fi",7.4,...,,,,,,,,,,
4,tt1918633,videoGame,.detuned,.detuned,0,2009,\N,\N,Music,2.3,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14379,tt6169512,videoGame,osu!,osu!,0,2007,\N,\N,"Music,Musical",7.4,...,,,,,,,,,,
14380,tt0308989,videoGame,ssn,ssn,0,1996,\N,\N,\N,7.9,...,,,,,,,,,,
14381,tt0795512,videoGame,Æon Flux,Æon Flux,0,2005,\N,\N,"Action,Adventure,Sci-Fi",5.6,...,,,,,,,,,,
14382,tt1219283,videoGame,Îhatôvo monogatari,Îhatôvo monogatari,0,1993,\N,\N,Adventure,7.0,...,,,,,,,,,,


In [None]:
tempMask = ~imdb_duplicates['tconst'].isin(matching_imdb_dups['tconst'])
imdb_duplicates = imdb_duplicates[tempMask]
imdb_duplicates.tconst.count()

Remove any duplicates from our giantbomb database where we have no record of the year of release

In [None]:
gb_duplicate_names = giantbomb_games.duplicated(subset='name', keep=False)
release_year_int = giantbomb_games['release_year'].astype(int).eq(0)
gb_duplicates = giantbomb_games.loc[gb_duplicate_names & release_year_int]
#Having got all the duplicates with no release year, we'll drop them from the dataframe
giantbomb_games = giantbomb_games[~giantbomb_games.index.isin(gb_duplicates.index)]

In [None]:
imdb_games = imdb_games[~imdb_games['tconst'].isin(imdb_duplicates['tconst'])]
imdb_games.tconst.count()

This leaves me with 13866 games in the imdb dataset to play with. The next step is to combine the imdb dataset with the giantbomb

In [None]:
giantbomb_games.name.count()

In [None]:
giantbomb_games.name.nunique()

The way I am going to do this is to first combine the rows where the year and name matches. Then combine any remaining rows where the name matches but the year doesn't match.

In [6]:
giantbomb_games[giantbomb_games["game_id"] == 48320]

Unnamed: 0,franchise_id,game_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
17033,3025-331,48320,Street Fighter V,,Street Fighter 5\r\nSFV\r\nSF5,"['Capcom', 'Dimps Corporation']",['Fighting'],"['Arcade', 'PC', 'PlayStation 4']","['Capcom', 'Sony Interactive Entertainment Ame...","['ESRB: T', 'PEGI: 12+']"


In [None]:
copy_of_imdb_games = imdb_games
copy_of_giantbomb_games = giantbomb_games
print("Total imdb:{}, Total giantbomb:{}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count()))

First get all the games that have a matching title and year. Merge them and remove from the original databases so they can't be used again.

In [None]:
mergedDF = pd.DataFrame()

In [None]:
mergedDF = pd.merge(copy_of_imdb_games, copy_of_giantbomb_games, left_on=["primaryTitle", "startYear"], right_on=["name", "release_year"], how="inner")
print("Total imdb: {}, Total giantbomb: {}, Total in df: {}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count(), mergedDF.tconst.count()))

Now that we've matched the title and years, we'll drop any duplicates which contain years from the giantbomb data.

In [None]:
copy_of_giantbomb_games = copy_of_giantbomb_games.drop_duplicates("name", keep=False)

In [None]:
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])

In [None]:
copied_imdb_mask.value_counts()

Now we have no duplicate names, we can attempt to match on just name, first with primaryTitle, then originalTitle

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["primaryTitle"], right_on=["name"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)

In [None]:
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])

In [None]:
copied_imdb_mask.value_counts()

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["originalTitle"], right_on=["name"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)

In [None]:
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])

In [None]:
copied_imdb_mask.value_counts()

In [None]:
copied_giantbomb_mask.value_counts()

Now, check every possible alias in the giantbomb dataframe for a direct match.

In [None]:
#Explode the aliases
copy_of_giantbomb_games = copy_of_giantbomb_games.explode("aliases")

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["primaryTitle"], right_on=["aliases"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)

In [None]:
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])

In [None]:
copied_imdb_mask.value_counts()

In [None]:
copied_giantbomb_mask.value_counts()

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["originalTitle"], right_on=["aliases"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)

In [None]:
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])

In [None]:
copied_imdb_mask.value_counts()

In [None]:
copied_giantbomb_mask.value_counts()

In [None]:

copy_of_imdb_games['primaryTitle'] = copy_of_imdb_games['primaryTitle'].str.lower()
copy_of_imdb_games['originalTitle'] = copy_of_imdb_games['originalTitle'].str.lower()

copy_of_giantbomb_games['name'] = copy_of_giantbomb_games['name'].str.lower()
copy_of_giantbomb_games['aliases'] = copy_of_giantbomb_games['aliases'].str.lower()

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["primaryTitle"], right_on=["name"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["originalTitle"], right_on=["name"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])
print(copied_imdb_mask.value_counts())

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["primaryTitle"], right_on=["aliases"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])
print(copied_imdb_mask.value_counts())

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["originalTitle"], right_on=["aliases"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])
print(copied_imdb_mask.value_counts())

In [None]:
import re
temp = mergedDF[mergedDF["name"].str.contains("Sherlock", na=False, flags=re.IGNORECASE, regex=True)]
temp

In [None]:
#Explode the akas

copy_of_imdb_games = copy_of_imdb_games.explode("akas")

In [None]:
copy_of_imdb_games['akas'] = copy_of_imdb_games['akas'].str.lower()

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["akas"], right_on=["name"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])
print(copied_imdb_mask.value_counts())

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games[~copied_imdb_mask], copy_of_giantbomb_games[~copied_giantbomb_mask], left_on=["akas"], right_on=["aliases"], how="inner")
mergedDF = pd.concat([mergedDF, mergedDF2], axis=0)
copied_imdb_mask = copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])
copied_giantbomb_mask = copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])
print(copied_imdb_mask.value_counts())

Create a function that combines all the possible names including akas, into a list foe each imdb entry

In [None]:
nonmatched_imdb = copy_of_imdb_games[~copied_imdb_mask]
nonmatched_gb = copy_of_giantbomb_games[~copied_giantbomb_mask]

In [None]:
def imdb_all_names(imdb_row):
    imdb_row_list = [imdb_row.primaryTitle, imdb_row.originalTitle]
    if type(imdb_row.akas) ==str:
        akas_list = ast.literal_eval(imdb_row.akas)
        for x in akas_list:
            imdb_row_list.append(x)
    return imdb_row_list

In [None]:
nonmatched_imdb['all_names'] = nonmatched_imdb.apply(lambda row: imdb_all_names(row), axis=1)
nonmatched_imdb

In [None]:
nonmatched_gb['all_names'] = nonmatched_gb.apply(lambda row: gb_all_names(row), axis=1)
nonmatched_gb

In [None]:
def gb_all_names(gb_row):
    gb_row_list = [gb_row["name"]]
    for alias in gb_row.aliases:
        if type(alias) == str:
            gb_row_list.append(alias) 
    return gb_row_list


In [None]:
nonmatched_imdb['all_names'] = nonmatched_imdb['all_names'].apply(lambda x: list(set(x)))
nonmatched_gb['all_names'] = nonmatched_gb['all_names'].apply(lambda x: list(set(x)))

In [None]:
nonmatched_imdb = nonmatched_imdb.explode("all_names")
nonmatched_gb = nonmatched_gb.explode("all_names")

In [None]:
nonmatch_gb_list = nonmatched_gb.all_names.to_list()

In [None]:
def get_top_unmatches(imdb_row):
    return process.extract(imdb_row.all_names, nonmatch_gb_list, limit = 3)

In [None]:
time_at_start = time.perf_counter()
nonmatched_imdb['best_matches'] = nonmatched_imdb.apply(lambda row: get_top_unmatches(row), axis = 1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
nonmatched_imdb

In [None]:
time_at_start = time.perf_counter()
nonmatched_imdb['best_fit'] = nonmatched_imdb.apply(lambda row: best_fit(row), axis = 1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
time_at_start = time.perf_counter()
nonmatched_imdb[['best_fit_title', 'best_fit_ratio', 'best_fit_game_id']] = nonmatched_imdb['best_fit'].apply(lambda x: pd.Series([i for i in x]))
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
nonmatched_imdb = nonmatched_imdb.sort_values(["best_fit_ratio"])
nonmatched_imdb

In [None]:
nonmatched_imdb["best_fit_ratio"].value_counts()

In [None]:
nonmatched_imdb = nonmatched_imdb[nonmatched_imdb['best_fit_ratio'] >= 80]
nonmatched_imdb

In [None]:
copy_of_imdb_games['all_names'] = copy_of_imdb_games.apply(lambda row: imdb_all_names(row), axis=1)
copy_of_imdb_games

Combine all giantbomb possible names including aliases.

In [None]:
def gb_all_names(gb_row):
    gb_row_list = [gb_row["name"]]
    for alias in gb_row.aliases:
        if type(alias) == str:
            gb_row_list.append(alias) 
    return gb_row_list


In [None]:
copy_of_giantbomb_games['all_names'] = copy_of_giantbomb_games.apply(lambda row: gb_all_names(row), axis=1)
copy_of_giantbomb_games

Remove any duplicates in all the name lists

In [None]:
copy_of_imdb_games['all_names'] = copy_of_imdb_games['all_names'].apply(lambda x: list(set(x)))
copy_of_giantbomb_games['all_names'] = copy_of_giantbomb_games['all_names'].apply(lambda x: list(set(x)))

In [None]:
copy_of_giantbomb_games.sample()

In [None]:
copy_of_imdb_games.drop(["best_matches", "best_fit", "best_fit_title", "best_fit_ratio", "best_fit_game_id"])

## Attempt to use rapidFuzz

This seems to be the fastest way we can fuzzymatch across the entire dataframe (that I have found). We first explode all the giantbomb names, and the imdb names. Then convert the giantbomb names to a list. We can then run rapidfuzz for all the names in our imdb dataset against the giantbomb list and append the results to a best_matches column in our imdb dataframe. Once we have this we can 

In [None]:
copy_of_imdb_games = copy_of_imdb_games[~copied_imdb_mask].explode("all_names")
copy_of_giantbomb_games = copy_of_giantbomb_games[~copied_giantbomb_mask].explode("all_names")

In [None]:
giantbomb_remaining_list = copy_of_giantbomb_games.all_names.to_list()

In [None]:
from rapidfuzz import process, fuzz

In [None]:
def get_top_matches(imdb_row):
    return process.extract(imdb_row.all_names, giantbomb_remaining_list, limit = 3)

In [None]:
import time

In [None]:
time_at_start = time.perf_counter()
copy_of_imdb_games['best_matches'] = copy_of_imdb_games.apply(lambda row: get_top_matches(row), axis = 1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
copy_of_imdb_games[copy_of_imdb_games["tconst"]=="tt0383279"]

In [None]:
print(copy_of_imdb_games[copy_of_imdb_games["tconst"]=="tt0383279"]["best_matches"].iloc[0])
print(copy_of_imdb_games[copy_of_imdb_games["tconst"]=="tt0383279"]["best_matches"].iloc[1])
print(copy_of_imdb_games[copy_of_imdb_games["tconst"]=="tt0383279"]["best_matches"].iloc[2])

In [None]:
type(copy_of_imdb_games[copy_of_imdb_games["tconst"]=="tt0383279"]["best_matches"].iloc[0][0][0])

In [None]:
def best_fit(row):
    highest = 0
    best_match = ()
    for match in row["best_matches"]:
        if match[1] > highest:
            highest = match[1]
            best_match = match
    return best_match

In [None]:
time_at_start = time.perf_counter()
copy_of_imdb_games['best_fit'] = copy_of_imdb_games.apply(lambda row: best_fit(row), axis = 1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
copy_of_imdb_games[['best_fit_title', 'best_fit_ratio', 'best_fit_game_id']] = copy_of_imdb_games['best_fit'].apply(lambda x: pd.Series([i for i in x]))


In [None]:
copy_of_imdb_games = copy_of_imdb_games[copy_of_imdb_games['best_fit_ratio'] >= 90.5]

In [None]:
copy_of_imdb_games =copy_of_imdb_games.sort_values(["tconst", "best_fit_ratio"])
copy_of_imdb_games.head(50)

In [None]:
small_imdb_exploded['best_fit'] = small_imdb_exploded.apply(lambda row: best_fit(row), axis = 1)


In [None]:
small_imdb_exploded[['best_fit_title', 'best_fit_ratio', 'best_fit_game_id']] = small_imdb_exploded['best_fit'].apply(lambda x: pd.Series([i for i in x]))

    

In [None]:
frame = copy_of_imdb_games[copy_of_imdb_games["tconst"]=="tt7025920"]
frame

In [None]:
copy_of_giantbomb_games.game_id.count()

In [None]:
copy_of_imdb_games.to_csv("copy_of_imdb_games.csv")

In [None]:
copy_of_imdb_games = pd.read_csv("copy_of_imdb_games.csv")

In [None]:
copy_of_imdb_games['best_matches'] = copy_of_imdb_games["best_matches"].apply(lambda row: ast.literal_eval(row))
type(copy_of_imdb_games["best_matches"].iloc[0])

In [None]:
copy_of_imdb_games

In [None]:
copy_of_imdb_games['best_fit'] = copy_of_imdb_games.apply(lambda row: best_fit(row), axis = 1)
copy_of_imdb_games[['best_fit_title', 'best_fit_ratio', 'best_fit_game_id']] = copy_of_imdb_games['best_fit'].apply(lambda x: pd.Series([i for i in x]))
copy_of_imdb_games = copy_of_imdb_games.sort_values("best_fit_ratio")
copy_of_imdb_games

In [None]:
copy_of_imdb_games = copy_of_imdb_games.sort_values("best_fit_ratio")

In [None]:
copy_of_imdb_games[copy_of_imdb_games["tconst"] == "tt11696274"].iloc[1]["best_matches"]

In [None]:
copy_of_giantbomb_games[copy_of_giantbomb_games.index == 18179]

In [None]:
copy_of_imdb_games[copy_of_imdb_games["tconst"] == "tt11696274"]

In [None]:
copy_of_giantbomb_games

In [None]:
mergedDF[mergedDF["game_id"] == 48320]

In [None]:
copy_of_giantbomb_games[copy_of_giantbomb_games["game_id"] == "48320"]

In [None]:
copy_of_imdb_games[(copy_of_imdb_games["best_fit_ratio"] >97) &  (copy_of_imdb_games["best_fit_ratio"] <98)].head(50)

In [None]:
import numpy as np

In [None]:
small_imdb_exploded['best_matches'] = small_imdb_exploded.apply(lambda row: get_top_matches(row), axis = 1)


In [None]:
small_imdb_exploded

In [None]:
small_imdb_exploded["best_matches"].iloc[0]

In [None]:
def withlist(imdb_row):
    results_list = process.extract(imdb_row.all_names, giantbomb_list, limit = 3)
    return results_list

In [None]:
import time
time_at_start = time.perf_counter()
small_imdb_exploded['best_matches'] = small_imdb_exploded.apply(lambda row: withlist(row), axis=1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
small_imdb_exploded

In [None]:
giantbomb_exploded.iloc[22642]

In [None]:
small_imdb_exploded.best_matches.iloc[0]

In [None]:
def rapidfuzzymatch(imdb_row):
    results_list = []
    for index, row in giantbomb_games.iterrows():
        distances  = process.cdist(row["all_names"], imdb_row.all_names, score_cutoff = 87)
        top_indices = np.argsort(distances, axis=1)
        top_values = imdb_row[top_indices]
        print(top_values)
            
            #for result in results:
                #Get tuple value if above 95
            #    if result[1]>87:
            #        results_list.append(set([row["game_id"], row["name"], result[1]]))
    return results_list

In [None]:
import time
time_at_start = time.perf_counter()
small_imdb['best_matches'] = small_imdb.apply(lambda row: rapidfuzzymatch(row), axis=1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
small_imdb.best_matches.iloc[0]

## Attempting to use fuzzy matching

The next step is to attempt to fuzzy match the titles and the original titles from the imdb dataset against the giantbomb database

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

Fuzzy match attempt

In [None]:
def fuzzymatch(imdb_row):
    results_list = []
    for index, row in giantbomb_games.iterrows():
        for item in row["all_names"]:
            results = process.extract(item, imdb_row.all_names, limit = 3)
            for result in results:
                #Get tuple value if above 95
                if result[1]>87:
                    results_list.append(set([row["game_id"], row["name"], result[1]]))
    return results_list


Testing on single entry

In [None]:
small_imdb = imdb_games[imdb_games["tconst"]== "tt7990520"]
small_imdb

In [None]:
import time
time_at_start = time.perf_counter()
small_imdb['best_matches'] = small_imdb.apply(lambda row: fuzzymatch(row), axis=1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

In [None]:
small_imdb.best_matches.iloc[0]

In [None]:
small_imdb

### Next attempt, explode the dataframes, then do fuzzymatch

In [None]:
small_imdb_exploded = small_imdb.explode("all_names")

In [None]:
small_imdb_exploded

In [None]:
def fuzzymatch_exploded(imdb_row):
    results_list = []
    for index, row in giantbomb_exploded.iterrows():
        results = process.extract(row["all_names"], imdb_row.all_names, limit = 3)
        for result in results:
            #Get tuple value if above 95
            if result[1]>87:
                results_list.append(set([row["game_id"], row["name"], result[1]]))
    return results_list

In [None]:
imdb_exploded = imdb_games.explode("all_names")
imdb_exploded.count()

In [None]:
giantbomb_exploded = giantbomb_games.explode("all_names")
giantbomb_exploded.count()

In [None]:
import time
time_at_start = time.perf_counter()
small_imdb['exploded_best_matches'] = small_imdb.apply(lambda row: fuzzymatch_exploded(row), axis=1)
time_at_end = time.perf_counter()
print(time_at_end - time_at_start)

## Attempting to use difflib

In [None]:
string =small_imdb_exploded.all_names.iloc[0]
close_matches = difflib.get_close_matches(string, possibilities=giantbomb_exploded['all_names'].tolist(), n=10)


In [None]:
close_matches

In [None]:
# https://stackoverflow.com/questions/56521625/quicker-way-to-perform-fuzzy-string-match-in-pandas

import difflib
from functools import partial

f = partial(difflib.get_close_matches, possibilities=giantbomb_exploded['all_names'].tolist(), n=1)

matches = small_imdb_exploded['all_names'].map(f).str[0].fillna('')
scores = [difflib.SequenceMatcher(None, x, y).ratio() 
    for x, y in zip(matches, small_imdb['all_names'])
]

small_imdb.assign(best=matches, score=scores)

# Consider exploding the all_names, then combining back but keeping the one with the best ratio?

## First attempt to merge, purely using dataframes

In [None]:
mergedDF = pd.merge(copy_of_imdb_games, copy_of_giantbomb_games, left_on=["primaryTitle", "startYear"], right_on=["name", "release_year"], how="inner")
#copy_of_imdb_games = copy_of_imdb_games[~copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])]
#copy_of_giantbomb_games = copy_of_giantbomb_games[~copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])]
print("Total imdb: {}, Total giantbomb: {}, Total in df: {}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count(), mergedDF.tconst.count()))

In [None]:
Now match any where the names directly match, and the release_year is 0

In [None]:
mergedDF2 = pd.merge(copy_of_imdb_games, copy_of_giantbomb_games[copy_of_giantbomb_games['release_year'] == "0"], left_on="primaryTitle", right_on="name", how="inner")
copy_of_imdb_games = copy_of_imdb_games[~copy_of_imdb_games['tconst'].isin(mergedDF2['tconst'])]
copy_of_giantbomb_games = copy_of_giantbomb_games[~copy_of_giantbomb_games['game_id'].isin(mergedDF2['game_id'])]
print("Total imdb: {}, Total giantbomb: {}, Total in df2: {}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count(), mergedDF2.tconst.count()))

In [None]:
mergedDF2[mergedDF2["primaryTitle"]=="Alice in Wonderland"]

In [None]:
mergedDF3 = pd.merge(copy_of_imdb_games, copy_of_giantbomb_games, left_on="originalTitle", right_on="name", how="inner")
copy_of_imdb_games = copy_of_imdb_games[~copy_of_imdb_games['tconst'].isin(mergedDF3['tconst'])]
copy_of_giantbomb_games = copy_of_giantbomb_games[~copy_of_giantbomb_games['game_id'].isin(mergedDF3['game_id'])]
print("Total imdb: {}, Total giantbomb: {}, Total in df3: {}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count(), mergedDF3.tconst.count()))

In [None]:
mergedDF4 = pd.concat([mergedDF, mergedDF2, mergedDF3], axis=0)
mergedDF4

In [None]:
temp = imdb_duplicates[~(imdb_duplicates["primaryTitle"]== imdb_duplicates["originalTitle"])]

In [None]:
temp = pd.merge(imdb_duplicates, giantbomb_games, left_on=['primaryTitle'], right_on=['name'], how='inner')
temp

In [None]:
giantbomb_games.sort_values("game_id").head()

In [None]:
tempdb = pd.merge(imdb_duplicates, giantbomb_games, left_on=['primaryTitle'], right_on=['name'], how='inner')
tempdb

In [None]:
for index1 in imdb_duplicates.primaryTitle:
    #rint(index1)
    value = fuzz.token_sort_ratio("bction 2", index1)
    if value > 80:
        print(index1)
        print(value)

In [None]:
imdb_duplicates.head(500)

In [None]:
for index1 in imdb_duplicates.primaryTitle:
    #rint(index1)
    value = fuzz.token_sort_ratio("bction 2", index1)
    if value > 80:
        print(index1)
        print(value)

In [None]:
imdb_duplicates

In [None]:
imdb_duplicates = imdb_duplicates[imdb_duplicates['primaryTitle'].isin(giantbomb_games['name'])]
imdb_duplicates

Create dictionary with count for each duplicate

In [None]:
imdb_duplicate_counts

In [None]:
imdb_duplicate_counts = imdb_duplicates.groupby('primaryTitle').size()
imdb_duplicate_dict = {k:v for k,v in imdb_duplicate_counts.items() if v > 1}
imdb_duplicate_dict

In [None]:
giantbomb_games[giantbomb_games["name"] == "Worms"]


Note here, we are only going to consider direct matches of a name for brevity's sake. If giantbomb contains no records that match the name of an imdb duplicate, we will assume the duplicates are not part of a franchise.

In [None]:
for key, value in imdb_duplicate_dict.items():
    if giantbomb_games.name[giantbomb_games["name"] == key].count() == 0:
        # Game is not a franchise game, and can be kept in original imdb dataframe
        print(key)
        #imdb_duplicates = imdb_duplicates.drop(index=imdb_duplicates.index[imdb_duplicates['primaryTitle'] == key])
imdb_duplicates.count()

In [None]:
imdb_duplicates.count()

In [None]:
Action 52 is not in giantbomb games so we can keep in our original database as it's a unique game. Therefore we need to drop it from our dups dataframe

This leaves us with 411 duplicates, which exist in the imdb database but we may not be able to accurately match them with the 

WE ARE CREATING A DUPS DATAFRAME WHICH WILL BE ALL THE DUPLICATES WE CAN'T DISTINGUISH BETWENN. IF WE CAN DISTINGUISH, WE DROP FROM THE DUPS DATABASE. IF IMDB GAMES NAME DOES NOT APPEAR IN GIANTBOMB GAMES NAME, DROP FROM DUPLICATES LIST

sO FAR WE HAVE DONE ENOUGH. wE JUST NEED TO DO FUZZY MATCHING ON THE NAMES TO COMPARE.EG. ADAM'S VENTURE EXISTS IN BOTH BUT NEEDS FUZZY MATCHING AGAINST ORIGINAL TITLE

In [None]:
imdb_duplicates = imdb_duplicates[~imdb_duplicates['tconst'].isin(matching_imdb_dups['tconst'])]


In [None]:
for key, value in imdb_duplicate_dict.items():
    if giantbomb_games.name[giantbomb_games["name"] == key].count() == 0:
        # Game is not a franchise game, and can be kept in original imdb dataframe
        imdb_duplicates = imdb_duplicates.drop(index=imdb_duplicates.index[imdb_duplicates['primaryTitle'] == key])
    elif giantbomb_games.name[giantbomb_games["name"] == key].count() != value:
        #matching_ids holds index values of duplicates
        matching_imdb_ids=imdb_duplicates.index[imdb_duplicates['primaryTitle'] == key]
        matching_gb_ids = giantbomb_games.index[giantbomb_games['name'] == key]
        if 
        #for id in matching_imdb_ids:
            #This extracts the year
         #   year =imdb_duplicates.startYear[imdb_duplicates.index[imdb_duplicates.index == id]].item()
          #  if giantbomb_games.name[giantbomb_games["name"] == key] and giantbomb_games.release_year[giantbomb_games["release_year"] == year]:
           #     for 
            #    print("Great success!", year)
           # else:
            #    print("poo")
        
        print(key, value, giantbomb_games.name[giantbomb_games["name"] == key].count())

In [None]:
giantbomb_games[giantbomb_games.duplicated(["name"], keep=False)].count()

In [None]:
import re
temp = imdb_games[imdb_games["primaryTitle"].str.contains("Batman", na=False, flags=re.IGNORECASE, regex=True)]
temp

In [None]:
import re
temp2 = giantbomb_games[giantbomb_games["name"].str.contains("Adam's", na=False, flags=re.IGNORECASE, regex=True)]
temp2

In [None]:
duplicate_giantbomb_names = giantbomb_games[giantbomb_games.duplicated(["name", "release_year"], keep=False)]
duplicate_giantbomb_names = duplicate_giantbomb_names.sort_values("name")
duplicate_giantbomb_names

Get list of all game_ids

In [None]:
game_id_list = duplicate_giantbomb_names.index.to_list()

for each item in the list, get the corresponding json file. Clean to extract necessary data (esp. year)

In [None]:
for game_id, api_url in game_api_dict.items():
    filename = "{}.json".format(game_id)
    if not Path(game_dir, filename).is_file():

        

In [None]:
print(len(release_years))

In [None]:
num_nans = id_year_df.isna().sum().sum()
print(num_nans)

In [None]:
# group the data by name and count the number of occurrences
grouped = giantbomb_games.groupby(['name']).size().reset_index(name='count')

# select the rows where count > 1, indicating a duplicate name
duplicates = grouped[grouped['count'] > 1]

# select the rows with duplicate name and different game_id
duplicates = giantbomb_games[giantbomb_games['name'].isin(duplicates['name'])].groupby('name').filter(lambda x: x['game_id'].nunique() > 1)

# print the duplicate rows, sorted by name
if len(duplicates) > 0:
    print("Found {} rows with same name but different game_id:".format(len(duplicates)))
    print(duplicates.sort_values(by=['name', 'game_id']))
else:
    print("No rows with same name but different game_id found.")


In [None]:
temp2 = temp[temp.duplicated(["name", "index"], keep=False)]
temp2 = temp2.sort_values("name")
temp2

In [None]:
a = giantbomb_games[giantbomb_games["name"].str.contains('Sqrxz', na=False, flags=re.IGNORECASE, regex=True)]
a

The first thing I am going to do is check for any titles that appear only once in both dataframes and that have matching titles. With this criteria, we can assume that the games are the same. 

In [None]:
unmatching_rows = imdb_games[~imdb_games['primaryTitle'].isin(giantbomb_games['name'])]

In [None]:
try_this = unmatching_rows[unmatching_rows['originalTitle'].isin(giantbomb_games['name'])]
try_this.count()

In [None]:
import re
temp = giantbomb_games[giantbomb_games["name"].str.contains('Infernal', na=False, flags=re.IGNORECASE, regex=True)]
temp.head(50)

My current plan for the most accurate way to approach this:

First check for duplicate names in both datasets. If there is a duplicate name in either dataset, we need to check whether this is because there are 2 games with the same name. If this is the case, we need a way to distinguish between the 2. I would anticipate year of release could work.

The next step would be comparing names against alternate titles. After this, any that still have no matches, would be subject to fuzzy matching.I'd anticipate at least 75% of the games in the imdb dataset should be contained in giantbomb's database.

In [None]:
imdb_dups = imdb_games[imdb_games.duplicated(['primaryTitle', 'startYear'], keep=False)]
imdb_dups = imdb_dups.sort_values("primaryTitle")
print(imdb_dups)