In [75]:
import pandas as pd
from pathlib import Path

First, read in the imdb data and the giantbomb data as dataframes and get a count

In [76]:
imdb_games = pd.read_csv("imdb_games_db.csv", index_col = 0)
giantbomb_games = pd.read_csv("clean_giantbomb_games_db.csv", index_col = 0)

In [3]:
print("Total imdb_games: {} \nTotal giantbomb_games: {}".format(imdb_games.shape[0], giantbomb_games.shape[0]))

Total imdb_games: 14435 
Total giantbomb_games: 35117


We'll remove any exact duplicates from both dataframes

In [77]:
imdb_games = imdb_games.drop_duplicates()
giantbomb_games = giantbomb_games.drop_duplicates()
print("Total imdb_games: {} \nTotal giantbomb_games: {}".format(imdb_games.shape[0], giantbomb_games.shape[0]))

Total imdb_games: 14435 
Total giantbomb_games: 35116


Have a look at what our dataframes contain:

In [5]:
imdb_games.sample()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
1631306,tt11191330,videoGame,Asterix,Asterix,0,1983,\N,\N,Action,5.5,7.0,['Asterix']


In [6]:
giantbomb_games.sample()

Unnamed: 0,franchise_id,game_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
32102,3025-736,9228,Parasite Eve,,PE,"['Squaresoft', 'Square Visual Works']","['Action', 'Adventure', 'Role-Playing']","['PlayStation', 'PlayStation Network (PS3)', '...","['Squaresoft', 'Square EA']",['ESRB: M']


### Remove imdb duplicates

We want to remove any duplicates from our imdb data that contain the same name and year. We save the one with the highest number of votes

In [78]:
imdb_games = imdb_games.sort_values(by=["primaryTitle", "startYear", "numVotes"])
imdb_games = imdb_games.drop_duplicates(subset=["primaryTitle", "startYear"], keep="last")
imdb_games

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
7212109,tt4354918,videoGame,#IDARB (It Draws a Red Box),#IDARB (It Draws a Red Box),0,2015,\N,\N,Action,5.8,25.0,['#IDARB (It Draws a Red Box)']
5075818,tt1968978,videoGame,'88 Games,Hyper Sports Special,0,1988,\N,\N,Sport,6.0,14.0,"[""Track & Field '88"", 'Hyper Sports Special', ..."
367403,tt0383279,videoGame,"'Goodbye, Galaxy!' Episode IV: Secret of the O...","'Goodbye, Galaxy!' Episode IV: Secret of the O...",0,1991,\N,\N,"Action,Adventure,Sci-Fi",7.7,102.0,"[""'Goodbye, Galaxy!' Episode IV: Secret of the..."
367404,tt0383280,videoGame,"'Goodbye, Galaxy!' Episode V: The Armageddon M...","'Goodbye, Galaxy!' Episode V: The Armageddon M...",0,1991,\N,\N,"Action,Adventure,Sci-Fi",7.4,82.0,"[""'Goodbye, Galaxy!' Episode V: The Armageddon..."
4983813,tt1918633,videoGame,.detuned,.detuned,0,2009,\N,\N,Music,2.3,28.0,"['.detuned', 'detuned: Gumi senpai no fushigi ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
8014707,tt6169512,videoGame,osu!,osu!,0,2007,\N,\N,"Music,Musical",7.4,107.0,['osu!']
295707,tt0308989,videoGame,ssn,ssn,0,1996,\N,\N,\N,7.9,13.0,['ssn']
770432,tt0795512,videoGame,Æon Flux,Æon Flux,0,2005,\N,\N,"Action,Adventure,Sci-Fi",5.6,265.0,['Æon Flux']
2185219,tt1219283,videoGame,Îhatôvo monogatari,Îhatôvo monogatari,0,1993,\N,\N,Adventure,7.0,6.0,"['Ihatovo Story', 'Îhatôvo monogatari']"


In [79]:
imdb_games.primaryTitle[imdb_games.duplicated(["primaryTitle", "startYear"], keep=False)].count()

0

### Remove giantbomb duplicates

As each game_id is unique, we want to group all the franchises for each game into a single result for each game_id. First though we check whether any of the the rows have the same game_id but a different name or release_year

In [80]:
giantbomb_games.count()

franchise_id    35116
game_id         35116
name            35116
release_year     5435
aliases          8088
developers      32746
genres          33660
platforms       34802
publishers      33604
rating          14184
dtype: int64

In [81]:
print(giantbomb_games.groupby("game_id").filter(lambda x: x["name"].nunique() > 1 or x["release_year"].nunique() > 1)["game_id"].count())

0


The next thing will be to remove any franchises which only contain 1 or 2 entries. Though they may technically count as a franchise, they do nothing to help us with analysis on the basis of longevity so we'll treat them as if they don't belong in a franchise

In [92]:
giantbomb_games.game_id.count()

35116

In [113]:
franchise_counts = giantbomb_games.groupby('franchise_id').size()
franchise_dict = {k:v for k,v in franchise_counts.items() if v < 3}
franchise_ids_to_drop = list(franchise_dict.keys())

mask = giantbomb_games['franchise_id'].isin(franchise_ids_to_drop)
giantbomb_games = giantbomb_games.drop(index=giantbomb_games[mask].index)

In [115]:
giantbomb_games.count()

franchise_id    31529
game_id         31529
name            31529
release_year     4668
aliases          7423
developers      29402
genres          30205
platforms       31251
publishers      30282
rating          13149
dtype: int64

The first thing to do is to combine all the games that have an identical franchise_id, release_year and name. With this criteria we will assume they are all the same game released on different platforms for example. We want to consider the release_year even when it is a nan value, so we convert all nan values to 0 accommodate this. Once we've done this, we group by game_id to combine the franchises for each individual game

In [117]:
giantbomb_games = giantbomb_games.groupby(["name", "franchise_id", giantbomb_games["release_year"].fillna(0)]).agg({
    "aliases": lambda x: list(set(x)),
    "developers": lambda x: list(set(x)),
    "genres": lambda x: list(set(x)),
    "platforms": lambda x: list(set(x)),
    "publishers": lambda x: list(set(x)),
    "rating": lambda x: list(set(x)),
    "game_id": "first"
}).reset_index().drop_duplicates(["game_id", "name", "franchise_id", "release_year"], keep="first")

giantbomb_games = giantbomb_games.groupby('game_id').agg({
    'franchise_id': lambda x: list(x),
    'name': 'first',
    'release_year': 'first',
    'aliases': 'first',
    'developers': 'first',
    'genres': 'first',
    "platforms": "first",
    "publishers" : "first",
    "rating": "first"
})

# Change year to string to match imdb database
giantbomb_games["release_year"] = giantbomb_games["release_year"].apply(lambda x : str(x).split('.')[0])

In [118]:
giantbomb_games.sort_values("game_id")

Unnamed: 0_level_0,franchise_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,"[3025-143, 3025-2573]",Desert Strike: Return to the Gulf,1992,[Desert Strike Advance],"[['Electronic Arts', 'Visual Concepts', 'Budca...","[['Action', ""Shoot 'Em Up""]]","[['Amiga', 'Game Boy', 'Game Boy Advance', 'Ga...","[['Electronic Arts', 'Domark Software', 'Malib...","[['ESRB: K-A', 'ESRB: E']]"
3,[3025-2060],Hyperballoid Deluxe: Survival Pack,0,[nan],[['Kernel Kaput']],"[['Action', 'Block-Breaking']]",[['PC']],"[['Alawar Entertainment, Inc.']]",[nan]
4,[3025-1110],The Chessmaster 2000,1986,[nan],"[['The Software Toolworks', 'Software Country'...","[['Strategy', 'Trivia/Board Game']]","[['Amiga', 'Amstrad CPC', 'Apple II', 'Atari S...","[['The Software Toolworks', 'Software Country'...",[nan]
6,"[3025-128, 3025-130, 3025-1372]",WWE SmackDown! vs. RAW 2007,0,[SVR 2007],"[[""Yuke's Co. Ltd."", 'Digital Hearts Co., Ltd....","[['Action', 'Sports', 'Wrestling']]","[['PlayStation Portable', 'PlayStation 2', 'Xb...",[['THQ']],"[['ESRB: T', 'PEGI: 16+', 'CERO: C']]"
8,[3025-335],Super Spy Hunter,0,[Battle Formula],[['Tokai Engineering']],[['Vehicular Combat']],[['Nintendo Entertainment System']],[['Sunsoft']],[nan]
...,...,...,...,...,...,...,...,...,...
88822,[3025-2291],Winning Post 10,0,[nan],[nan],[['Driving/Racing']],"[['PC', 'PlayStation 4', 'Nintendo Switch', 'P...",[['Koei Tecmo']],[['CERO: A']]
88824,[3025-5696],Wan Nyan Dōbutsu Byōin,0,[nan],[nan],[['Simulation']],[['Game Boy Advance']],[['TDK Core']],[['CERO: All Ages']]
88831,[3025-5699],Shogi Saikyou: Pro ni Manabu,0,[nan],[['Magical Company']],[['Trivia/Board Game']],[['PlayStation']],[['Magical Company']],[nan]
88834,[3025-383],The Murder of Sonic the Hedgehog,0,[nan],[['Sega']],[['Adventure']],"[['Mac', 'PC']]",[['Sega']],[nan]


We can now get a record of the number of duplicate names in the data

In [119]:
giantbomb_games.name[giantbomb_games.duplicated(["name"], keep=False)].count()

560

There is 594 duplicate names. Adding the criteria to distinguish by both name and release year returns the following:

In [120]:
giantbomb_games.name[giantbomb_games.duplicated(["name", "release_year"], keep=False)].count()

72

The release year needs to be converted to a string so we can compare it to the imdb database. We also need to tmove trailing decimal points.

In [121]:
giantbomb_games["release_year"] = giantbomb_games["release_year"].apply(lambda x : str(x).split('.')[0])

Unfortunately, at this stage, this is the furthest we can go for dealing with duplicates. The imdb datasets do not provide any additional information that can be corroborated across the two dataframes. As such there is no way to distinguish between a duplicate in the imdb dataset if the name and year are not unique in the giantbomb dataset. From this point, we now need to remove any duplicates in the imdb dataset, where there is not a clear distinction within the giantbomb dataset. The first thing I will do is isolate the duplicates. Then drop from the duplicates dataframe anywhere we can clearly identify which version of a game that duplicate is. Once we've finished we'll be left with a small dataframe of duplicates which we have tried all methods to identify the original game and failed. Every record left in the small dataframe will be dropped from our full imdb dataframe

Isolate the duplicates:

In [122]:
imdb_duplicates = imdb_games[imdb_games.duplicated(["primaryTitle"], keep=False)]

In [123]:
imdb_duplicates.tconst.count()

599

Remove any duplicate names we have a direct match for name and year.

In [124]:
matching_imdb_dups = pd.merge(imdb_games, giantbomb_games, left_on=['primaryTitle', 'startYear'], right_on=['name', 'release_year'], how='inner')
matching_imdb_dups

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,averageRating,...,akas,franchise_id,name,release_year,aliases,developers,genres_y,platforms,publishers,rating
0,tt1968978,videoGame,'88 Games,Hyper Sports Special,0,1988,\N,\N,Sport,6.0,...,"[""Track & Field '88"", 'Hyper Sports Special', ...",[3025-1053],'88 Games,1988,[Konami '88\r\nHyper Sports Special],[['Konami']],[['Track & Field']],[['Arcade']],[['Konami']],[nan]
1,tt0490765,videoGame,007: Licence to Kill,007: Licence to Kill,0,1989,\N,\N,"Action,Adventure,Thriller",6.2,...,"['Licence to Kill', 'Lizenz zum Töten', '007: ...",[3025-369],007: Licence to Kill,1989,[Con Licencia para Matar\nPermis de Tuer\nLize...,"[['Quixel', 'Consult Software Ltd.']]","[['Action', 'Driving/Racing', 'Shooter']]","[['Amiga', 'Amstrad CPC', 'Atari ST', 'Commodo...","[['Domark Software', 'Erbe Software, S.A.', 'T...",[nan]
2,tt0185834,videoGame,10-Yard Fight,10-Yard Fight,0,1983,\N,\N,Sport,4.2,...,"['Ten Yado Faito', ""10-Yard Fight '85"", '10-Ya...",[3025-3081],10-Yard Fight,1983,[nan],[['Irem Corp.']],[['Football']],"[['MSX', 'Nintendo Entertainment System', 'Arc...","[['Irem Corp.', 'Nintendo', 'Taito Corporation...",[nan]
3,tt1288449,videoGame,18 Wheels of Steel: Across America,18 Wheels of Steel: Across America,0,2003,\N,\N,Family,6.8,...,['18 Wheels of Steel: Across America'],[3025-747],18 Wheels of Steel: Across America,2003,[nan],[['SCS Software']],"[['Driving/Racing', 'Simulation']]",[['PC']],[['Akella']],"[['ESRB: E', 'PEGI: 3+']]"
4,tt0185835,videoGame,1942,1942,0,1984,\N,\N,"Action,War",6.8,...,"['1942', 'Supercharger 1942']",[3025-11],1942,1984,[Capcom Arcade Cabinet: 1942\n1942 MOBILE],"[['Capcom', 'Micronics', 'Elite Systems Ltd.',...","[[""Shoot 'Em Up""]]","[['Amstrad CPC', 'Commodore 64', 'MSX', 'ZX Sp...","[['Capcom', 'Zafiro Software Division', 'Romst...",[['ESRB: E']]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659,tt0896586,videoGame,Zoo Tycoon 2: African Adventure,Zoo Tycoon 2: African Adventure,0,2006,\N,\N,\N,7.0,...,['Zoo Tycoon 2: African Adventure'],[3025-107],Zoo Tycoon 2: African Adventure,2006,[nan],"[['Blue Fang Games, LLC']]","[['Strategy', 'Simulation', 'Educational']]",[['PC']],[['Xbox Game Studios']],[['ESRB: E']]
660,tt0910982,videoGame,Zoo Tycoon 2: Endangered Species,Zoo Tycoon 2: Endangered Species,0,2005,\N,\N,\N,7.2,...,['Zoo Tycoon 2: Endangered Species'],[3025-107],Zoo Tycoon 2: Endangered Species,2005,[nan],"[['Blue Fang Games, LLC']]","[['Strategy', 'Simulation', 'Educational']]",[['PC']],[['1C Company']],"[['ESRB: E', 'PEGI: 3+']]"
661,tt0217905,videoGame,Zool,Zool,0,1992,\N,\N,"Action,Adventure,Sci-Fi",6.6,...,"['Zool', 'Zool: Ninja of the Nth Dimension', '...",[3025-768],Zool,1992,[Ninja from the Nth dimension\r\nZool no Yume ...,"[['Gremlin Interactive Ltd.', 'Cygnus Software...",[['Platformer']],"[['Amiga', 'Game Boy', 'Game Gear', 'Genesis',...","[['GameTek, Inc.', 'Gremlin Interactive Ltd.',...",[nan]
662,tt2976208,videoGame,Zool 2,Zool 2,0,1993,\N,\N,"Action,Adventure",7.1,...,['Zool 2'],[3025-768],Zool 2,1993,[nan],"[['Gremlin Interactive Ltd.', 'Warp Factory, T...","[['Action', 'Platformer']]","[['Amiga', 'Jaguar', 'Amiga CD32', 'PC']]","[['Gremlin Interactive Ltd.', 'Atari Games', '...",[nan]


In [125]:
tempMask = ~imdb_duplicates['tconst'].isin(matching_imdb_dups['tconst'])
imdb_duplicates = imdb_duplicates[tempMask]
imdb_duplicates.tconst.count()

518

Remove any duplicates from our giantbomb database where we have no record of the year of release

In [170]:
gb_duplicate_names = giantbomb_games.duplicated(subset='name', keep=False)
release_year_int = giantbomb_games['release_year'].astype(int).eq(0)
gb_duplicates = giantbomb_games.loc[gb_duplicate_names & release_year_int]
#Having got all the duplicates with no release year, we'll drop them from the dataframe
giantbomb_games = giantbomb_games[~giantbomb_games.index.isin(gb_duplicates.index)]

At this stage, we are at an impasse with the imdb duplicates. We have no way to identify them from the information that we can cross-reference with the giantbomb data. If we attempt to match on the basis of name, then every duplicate entry in imdb will be considered to be the same game in the giantbomb dataset, which will be erroneous. If we arbritrarily remove one of the imdb duplicates, then the matched giantbomb entry may be for the wrong imdb game. As such, the only way we can try to prevent this from skewing the data is to completely drop any of those which we can't conclusively identify.

In [177]:
imdb_games = imdb_games[~imdb_games['tconst'].isin(imdb_duplicates['tconst'])]
imdb_games.tconst.count()

13866

This leaves me with 13866 games in the imdb dataset to play with. The next step is to combine the imdb dataset with the giantbomb

In [180]:
giantbomb_games.name.count()

25367

In [179]:
giantbomb_games.name.nunique()

25287

The way I am going to do this is to first combine the rows where the year and name matches. Then combine any remaining rows where the name matches but the year doesn't match.

In [205]:
copy_of_imdb_games = imdb_games
copy_of_giantbomb_games = giantbomb_games.reset_index()
print("Total imdb:{}, Total giantbomb:{}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count()))

Total imdb:13866, Total giantbomb:25367


In [206]:
mergedDF = pd.merge(copy_of_imdb_games, copy_of_giantbomb_games, left_on=["primaryTitle", "startYear"], right_on=["name", "release_year"], how="inner")
copy_of_imdb_games = copy_of_imdb_games[~copy_of_imdb_games['tconst'].isin(mergedDF['tconst'])]
copy_of_giantbomb_games = copy_of_giantbomb_games[~copy_of_giantbomb_games['game_id'].isin(mergedDF['game_id'])]
print("Total imdb: {}, Total giantbomb: {}, Total in df: {}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count(), mergedDF.tconst.count()))

Total imdb: 13202, Total giantbomb: 24703, Total in df: 664


In [207]:
copy_of_imdb_games.primaryTitle[copy_of_imdb_games.duplicated(["primaryTitle"], keep=False)].count()

0

In [220]:
copy_of_giantbomb_games[copy_of_giantbomb_games.duplicated(["name"], keep=False)].sort_values("release_year").head(30)

Unnamed: 0,game_id,franchise_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
13822,36496,[3025-4152],Heavyweight Champ,1976,[nan],[['Sega']],[['Boxing']],[['Arcade']],[['Sega']],[nan]
10472,26738,[3025-4002],Connect Four,1979,[nan],[['Milton Bradley Co.']],"[['Trivia/Board Game', 'Puzzle']]","[['TI-99/4A', 'Microvision']]",[['Milton Bradley Co.']],[nan]
11173,28623,[3025-3332],Flash Gordon,1980,[nan],"[['Bally Mfg. Corp.', 'Midway Games']]",[['Pinball']],[['Pinball']],[['Bally Mfg. Corp.']],[nan]
15874,42189,[3025-25],Alien,1981,[Super Alien],[['Commodore']],"[['Action', 'Strategy', 'Puzzle']]",[['VIC-20']],[['Commodore']],[nan]
750,1962,[3025-2875],King Kong,1982,[nan],[['Tigervision']],[['Action']],[['Atari 2600']],[['Tigervision']],[nan]
7903,20782,[3025-441],Q*Bert,1982,[nan],"[['Gottlieb', 'Konami', 'Tsukuda Original']]","[['Action', 'Puzzle']]","[['Commodore 64', 'Nintendo Entertainment Syst...","[['Gottlieb', 'Ultra Games', 'Konami']]",[['ESRB: E']]
21690,67671,[3025-2875],King Kong,1983,[nan],[nan],[['Platformer']],[['MicroBee']],[nan],[nan]
1132,2957,[3025-3332],Flash Gordon,1983,[Spider City\r\nF18 (Vs) Aliens\r\nSpace Adven...,"[['Sirius Software, Inc.']]","[[""Shoot 'Em Up""]]","[['Atari 8-bit', 'VIC-20', 'Atari 2600']]","[['20th Century Fox', 'Sirius Software, Inc.',...",[nan]
23743,80398,[3025-3170],Ashita no Joe,1983,[nan],[['System in Neko']],[['Adventure']],"[['NEC PC-8801', 'FM-7']]","[['CSK Software Products', 'Filcom']]",[nan]
5961,15637,[3025-328],Spy vs. Spy,1984,[nan],"[['First Star Software, Inc.']]","[['Action', 'Strategy']]","[['Amiga', 'Sega Master System', 'Amstrad CPC'...","[['First Star Software, Inc.', 'Kemco']]",[nan]


In [213]:
mergedDF2 = pd.merge(copy_of_imdb_games, copy_of_giantbomb_games, left_on="primaryTitle", right_on="name", how="inner")
copy_of_imdb_games = copy_of_imdb_games[~copy_of_imdb_games['tconst'].isin(mergedDF2['tconst'])]
copy_of_giantbomb_games = copy_of_giantbomb_games[~copy_of_giantbomb_games['game_id'].isin(mergedDF2['game_id'])]
print("Total imdb: {}, Total giantbomb: {}, Total in df2: {}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count(), mergedDF2.tconst.count()))

Total imdb: 7759, Total giantbomb: 19252, Total in df2: 5451


In [221]:
copy_of_giantbomb_games[copy_of_giantbomb_games.duplicated(["name"], keep=False)].sort_values("release_year").head(30)

Unnamed: 0,game_id,franchise_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
13822,36496,[3025-4152],Heavyweight Champ,1976,[nan],[['Sega']],[['Boxing']],[['Arcade']],[['Sega']],[nan]
10472,26738,[3025-4002],Connect Four,1979,[nan],[['Milton Bradley Co.']],"[['Trivia/Board Game', 'Puzzle']]","[['TI-99/4A', 'Microvision']]",[['Milton Bradley Co.']],[nan]
11173,28623,[3025-3332],Flash Gordon,1980,[nan],"[['Bally Mfg. Corp.', 'Midway Games']]",[['Pinball']],[['Pinball']],[['Bally Mfg. Corp.']],[nan]
15874,42189,[3025-25],Alien,1981,[Super Alien],[['Commodore']],"[['Action', 'Strategy', 'Puzzle']]",[['VIC-20']],[['Commodore']],[nan]
750,1962,[3025-2875],King Kong,1982,[nan],[['Tigervision']],[['Action']],[['Atari 2600']],[['Tigervision']],[nan]
7903,20782,[3025-441],Q*Bert,1982,[nan],"[['Gottlieb', 'Konami', 'Tsukuda Original']]","[['Action', 'Puzzle']]","[['Commodore 64', 'Nintendo Entertainment Syst...","[['Gottlieb', 'Ultra Games', 'Konami']]",[['ESRB: E']]
21690,67671,[3025-2875],King Kong,1983,[nan],[nan],[['Platformer']],[['MicroBee']],[nan],[nan]
1132,2957,[3025-3332],Flash Gordon,1983,[Spider City\r\nF18 (Vs) Aliens\r\nSpace Adven...,"[['Sirius Software, Inc.']]","[[""Shoot 'Em Up""]]","[['Atari 8-bit', 'VIC-20', 'Atari 2600']]","[['20th Century Fox', 'Sirius Software, Inc.',...",[nan]
23743,80398,[3025-3170],Ashita no Joe,1983,[nan],[['System in Neko']],[['Adventure']],"[['NEC PC-8801', 'FM-7']]","[['CSK Software Products', 'Filcom']]",[nan]
5961,15637,[3025-328],Spy vs. Spy,1984,[nan],"[['First Star Software, Inc.']]","[['Action', 'Strategy']]","[['Amiga', 'Sega Master System', 'Amstrad CPC'...","[['First Star Software, Inc.', 'Kemco']]",[nan]


In [201]:
mergedDF3 = pd.merge(copy_of_imdb_games, copy_of_giantbomb_games, left_on="originalTitle", right_on="name", how="inner")
copy_of_imdb_games = copy_of_imdb_games[~copy_of_imdb_games['tconst'].isin(mergedDF3['tconst'])]
copy_of_giantbomb_games = copy_of_giantbomb_games[~copy_of_giantbomb_games['game_id'].isin(mergedDF3['game_id'])]
print("Total imdb: {}, Total giantbomb: {}, Total in df3: {}".format(copy_of_imdb_games.tconst.count(), copy_of_giantbomb_games.name.count(), mergedDF3.tconst.count()))

Total imdb: 7655, Total giantbomb: 19147, Total in df3: 105


In [204]:
mergedDF4 = pd.concat([mergedDF, mergedDF2, mergedDF3], axis=0)
mergedDF4

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,averageRating,...,game_id,franchise_id,name,release_year,aliases,developers,genres_y,platforms,publishers,rating
0,tt1968978,videoGame,'88 Games,Hyper Sports Special,0,1988,\N,\N,Sport,6.0,...,43935,[3025-1053],'88 Games,1988,[Konami '88\r\nHyper Sports Special],[['Konami']],[['Track & Field']],[['Arcade']],[['Konami']],[nan]
1,tt0490765,videoGame,007: Licence to Kill,007: Licence to Kill,0,1989,\N,\N,"Action,Adventure,Thriller",6.2,...,6264,[3025-369],007: Licence to Kill,1989,[Con Licencia para Matar\nPermis de Tuer\nLize...,"[['Quixel', 'Consult Software Ltd.']]","[['Action', 'Driving/Racing', 'Shooter']]","[['Amiga', 'Amstrad CPC', 'Atari ST', 'Commodo...","[['Domark Software', 'Erbe Software, S.A.', 'T...",[nan]
2,tt0185834,videoGame,10-Yard Fight,10-Yard Fight,0,1983,\N,\N,Sport,4.2,...,10166,[3025-3081],10-Yard Fight,1983,[nan],[['Irem Corp.']],[['Football']],"[['MSX', 'Nintendo Entertainment System', 'Arc...","[['Irem Corp.', 'Nintendo', 'Taito Corporation...",[nan]
3,tt1288449,videoGame,18 Wheels of Steel: Across America,18 Wheels of Steel: Across America,0,2003,\N,\N,Family,6.8,...,14758,[3025-747],18 Wheels of Steel: Across America,2003,[nan],[['SCS Software']],"[['Driving/Racing', 'Simulation']]",[['PC']],[['Akella']],"[['ESRB: E', 'PEGI: 3+']]"
4,tt0185835,videoGame,1942,1942,0,1984,\N,\N,"Action,War",6.8,...,10466,[3025-11],1942,1984,[Capcom Arcade Cabinet: 1942\n1942 MOBILE],"[['Capcom', 'Micronics', 'Elite Systems Ltd.',...","[[""Shoot 'Em Up""]]","[['Amstrad CPC', 'Commodore 64', 'MSX', 'ZX Sp...","[['Capcom', 'Zafiro Software Division', 'Romst...",[['ESRB: E']]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,tt1235898,videoGame,Worldwide Soccer Manager 2008,Football Manager 2008,0,2007,\N,\N,Sport,8.4,...,20784,[3025-652],Football Manager 2008,0,[Footy Manager 2008\nFootie Manager 2008\nFM08...,[['Sports Interactive Limited']],"[['Strategy', 'Sports', 'Simulation', 'Soccer']]","[['Mac', 'PlayStation Portable', 'Xbox 360', '...",[['Sega']],[nan]
101,tt1325744,videoGame,Worldwide Soccer Manager 2009,Football Manager 2009,0,2008,\N,\N,Sport,8.3,...,23498,[3025-652],Football Manager 2009,0,[Worldwide Soccer Manager 2009],[['Sports Interactive Limited']],"[['Sports', 'Simulation', 'Soccer']]","[['Mac', 'PlayStation Portable', 'PC']]",[['Sega']],"[['OFLC: G', 'ESRB: E']]"
102,tt7451148,videoGame,Ys I: Ancient Ys Vanished,Ys: The Vanished Omens,0,1987,\N,\N,"Action,Adventure,Fantasy",7.2,...,14318,[3025-112],Ys: The Vanished Omens,0,[Ys: Ancient Ys Vanished\nYs 1\nAncient Land o...,"[['Nihon Falcom Corp.', 'Advance Communication...","[['Action', 'Role-Playing']]","[['Sega Master System', 'MSX', 'Nintendo Enter...","[['Kyodai Software Marketing, Inc.', 'Nihon Fa...",[nan]
103,tt0454989,videoGame,Ys VI: The Ark of Napishtim,Ys: The Ark of Napishtim,0,2003,\N,\N,"Action,Adventure,Fantasy",7.9,...,19621,[3025-112],Ys: The Ark of Napishtim,0,[Ys VI: The Ark of Napishtim],"[['Nihon Falcom Corp.', 'Konami Software Shang...","[['Action', 'Role-Playing']]","[['PlayStation Portable', 'PlayStation 2', 'PC']]","[['Konami', 'Nihon Falcom Corp.']]","[['ESRB: T', 'ESRB: E10+']]"


### All this is just playing around. I'm using multiple dataframes to test it

The next step is to attempt to fuzzy match the titles and the original titles from the imdb dataset against the giantbomb database

In [26]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

I have 68 games with different originalTitles and primaryTitles, 

Check whether you have multiple entries with the same name and no year in GB. If it does, and 

In [174]:
imdb_duplicates

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
5813970,tt2273075,videoGame,Action 52,Action 52,0,1991,\N,\N,"Action,Family,Fantasy",1.3,141.0,['Action 52']
7523132,tt5058272,videoGame,Action 52,Action 52,0,1993,\N,\N,\N,3.1,28.0,['Action 52']
8286665,tt6777472,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 1, The Search for the ...",0,2009,\N,\N,\N,5.8,15.0,"[""Adam's Venture: Episode 1, The Search for th..."
8302190,tt6813690,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 2, Solomon's Secret",0,2011,\N,\N,\N,7.5,9.0,"[""Adam's Venture: Episode 2, Solomon's Secret""..."
8302192,tt6813694,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 3, Revelations",0,2012,\N,\N,\N,5.6,9.0,"[""Adam's Venture: Origins"", ""Adam's Venture: E..."
...,...,...,...,...,...,...,...,...,...,...,...,...
5475671,tt2132358,videoGame,Zombi,ZombiU,0,2012,\N,\N,"Action,Adventure,Horror",6.3,259.0,"['ZombiU', 'Zombi', 'Killer Freaks from Outer ..."
249553,tt0260590,videoGame,Zoo Keeper,Zoo Keeper,0,1983,\N,\N,Family,7.0,21.0,"['Zoo Keeper', 'King Crab']"
436868,tt0454991,videoGame,Zoo Keeper,Zoo Keeper,0,2004,\N,\N,Action,6.1,22.0,['Zoo Keeper']
301459,tt0314957,videoGame,Zoo Tycoon,Zoo Tycoon,0,2001,\N,\N,Action,7.4,279.0,['Zoo Tycoon']


In [139]:
temp = imdb_duplicates[~(imdb_duplicates["primaryTitle"]== imdb_duplicates["originalTitle"])]

In [134]:
temp = pd.merge(imdb_duplicates, giantbomb_games, left_on=['primaryTitle'], right_on=['name'], how='inner')
temp

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,averageRating,...,akas,franchise_id,name,release_year,aliases,developers,genres_y,platforms,publishers,rating
0,tt6777472,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 1, The Search for the ...",0,2009,\N,\N,\N,5.8,...,"[""Adam's Venture: Episode 1, The Search for th...",[3025-2362],Adam's Venture: Origins,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PC', 'Xbox One', 'PlayStation 4', 'Nintendo...",[['Soedesco']],[nan]
1,tt6813690,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 2, Solomon's Secret",0,2011,\N,\N,\N,7.5,...,"[""Adam's Venture: Episode 2, Solomon's Secret""...",[3025-2362],Adam's Venture: Origins,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PC', 'Xbox One', 'PlayStation 4', 'Nintendo...",[['Soedesco']],[nan]
2,tt6813694,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 3, Revelations",0,2012,\N,\N,\N,5.6,...,"[""Adam's Venture: Origins"", ""Adam's Venture: E...",[3025-2362],Adam's Venture: Origins,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PC', 'Xbox One', 'PlayStation 4', 'Nintendo...",[['Soedesco']],[nan]
3,tt1978381,videoGame,Air Combat,Air Combat,0,1993,\N,\N,Action,5.5,...,['Air Combat'],[3025-16],Air Combat,1992,[Ace Combat],"[['Namco', 'Arsys Software, Inc.']]","[['Action', 'Simulation', 'Flight Simulator']]","[['PlayStation', 'Arcade']]",[['Namco']],[['ESRB: K-A']]
4,tt0429504,videoGame,Air Combat,Ace Combat,0,1995,\N,\N,"Action,Adventure,Sci-Fi",6.6,...,"['Ace Combat', 'Air Combat']",[3025-16],Air Combat,1992,[Ace Combat],"[['Namco', 'Arsys Software, Inc.']]","[['Action', 'Simulation', 'Flight Simulator']]","[['PlayStation', 'Arcade']]",[['Namco']],[['ESRB: K-A']]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,tt4841246,videoGame,You Don't Know Jack,You Don't Know Jack,0,2011,\N,\N,\N,6.4,...,"[""You Don't Know Jack""]",[3025-115],You Don't Know Jack,0,"[YDKJ 2012, YDKJ 2011, YDKJ Vol. 1]","[['Jackbox Games', 'Berkeley Systems'], ['Jack...",[['Trivia/Board Game']],"[['iPhone', 'iPad', 'Android', 'Browser', 'Ouy...","[['Sierra'], ['THQ'], ['Jackbox Games']]","[nan, ['ESRB: T']]"
625,tt0260590,videoGame,Zoo Keeper,Zoo Keeper,0,1983,\N,\N,Family,7.0,...,"['Zoo Keeper', 'King Crab']",[3025-4845],Zoo Keeper,0,[Minna no Soft Series: ZOOO\nZooo\nZoo Puzzle\...,"[['Buddiez, Inc.', 'Success Corp.']]",[['Puzzle']],"[['Game Boy Advance', 'PlayStation 2', 'Ninten...","[['UTV Ignition Entertainment', '505 Games']]","[['PEGI: 3+', 'CERO: A', 'ESRB: E']]"
626,tt0454991,videoGame,Zoo Keeper,Zoo Keeper,0,2004,\N,\N,Action,6.1,...,['Zoo Keeper'],[3025-4845],Zoo Keeper,0,[Minna no Soft Series: ZOOO\nZooo\nZoo Puzzle\...,"[['Buddiez, Inc.', 'Success Corp.']]",[['Puzzle']],"[['Game Boy Advance', 'PlayStation 2', 'Ninten...","[['UTV Ignition Entertainment', '505 Games']]","[['PEGI: 3+', 'CERO: A', 'ESRB: E']]"
627,tt0314957,videoGame,Zoo Tycoon,Zoo Tycoon,0,2001,\N,\N,Action,7.4,...,['Zoo Tycoon'],[3025-107],Zoo Tycoon,0,"[nan, Zoo Tycoon DS]","[['Frontier Developments Ltd.'], ['Blue Fang G...","[['Strategy', 'Simulation'], ['Strategy', 'Sim...","[['Mac', 'Nintendo DS', 'PC'], ['Xbox 360', 'P...","[['Empire Interactive Entertainment', 'Xbox Ga...","[['ESRB: E'], ['ESRB: E', 'PEGI: 3+']]"


In [74]:
giantbomb_games.sort_values("game_id").head()

Unnamed: 0_level_0,franchise_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,"[3025-143, 3025-2573]",Desert Strike: Return to the Gulf,1992,[Desert Strike Advance],"[['Electronic Arts', 'Visual Concepts', 'Budca...","[['Action', ""Shoot 'Em Up""]]","[['Amiga', 'Game Boy', 'Game Boy Advance', 'Ga...","[['Electronic Arts', 'Domark Software', 'Malib...","[['ESRB: K-A', 'ESRB: E']]"
3,[3025-2060],Hyperballoid Deluxe: Survival Pack,0,[nan],[['Kernel Kaput']],"[['Action', 'Block-Breaking']]",[['PC']],"[['Alawar Entertainment, Inc.']]",[nan]
4,[3025-1110],The Chessmaster 2000,1986,[nan],"[['The Software Toolworks', 'Software Country'...","[['Strategy', 'Trivia/Board Game']]","[['Amiga', 'Amstrad CPC', 'Apple II', 'Atari S...","[['The Software Toolworks', 'Software Country'...",[nan]
6,"[3025-128, 3025-130, 3025-1372]",WWE SmackDown! vs. RAW 2007,0,[SVR 2007],"[[""Yuke's Co. Ltd."", 'Digital Hearts Co., Ltd....","[['Action', 'Sports', 'Wrestling']]","[['PlayStation Portable', 'PlayStation 2', 'Xb...",[['THQ']],"[['ESRB: T', 'PEGI: 16+', 'CERO: C']]"
8,[3025-335],Super Spy Hunter,0,[Battle Formula],[['Tokai Engineering']],[['Vehicular Combat']],[['Nintendo Entertainment System']],[['Sunsoft']],[nan]


In [173]:
tempdb = pd.merge(imdb_duplicates, giantbomb_games, left_on=['primaryTitle'], right_on=['name'], how='inner')
tempdb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_x,averageRating,...,akas,franchise_id,name,release_year,aliases,developers,genres_y,platforms,publishers,rating
0,tt6777472,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 1, The Search for the ...",0,2009,\N,\N,\N,5.8,...,"[""Adam's Venture: Episode 1, The Search for th...",[3025-2362],Adam's Venture: Origins,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PC', 'Xbox One', 'PlayStation 4', 'Nintendo...",[['Soedesco']],[nan]
1,tt6813690,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 2, Solomon's Secret",0,2011,\N,\N,\N,7.5,...,"[""Adam's Venture: Episode 2, Solomon's Secret""...",[3025-2362],Adam's Venture: Origins,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PC', 'Xbox One', 'PlayStation 4', 'Nintendo...",[['Soedesco']],[nan]
2,tt6813694,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 3, Revelations",0,2012,\N,\N,\N,5.6,...,"[""Adam's Venture: Origins"", ""Adam's Venture: E...",[3025-2362],Adam's Venture: Origins,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PC', 'Xbox One', 'PlayStation 4', 'Nintendo...",[['Soedesco']],[nan]
3,tt1978381,videoGame,Air Combat,Air Combat,0,1993,\N,\N,Action,5.5,...,['Air Combat'],[3025-16],Air Combat,1992,[Ace Combat],"[['Namco', 'Arsys Software, Inc.']]","[['Action', 'Simulation', 'Flight Simulator']]","[['PlayStation', 'Arcade']]",[['Namco']],[['ESRB: K-A']]
4,tt0429504,videoGame,Air Combat,Ace Combat,0,1995,\N,\N,"Action,Adventure,Sci-Fi",6.6,...,"['Ace Combat', 'Air Combat']",[3025-16],Air Combat,1992,[Ace Combat],"[['Namco', 'Arsys Software, Inc.']]","[['Action', 'Simulation', 'Flight Simulator']]","[['PlayStation', 'Arcade']]",[['Namco']],[['ESRB: K-A']]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,tt4841246,videoGame,You Don't Know Jack,You Don't Know Jack,0,2011,\N,\N,\N,6.4,...,"[""You Don't Know Jack""]",[3025-115],You Don't Know Jack,0,"[YDKJ 2012, YDKJ 2011, YDKJ Vol. 1]","[['Jackbox Games', 'Berkeley Systems'], ['Jack...",[['Trivia/Board Game']],"[['iPhone', 'iPad', 'Android', 'Browser', 'Ouy...","[['Sierra'], ['THQ'], ['Jackbox Games']]","[nan, ['ESRB: T']]"
443,tt0260590,videoGame,Zoo Keeper,Zoo Keeper,0,1983,\N,\N,Family,7.0,...,"['Zoo Keeper', 'King Crab']",[3025-4845],Zoo Keeper,0,[Minna no Soft Series: ZOOO\nZooo\nZoo Puzzle\...,"[['Buddiez, Inc.', 'Success Corp.']]",[['Puzzle']],"[['Game Boy Advance', 'PlayStation 2', 'Ninten...","[['UTV Ignition Entertainment', '505 Games']]","[['PEGI: 3+', 'CERO: A', 'ESRB: E']]"
444,tt0454991,videoGame,Zoo Keeper,Zoo Keeper,0,2004,\N,\N,Action,6.1,...,['Zoo Keeper'],[3025-4845],Zoo Keeper,0,[Minna no Soft Series: ZOOO\nZooo\nZoo Puzzle\...,"[['Buddiez, Inc.', 'Success Corp.']]",[['Puzzle']],"[['Game Boy Advance', 'PlayStation 2', 'Ninten...","[['UTV Ignition Entertainment', '505 Games']]","[['PEGI: 3+', 'CERO: A', 'ESRB: E']]"
445,tt0314957,videoGame,Zoo Tycoon,Zoo Tycoon,0,2001,\N,\N,Action,7.4,...,['Zoo Tycoon'],[3025-107],Zoo Tycoon,0,"[nan, Zoo Tycoon DS]","[['Frontier Developments Ltd.'], ['Blue Fang G...","[['Strategy', 'Simulation'], ['Strategy', 'Sim...","[['Mac', 'Nintendo DS', 'PC'], ['Xbox 360', 'P...","[['Empire Interactive Entertainment', 'Xbox Ga...","[['ESRB: E'], ['ESRB: E', 'PEGI: 3+']]"


In [133]:
for index1 in imdb_duplicates.primaryTitle:
    #rint(index1)
    value = fuzz.token_sort_ratio("bction 2", index1)
    if value > 80:
        print(index1)
        print(value)

Action 52
82
Action 52
82


In [67]:
imdb_duplicates.head(500)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
5813970,tt2273075,videoGame,Action 52,Action 52,0,1991,\N,\N,"Action,Family,Fantasy",1.3,141.0,['Action 52']
7523132,tt5058272,videoGame,Action 52,Action 52,0,1993,\N,\N,\N,3.1,28.0,['Action 52']
8286665,tt6777472,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 1, The Search for the ...",0,2009,\N,\N,\N,5.8,15.0,"[""Adam's Venture: Episode 1, The Search for th..."
8302190,tt6813690,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 2, Solomon's Secret",0,2011,\N,\N,\N,7.5,9.0,"[""Adam's Venture: Episode 2, Solomon's Secret""..."
8302192,tt6813694,videoGame,Adam's Venture: Origins,"Adam's Venture: Episode 3, Revelations",0,2012,\N,\N,\N,5.6,9.0,"[""Adam's Venture: Origins"", ""Adam's Venture: E..."
...,...,...,...,...,...,...,...,...,...,...,...,...
358721,tt0374336,videoGame,Wizard of Wor,Wizard of Wor,0,1981,\N,\N,"Action,Fantasy,Sci-Fi",7.7,34.0,['Wizard of Wor']
291841,tt0304947,videoGame,Wolfenstein 3D,Wolfenstein 3D,0,1992,\N,\N,"Action,Adventure,Sci-Fi",8.0,2242.0,['Wolfenstein 3D']
5558932,tt21632938,videoGame,Wolfenstein 3D,Wolfenstein 3D,0,1995,\N,\N,"Action,Sci-Fi,War",6.7,7.0,['Wolfenstein 3D']
8727036,tt7746896,videoGame,Worms,Worms,0,2007,\N,\N,\N,5.8,7.0,['Worms']


In [57]:
for index1 in imdb_duplicates.primaryTitle:
    #rint(index1)
    value = fuzz.token_sort_ratio("bction 2", index1)
    if value > 80:
        print(index1)
        print(value)

Action 52
82
Action 52
82


In [None]:
imdb_duplicates

In [None]:
imdb_duplicates = imdb_duplicates[imdb_duplicates['primaryTitle'].isin(giantbomb_games['name'])]
imdb_duplicates

Create dictionary with count for each duplicate

In [None]:
imdb_duplicate_counts

In [None]:
imdb_duplicate_counts = imdb_duplicates.groupby('primaryTitle').size()
imdb_duplicate_dict = {k:v for k,v in imdb_duplicate_counts.items() if v > 1}
imdb_duplicate_dict

In [68]:
giantbomb_games[giantbomb_games["name"] == "Worms"]


Unnamed: 0_level_0,franchise_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2007,[3025-133],Worms,1995,[nan],[['Team17 Software Limited']],"[['Action', 'Strategy']]","[['Amiga', 'Game Boy', 'Genesis', 'Super Ninte...","[['Telegames, Inc.', 'Limited Run Games']]",[nan]
20815,[3025-133],Worms,0,[nan],[['Team17 Software Limited']],[['Strategy']],"[['Xbox 360 Games Store', 'PlayStation Network...",[['Team17 Software Limited']],[['ESRB: E10+']]


Note here, we are only going to consider direct matches of a name for brevity's sake. If giantbomb contains no records that match the name of an imdb duplicate, we will assume the duplicates are not part of a franchise.

In [None]:
for key, value in imdb_duplicate_dict.items():
    if giantbomb_games.name[giantbomb_games["name"] == key].count() == 0:
        # Game is not a franchise game, and can be kept in original imdb dataframe
        print(key)
        #imdb_duplicates = imdb_duplicates.drop(index=imdb_duplicates.index[imdb_duplicates['primaryTitle'] == key])
imdb_duplicates.count()

In [None]:
imdb_duplicates.count()

In [None]:
Action 52 is not in giantbomb games so we can keep in our original database as it's a unique game. Therefore we need to drop it from our dups dataframe

This leaves us with 411 duplicates, which exist in the imdb database but we may not be able to accurately match them with the 

WE ARE CREATING A DUPS DATAFRAME WHICH WILL BE ALL THE DUPLICATES WE CAN'T DISTINGUISH BETWENN. IF WE CAN DISTINGUISH, WE DROP FROM THE DUPS DATABASE. IF IMDB GAMES NAME DOES NOT APPEAR IN GIANTBOMB GAMES NAME, DROP FROM DUPLICATES LIST

sO FAR WE HAVE DONE ENOUGH. wE JUST NEED TO DO FUZZY MATCHING ON THE NAMES TO COMPARE.EG. ADAM'S VENTURE EXISTS IN BOTH BUT NEEDS FUZZY MATCHING AGAINST ORIGINAL TITLE

In [None]:
imdb_duplicates = imdb_duplicates[~imdb_duplicates['tconst'].isin(matching_imdb_dups['tconst'])]


In [None]:
for key, value in imdb_duplicate_dict.items():
    if giantbomb_games.name[giantbomb_games["name"] == key].count() == 0:
        # Game is not a franchise game, and can be kept in original imdb dataframe
        imdb_duplicates = imdb_duplicates.drop(index=imdb_duplicates.index[imdb_duplicates['primaryTitle'] == key])
    elif giantbomb_games.name[giantbomb_games["name"] == key].count() != value:
        #matching_ids holds index values of duplicates
        matching_imdb_ids=imdb_duplicates.index[imdb_duplicates['primaryTitle'] == key]
        matching_gb_ids = giantbomb_games.index[giantbomb_games['name'] == key]
        if 
        #for id in matching_imdb_ids:
            #This extracts the year
         #   year =imdb_duplicates.startYear[imdb_duplicates.index[imdb_duplicates.index == id]].item()
          #  if giantbomb_games.name[giantbomb_games["name"] == key] and giantbomb_games.release_year[giantbomb_games["release_year"] == year]:
           #     for 
            #    print("Great success!", year)
           # else:
            #    print("poo")
        
        print(key, value, giantbomb_games.name[giantbomb_games["name"] == key].count())

In [None]:
giantbomb_games[giantbomb_games.duplicated(["name"], keep=False)].count()

In [None]:
import re
temp = imdb_games[imdb_games["primaryTitle"].str.contains("Batman", na=False, flags=re.IGNORECASE, regex=True)]
temp

In [175]:
import re
temp2 = giantbomb_games[giantbomb_games["name"].str.contains("Adam's", na=False, flags=re.IGNORECASE, regex=True)]
temp2

Unnamed: 0_level_0,franchise_id,name,release_year,aliases,developers,genres,platforms,publishers,rating
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
30427,[3025-2362],Adam's Venture: Episode One - The Search for t...,2009,[Adams Venture],[['Vertigo Games']],[['Adventure']],[['PC']],[['Iceberg Interactive']],[nan]
34686,[3025-2362],Adam's Venture II: Solomon's Secret,2011,[nan],[['Vertigo Games']],"[['Adventure', 'Puzzle', 'Action-Adventure']]","[['Mac', 'PC']]",[['Iceberg Interactive']],[nan]
39166,[3025-2362],Adam's Venture Episode 3: Revelations,0,[nan],[['Vertigo Games']],[['Adventure']],[['PC']],[['Iceberg Interactive']],[nan]
45160,[3025-2362],Adam's Venture Chronicles,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PlayStation Network (PS3)', 'PC', 'PlayStat...",[['Playlogic International N.V.']],"[['ESRB: E', 'PEGI: 7+']]"
52907,[3025-2362],Adam's Venture: Origins,0,[nan],[['Vertigo Games']],[['Adventure']],"[['PC', 'Xbox One', 'PlayStation 4', 'Nintendo...",[['Soedesco']],[nan]


In [None]:
duplicate_giantbomb_names = giantbomb_games[giantbomb_games.duplicated(["name", "release_year"], keep=False)]
duplicate_giantbomb_names = duplicate_giantbomb_names.sort_values("name")
duplicate_giantbomb_names

Get list of all game_ids

In [None]:
game_id_list = duplicate_giantbomb_names.index.to_list()

for each item in the list, get the corresponding json file. Clean to extract necessary data (esp. year)

In [None]:
for game_id, api_url in game_api_dict.items():
    filename = "{}.json".format(game_id)
    if not Path(game_dir, filename).is_file():

        

In [None]:
print(len(release_years))

In [None]:
num_nans = id_year_df.isna().sum().sum()
print(num_nans)

In [None]:
# group the data by name and count the number of occurrences
grouped = giantbomb_games.groupby(['name']).size().reset_index(name='count')

# select the rows where count > 1, indicating a duplicate name
duplicates = grouped[grouped['count'] > 1]

# select the rows with duplicate name and different game_id
duplicates = giantbomb_games[giantbomb_games['name'].isin(duplicates['name'])].groupby('name').filter(lambda x: x['game_id'].nunique() > 1)

# print the duplicate rows, sorted by name
if len(duplicates) > 0:
    print("Found {} rows with same name but different game_id:".format(len(duplicates)))
    print(duplicates.sort_values(by=['name', 'game_id']))
else:
    print("No rows with same name but different game_id found.")


In [None]:
temp2 = temp[temp.duplicated(["name", "index"], keep=False)]
temp2 = temp2.sort_values("name")
temp2

In [None]:
a = giantbomb_games[giantbomb_games["name"].str.contains('Sqrxz', na=False, flags=re.IGNORECASE, regex=True)]
a

The first thing I am going to do is check for any titles that appear only once in both dataframes and that have matching titles. With this criteria, we can assume that the games are the same. 

In [None]:
unmatching_rows = imdb_games[~imdb_games['primaryTitle'].isin(giantbomb_games['name'])]

In [None]:
try_this = unmatching_rows[unmatching_rows['originalTitle'].isin(giantbomb_games['name'])]
try_this.count()

In [None]:
import re
temp = giantbomb_games[giantbomb_games["name"].str.contains('Infernal', na=False, flags=re.IGNORECASE, regex=True)]
temp.head(50)

My current plan for the most accurate way to approach this:

First check for duplicate names in both datasets. If there is a duplicate name in either dataset, we need to check whether this is because there are 2 games with the same name. If this is the case, we need a way to distinguish between the 2. I would anticipate year of release could work.

The next step would be comparing names against alternate titles. After this, any that still have no matches, would be subject to fuzzy matching.I'd anticipate at least 75% of the games in the imdb dataset should be contained in giantbomb's database.

In [None]:
imdb_dups = imdb_games[imdb_games.duplicated(['primaryTitle', 'startYear'], keep=False)]
imdb_dups = imdb_dups.sort_values("primaryTitle")
print(imdb_dups)