In [249]:
import pandas as pd

# Load the midi metadata table. The release date column is selected by name
# rather than by position, so the load does not silently parse a different
# column as dates if the CSV layout ever changes.
df = pd.read_csv('midi_dataframe.csv', parse_dates=['tgdb_releasedate'])
num_midis_before = len(df)

# A game is identified by its (brand, console, game) triple.
num_games = df.groupby(['brand', 'console', 'game']).ngroups
# Rows with a non-null tgdb title are counted as matched with tgdb.
num_tgdb_matched = df.tgdb_gametitle.notnull().sum()

print('There is %d midi files, from %d games, with %d midis matched with tgdb'
      %(num_midis_before, num_games, num_tgdb_matched))
df.head()

There is 31685 midi files, from 3918 games, with 31249 midis matched with tgdb


Unnamed: 0,brand,console,game,title,file_name,tgdb_genres,tgdb_id,tgdb_developer,tgdb_publisher,tgdb_platform,tgdb_gametitle,tgdb_releasedate,tgdb_players,tgdb_rating,tgdb_esrb,tgdb_overview
0,Nintendo,NES,10-Yard Fight,Kick Off,b5da1581fd2b425122446a638363a28296aee817.mid,['Sports'],317.0,Nintendo,Nintendo,Nintendo Entertainment System (NES),10-Yard Fight,1985-10-18,2,4.5714,E - Everyone,The game is viewed in a top-down perspective a...
1,Miscellaneous,Arcade,1941: Counter Attack,Krote Armor Stage,efe29bdfc97d46a327467ea5d6e36bff6edc4223.mid,['Shooter'],2434.0,Capcom,Capcom,Arcade,1941: Counter Attack,1990-02-01,2,6.3333,,The goal is to shoot down enemy airplanes and ...
2,Miscellaneous,Arcade,1941: Counter Attack,Level 6 Section 1,a356a1fd4960e4faa875c13bf7d03cfef3255cfd.mid,['Shooter'],2434.0,Capcom,Capcom,Arcade,1941: Counter Attack,1990-02-01,2,6.3333,,The goal is to shoot down enemy airplanes and ...
3,Computer Systems,Commodore 64,1942,Main Theme,52e23ff501c107b2055d543d8249e96f43eab97e.mid,['Shooter'],6434.0,,Elite,Commodore 64,1942,NaT,1,7.0,,1942 is set in the Pacific theater of World Wa...
4,Nintendo,NES,1943,Assault on Surface Forces B,26634701e919c25fddd82b00d63d75e1dedb5326.mid,['Shooter'],1018.0,Capcom,Capcom,Nintendo Entertainment System (NES),1943: The Battle of Midway,1987-06-01,2,4.5,E - Everyone,The game is set in the Pacific theater of Worl...


We first check that every midi file is present only once; if it is not, we drop the duplicate rows.

In [250]:
# Flag every repeated occurrence of a file name (the first occurrence is not
# flagged), count the flagged rows, then keep one row per file name.
duplicate_mask = df.duplicated(subset='file_name')
num_dup = duplicate_mask.sum()
df = df.drop_duplicates(subset='file_name')
print('There was %d duplicated midi files, %d midis left'%(num_dup, len(df)))

There was 2 duplicated midi files, 31683 midis left


Since we are interested in the genre, we only keep midis that have one.

In [251]:
# Count the rows whose genre is missing, then keep only rows that have one.
missing_genre_mask = df['tgdb_genres'].isnull()
num_genres_na = missing_genre_mask.sum()
df = df.dropna(subset=['tgdb_genres'])
print("We removed %d midis, %d midis left"%(num_genres_na, len(df)))

We removed 436 midis, 31247 midis left


Then, there are some categories, such as Medleys or Piano Only, that are not interesting.

There is also a big "remix" scene on vgmusic, so we also remove midis whose title contains "remix".

In [252]:
# Rows filed under the pseudo-consoles 'Medleys' and 'Piano Only'.
categories_filter = df.console.isin(['Medleys', 'Piano Only'])
# Rows whose title mentions 'remix' or 'Remix'. na=False treats a missing title
# as "not a remix"; without it the mask would contain NaN and could not be
# negated below.
remix_filter = df.title.str.contains('[Rr]emix', na=False)

# Keep rows matched by neither filter.
df = df[~(categories_filter | remix_filter)]
# The two counts are computed independently, so a row matched by both filters
# is counted in both messages.
print('We removed %d midis from Medleys and Piano categories'%categories_filter.sum())
print('We removed %d midis containing "remix" in their title'%remix_filter.sum())
print('%d midis left'%len(df))

We removed 872 midis from Medleys and Piano categories
We removed 1860 midis containing "remix" in their title
28537 midis left


There often exist several versions of the same midi file, most of the time denoted by 'title (1)', 'title (2)', etc.

We also remove those, keeping only the version with the highest number; if several midis share the same title without a number, we arbitrarily keep one of them.

In [253]:
num_midis_before = len(df)

# Version marker such as '(1)' or '(2)'; the capture group holds the number.
# Raw string so the backslashes reach the regex engine unchanged.
version_pattern = r'\((\d+)\)'

df_stripped = df.copy()
# Remove the version marker and trailing whitespace so that all versions of a
# midi share one title. regex=True is explicit because recent pandas versions
# treat the pattern as a literal string by default.
df_stripped['title'] = (df['title']
                        .str.replace(version_pattern, '', regex=True)
                        .str.rstrip())
# Version number as a numeric value (NaN when the title has no marker), so that
# sorting compares 10 > 9 instead of comparing the strings '10' < '9'.
df_stripped['rank'] = pd.to_numeric(
    df['title'].str.extract(version_pattern, expand=False))

# Highest version first, unnumbered titles last; the stable sort keeps ties in
# their existing order so the kept row is deterministic.
# Note: first() returns the first non-null value of each column within a group.
df = (df_stripped
      .sort_values(by='rank', ascending=False, kind='mergesort')
      .groupby(['brand', 'console', 'game', 'title'])
      .first()
      .reset_index())
print("We removed %d midis, %d midis left"%(num_midis_before-len(df), len(df)))

We removed 6358 midis, 22179 midis left
