In [1]:
import pandas as pd
from ast import literal_eval
import os
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LinearRegression,SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

df = pd.read_csv('midi_dataframe.csv', parse_dates=[11])

# Cleaning
df.drop_duplicates(subset='file_name', inplace=True)
df.dropna(subset=['tgdb_genres'], inplace=True)
categories_filter = df.console.isin(['Medleys', 'Piano Only'])
remix_filter = df.title.str.contains('[Rr]emix')

df = df[~categories_filter & ~remix_filter]
df_stripped = df.copy()
df_stripped.title = df.title.str.replace('\(\d+\)', '').str.rstrip()
df_stripped['rank'] = df.title.str.extract('\((\d+)\)', expand=False)
df = df_stripped.sort_values(by='rank', ascending=False).groupby(['brand', 'console', 'game', 'title']).first().reset_index()
df.set_index('file_name', inplace=True)



In [2]:
num_games = len(df.groupby(['tgdb_platform', 'tgdb_gametitle']))
print('There is %d midi files, from %d games, with %d midis matched with tgdb'
      %(len(df),
        num_games,
        (df.tgdb_gametitle.notnull().sum())))
df_features = pd.read_csv('features.csv').set_index('file_name')
print("There is %d midi files with computed features"%(len(df_features)))

print("After removing not valid midi files, we have %d files"%(len(df.join(df_features, how='inner'))))

There is 22179 midi files, from 3243 games, with 22179 midis matched with tgdb
There is 31483 midi files with computed features
After removing not valid midi files, we have 22005 files


In [3]:
df_genres = df.tgdb_genres.map(literal_eval, 'ignore').apply(pd.Series).stack().reset_index(level=1, drop=True).reset_index().set_index('file_name')
df_genres.columns = ['genre']

In [4]:
df_pred = df_genres.join(df_features, how='inner').reset_index()
df_pred.head()

Unnamed: 0,file_name,genre,Prevalence_of_Most_Common_Pitch,Prevalence_of_Most_Common_Pitch_Class,Relative_Prevalence_of_Top_Pitches,Relative_Prevalence_of_Top_Pitch_Classes,Interval_Between_Most_Prevalent_Pitches,Interval_Between_Most_Prevalent_Pitch_Classes,Number_of_Common_Pitches,Pitch_Variety,...,Parallel_Motion,Similar_Motion,Contrary_Motion,Oblique_Motion,Parallel_Fifths,Parallel_Octaves,Dynamic_Range,Variation_of_Dynamics,Variation_of_Dynamics_In_Each_Voice,Average_Note_to_Note_Change_in_Dynamics
0,0004cc74bde3bc82f44afb5f566187c3dbcea9b2.mid,Platform,0.1593,0.2242,0.7222,0.75,5.0,5.0,2.0,28.0,...,0.0,0.0,0.0,0.0,0.0,0.0,63.0,17.65,6.324,13.78
1,0005d95e4aed74b84826f6090fcc8e52bceb5cb3.mid,Action,0.1316,0.2368,0.6,0.7037,5.0,7.0,1.0,20.0,...,0.6522,0.0,0.2609,0.08696,0.0,0.0,47.0,19.18,0.8643,0.3128
2,0005d95e4aed74b84826f6090fcc8e52bceb5cb3.mid,Adventure,0.1316,0.2368,0.6,0.7037,5.0,7.0,1.0,20.0,...,0.6522,0.0,0.2609,0.08696,0.0,0.0,47.0,19.18,0.8643,0.3128
3,0005d95e4aed74b84826f6090fcc8e52bceb5cb3.mid,Sports,0.1316,0.2368,0.6,0.7037,5.0,7.0,1.0,20.0,...,0.6522,0.0,0.2609,0.08696,0.0,0.0,47.0,19.18,0.8643,0.3128
4,0005d95e4aed74b84826f6090fcc8e52bceb5cb3.mid,Strategy,0.1316,0.2368,0.6,0.7037,5.0,7.0,1.0,20.0,...,0.6522,0.0,0.2609,0.08696,0.0,0.0,47.0,19.18,0.8643,0.3128


In [5]:
df_pred = df_pred.replace([np.inf, -np.inf], np.nan).dropna()
df_pred = df_pred[(df_pred['genre'] == 'Action') | (df_pred['genre'] == 'Role-Playing') | (df_pred['genre'] == 'Platform') | (df_pred['genre'] == 'Adventure') | (df_pred['genre'] == 'Shooter')]
X = df_pred.iloc[:, 2:]
y = df_pred['genre']
X.shape, y.shape

((27010, 156), (27010,))

In [6]:
X.head()

Unnamed: 0,Prevalence_of_Most_Common_Pitch,Prevalence_of_Most_Common_Pitch_Class,Relative_Prevalence_of_Top_Pitches,Relative_Prevalence_of_Top_Pitch_Classes,Interval_Between_Most_Prevalent_Pitches,Interval_Between_Most_Prevalent_Pitch_Classes,Number_of_Common_Pitches,Pitch_Variety,Pitch_Class_Variety,Range,...,Parallel_Motion,Similar_Motion,Contrary_Motion,Oblique_Motion,Parallel_Fifths,Parallel_Octaves,Dynamic_Range,Variation_of_Dynamics,Variation_of_Dynamics_In_Each_Voice,Average_Note_to_Note_Change_in_Dynamics
0,0.1593,0.2242,0.7222,0.75,5.0,5.0,2.0,28.0,12.0,53.0,...,0.0,0.0,0.0,0.0,0.0,0.0,63.0,17.65,6.324,13.78
1,0.1316,0.2368,0.6,0.7037,5.0,7.0,1.0,20.0,8.0,36.0,...,0.6522,0.0,0.2609,0.08696,0.0,0.0,47.0,19.18,0.8643,0.3128
2,0.1316,0.2368,0.6,0.7037,5.0,7.0,1.0,20.0,8.0,36.0,...,0.6522,0.0,0.2609,0.08696,0.0,0.0,47.0,19.18,0.8643,0.3128
5,0.1737,0.3215,0.8037,0.9109,19.0,5.0,6.0,18.0,9.0,32.0,...,0.05882,0.4006,0.07283,0.4678,0.02801,0.0,61.0,10.66,9.833,0.1922
6,0.1345,0.2461,0.8797,0.9471,43.0,7.0,4.0,28.0,8.0,67.0,...,0.1076,0.4637,0.05523,0.3735,0.0625,0.007267,125.0,27.31,11.2,2.405


In [7]:
y.head()

0        Platform
1          Action
2       Adventure
5    Role-Playing
6       Adventure
Name: genre, dtype: object

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle = True, random_state=42)
print(str(X_train.shape[0]) + " MIDIs are used for training, " + str(X_test.shape[0]) + " are used for testing.")

16206 MIDIs are used for training, 10804 are used for testing.


In [9]:
X_train.shape, y_train.shape

((16206, 156), (16206,))

In [10]:
X_test.shape, y_test.shape

((10804, 156), (10804,))

In [11]:
rfc = RandomForestClassifier(n_estimators=30, max_features = None, max_depth = 10, random_state = 1)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.30090707145501666

In [12]:
y_factorized, uniques = pd.factorize(y)

X_train, X_test, y_train_factorized, y_test_factorized = train_test_split(X, y_factorized, test_size=0.4, shuffle = True, random_state=42)

print(str(X_train.shape[0]) + " MIDIs are used for training, " + str(X_test.shape[0]) + " are used for testing.")

lrc = LinearRegression()
lrc.fit(X_train, y_train_factorized)
lrc.score(X_test, y_test_factorized)

16206 MIDIs are used for training, 10804 are used for testing.


0.03197849584802759

In [13]:
sgc = SGDClassifier(max_iter=1000)
sgc.fit(X_train, y_train_factorized)
sgc.score(X_test, y_test_factorized)

0.31173639392817476

In [25]:
param_grid = { 
    'n_estimators': [100, 200],
    'max_features': [None],
   # 'max_depth'   : [2, 10, 20]    
}

rfc = RandomForestClassifier(random_state = 1) 
cross_validation_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid)
cross_validation_rfc.fit(X_train, y_train)
cross_validation_rfc.best_params_

{'max_features': None, 'n_estimators': 200}

In [27]:
cross_validation_rfc.best_score_

0.29606318647414537

In [21]:
clf = SVC(kernel = 'rbf', gamma=1)
clf.fit(X_train, y_train_factorized) 
clf.score(X_test, y_test_factorized)

0.1751203258052573

In [17]:
abc = AdaBoostClassifier()
abc.fit(X_train, y_train_factorized)
abc.score(X_test, y_test_factorized)

0.3565346168085894