In [1]:
import sys
import pandas as pd
import numpy as np
import random
import itertools
import logging
from collections import Counter

In [2]:
def generate_ngram(df, n, content='--both'):
    """Build the sliding-window n-grams of a note table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per note. It must contain the column(s) selected
        by ``content``: 'note length' and/or 'pitch'.
    n : int
        Window size, i.e. the number of consecutive rows in each n-gram.
    content : {'--both', '--length', '--pitch'}
        Which columns make up an n-gram. With '--both' each n-gram holds the
        n note lengths followed by the n pitches.

    Returns
    -------
    numpy.ndarray
        Float array with one n-gram per row and shape
        ``(max(len(df) - n + 1, 0), n * number_of_selected_columns)``.
        A table with fewer than ``n`` rows yields an array with zero rows.

    Raises
    ------
    ValueError
        If ``content`` is not one of the supported flags.
    KeyError
        If a selected column is missing from ``df``.
    """
    columns_by_content = {
        '--both': ('note length', 'pitch'),
        '--length': ('note length',),
        '--pitch': ('pitch',),
    }
    if content not in columns_by_content:
        raise ValueError(
            "content must be one of %s, got %r" % (sorted(columns_by_content), content)
        )
    columns = columns_by_content[content]

    # Clamp at zero so a table shorter than the window gives an empty result
    # instead of asking numpy for a negative dimension.
    n_windows = max(len(df) - n + 1, 0)
    ngram = np.zeros((n_windows, n * len(columns)))

    # Pull each column out once; the windows below are plain positional slices.
    column_values = [np.asarray(df[column]) for column in columns]
    for start in range(n_windows):
        ngram[start] = np.concatenate(
            [values[start:start + n] for values in column_values]
        )
    return ngram

In [3]:
# Column labels passed as `names=` when the training CSV is read; the first
# one ('Id') becomes the index there (index_col=0).
column_names = [
    'Id',
    'Performer',
    'Title',
    'Inst.',
    'Style',
    'Year',
    'Tempo',
    'Number of Notes',
]

# Columns the training rows are grouped by when the per-type profiles are built.
output_columns = [
    'Performer',
    'Inst.',
    'Style',
    'Year',
    'Tempo',
]

# Window size handed to get_profile / generate_ngram.
N = 3

# Upper bound on the number of n-grams kept in each per-type profile.
profile_size = 1000

In [4]:
def get_profile(song_df, N):
    """Count how often each N-gram occurs in one song.

    The N-grams come from ``generate_ngram(song_df, N, '--both')``. Each
    array row is converted to a tuple of floats (array rows are not
    hashable) and used as a key of the returned ``collections.Counter``.
    """
    ngram_rows = generate_ngram(song_df, N, '--both')
    return Counter(tuple(map(float, row)) for row in ngram_rows)

def similarity(type_profile, song_profile):
    """Score how closely a song profile matches a type profile.

    Both arguments map n-grams to counts. Every n-gram present in either
    profile contributes ``4 - (2 * (t - s) / (t + s)) ** 2``, where ``t`` and
    ``s`` are its counts in the type and song profile (0 when absent). An
    n-gram with equal counts therefore adds 4 and one found in only a single
    profile adds 0. The return value is the sum of these contributions
    (0 when both profiles are empty).

    Raises ZeroDivisionError if an n-gram has a total count of zero, e.g. an
    explicit 0 entry in one profile and no entry in the other.
    """
    # Type-profile keys first, followed by the keys only the song has.
    song_only_keys = set(song_profile.keys()) - set(type_profile.keys())
    all_keys = list(type_profile.keys()) + list(song_only_keys)

    def contribution(key):
        type_count = type_profile.get(key, 0)
        song_count = song_profile.get(key, 0)
        relative_difference = (2 * (type_count - song_count)) / (type_count + song_count)
        return 4 - relative_difference ** 2

    return sum([contribution(key) for key in all_keys])

In [5]:
# Relative path of the training CSV; it is read further down with
# pd.read_csv(train, sep=';', index_col=0, names=column_names).
train = "./training-data-file-0.csv"

In [6]:
def _load_song_profile(row):
    """Return the N-gram profile of the song identified by the row's index label.

    The note table is read from ./unigram/<index label>.csv.
    """
    song_df = pd.read_csv("./unigram/" + str(row.name) + ".csv", index_col=0)
    return get_profile(song_df, N)


# Training table indexed by its first column, plus one Counter profile per row.
df = pd.read_csv(train, sep=';', index_col=0, names=column_names)
df['profile'] = df.apply(_load_song_profile, axis=1)

In [7]:
# Build one aggregated n-gram profile per (output column, value) pair.
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas >= 2.0.
type_profile_records = []
for output_column in output_columns:  # For each output type (e.g. Performer, Year,…)…
    grouped = df.groupby(output_column)  # …group the rows by that output type…
    for name, group in grouped:
        # …and merge the per-song Counters of the group into a single profile.
        profile = Counter()
        for song_profile in group['profile']:
            profile.update(song_profile)
        print("Before: profile size is", len(profile))
        # Keep the `profile_size` most frequent n-grams. Slicing the items in
        # insertion order would make the kept n-grams depend on the order of
        # the songs rather than on how often each n-gram occurs.
        profile = Counter(dict(profile.most_common(profile_size)))
        print("After: profile size is", len(profile))
        type_profile_records.append({'type': output_column, 'name': name, 'profile': profile})

type_profiles = pd.DataFrame(type_profile_records, columns=['type', 'name', 'profile'])

Before: profile size is 2072
After: profile size is 1000
Before: profile size is 1349
After: profile size is 1000
Before: profile size is 1201
After: profile size is 1000
Before: profile size is 576
After: profile size is 576
Before: profile size is 2634
After: profile size is 1000
Before: profile size is 1917
After: profile size is 1000
Before: profile size is 943
After: profile size is 943
Before: profile size is 694
After: profile size is 694
Before: profile size is 1358
After: profile size is 1000
Before: profile size is 1558
After: profile size is 1000
Before: profile size is 2948
After: profile size is 1000
Before: profile size is 1276
After: profile size is 1000
Before: profile size is 2839
After: profile size is 1000
Before: profile size is 1197
After: profile size is 1000
Before: profile size is 1023
After: profile size is 1000
Before: profile size is 1178
After: profile size is 1000
Before: profile size is 1862
After: profile size is 1000
Before: profile size is 1915
After: p