In [1]:
import os
import sys

sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
le = LabelEncoder()

orig_df = pd.read_hdf("../data_extract/features_dataset_2b.h5")

In [2]:
def data_split(dataframe,training,test,cross):
    grouped_df = dataframe.groupby(['pid'],axis=0)
    df_pid = dataframe.pid.unique()

    training_set = []
    test_set = []
    cross_set = []
    
    for pid in df_pid:
        current_df = grouped_df.get_group(pid)
        file_grouped_df = current_df.groupby(['fname'],axis=0)
        df_fnames = current_df.fname.unique()

        training_len = (training*len(df_fnames))/100
        test_len = (test*len(df_fnames))/100
        cross_len = (cross*len(df_fnames))/100
        
        training_keys = df_fnames[0:training_len]
        test_keys = df_fnames[training_len:training_len+test_len]
        cross_keys = df_fnames[training_len+test_len:training_len+test_len+cross_len]

        training_df = pd.concat([file_grouped_df.get_group(key) for key in training_keys])
        test_df = pd.concat([file_grouped_df.get_group(key) for key in test_keys])
        cross_df = pd.concat([file_grouped_df.get_group(key) for key in cross_keys])
        
        training_set.append(training_df)
        test_set.append(training_df)
        cross_set.append(training_df)
        
    training_df = pd.concat(training_set)
    test_df = pd.concat(test_set)
    cross_df = pd.concat(cross_set)    
    return training_df,test_df,cross_df

In [3]:
NUM_PEOPLE = 40
df = orig_df.loc[orig_df['id']<NUM_PEOPLE]

In [6]:
from numpy import zeros
def gfit(chlist, x_train, id_range):
    le = LabelEncoder()
    mm = list()
    covm = list()
    for pid in range(id_range):
        mm.append(np.mean(x_train[chlist].loc[x_train['id']==pid],axis=0))
        covm.append(np.cov(x_train[chlist].loc[x_train['id']==pid], rowvar=False))
    return mm, covm


def predict(lclist, x_test,mm,covm):
    cls = len(mm)
    total = x_test['id'].count()
    dist = zeros((total,cls))
    
    from scipy.stats import multivariate_normal
    for i in range(cls):
        dist[:,i] = multivariate_normal.pdf(x_test[lclist], mean=mm[i], cov=covm[i],allow_singular=True)
    
    le2 = LabelEncoder()
    le2.fit(x_test['fname'])
    
    # totalfs will store the number of unique files
    totalfs = len(le2.classes_)
    
    true_results = 0
    # le2.classes_ contains list of unique files
    for fname in le2.classes_:
        bar = x_test['fname']==fname
        probc = dist[bar,:]
        pidc = x_test.loc[x_test['fname']==fname]['id'].iloc[0]
        # plist = np.prod(probc,axis=0)
        plist = np.sum(np.log(probc), axis = 0)
        if pidc == np.argmax(plist):
            true_results = true_results + 1
    
    print(true_results,totalfs)
    print("Accuracy is {}%".format(true_results*100/totalfs))
    
    
def train_MLE(df):
    chlist = ["chroma{}".format(i) for i in range(12)]
    log_chroma_list = ["lc{}".format(i) for i in range(12)]
    df[log_chroma_list] = np.log(df[chlist])

    lclist = ["mfcc{}".format(i) for i in range(14)] + log_chroma_list
#     lclist = log_chroma_list

    df_train,df_test,df_cross = data_split(df,60,20,20)

    ## Mean, standard deviation Scaling 
    scaler = preprocessing.StandardScaler().fit(df_train[lclist])
    df_train.loc[:,lclist] = scaler.transform(df_train[lclist])
    df_cross.loc[:,lclist] = scaler.transform(df_cross[lclist])
    
    mm, covm = gfit(lclist, df_train, NUM_PEOPLE)
    predict(lclist, df_cross, mm, covm)

In [7]:
train_MLE(df)

(1582, 1608)
Accuracy is 98%
