In [167]:
import os
import sys

sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
le = LabelEncoder()

orig_df = pd.read_hdf("../data_extract/features_dataset_2b.h5")

In [227]:
def data_split(dataframe,training,test,cross):
    grouped_df = dataframe.groupby(['pid'],axis=0)
    df_pid = dataframe.pid.unique()

    training_set = []
    test_set = []
    cross_set = []
    
    for pid in df_pid:
        current_df = grouped_df.get_group(pid)
        file_grouped_df = current_df.groupby(['fname'],axis=0)
        df_fnames = current_df.fname.unique()

        training_len = (training*len(df_fnames))/100
        test_len = (test*len(df_fnames))/100
        cross_len = (cross*len(df_fnames))/100
        
        training_keys = df_fnames[0:training_len]
        test_keys = df_fnames[training_len:training_len+test_len]
        cross_keys = df_fnames[training_len+test_len:training_len+test_len+cross_len]

        training_df = pd.concat([file_grouped_df.get_group(key) for key in training_keys])
        test_df = pd.concat([file_grouped_df.get_group(key) for key in test_keys])
        cross_df = pd.concat([file_grouped_df.get_group(key) for key in cross_keys])
        
        training_set.append(training_df)
        test_set.append(training_df)
        cross_set.append(training_df)
        
    training_df = pd.concat(training_set)
    test_df = pd.concat(test_set)
    cross_df = pd.concat(cross_set)    
    return training_df,test_df,cross_df

In [228]:
NUM_PEOPLE = 10
df = orig_df.loc[orig_df['id']<NUM_PEOPLE]

In [233]:
from numpy import zeros
def gfit(chlist, xtrain, id_range):
    le = LabelEncoder()
    mm = list()
    covm = list()
    for pid in range(id_range):
        mm.append(np.mean(xtrain[chlist].loc[xtrain['id']==pid],axis=0))
        covm.append(np.cov(xtrain[chlist].loc[xtrain['id']==pid], rowvar=False))
    return mm, covm


def predict(lclist, x_test,mm,covm,y_test):
    cls = len(mm)
    total = y_test['id'].count()
    dist = zeros((total,cls))
    from scipy.stats import multivariate_normal
    for i in range(cls):
        dist[:,i] = multivariate_normal.pdf(x_test[lclist], mean=mm[i], cov=covm[i],allow_singular=True)
    le2 = LabelEncoder()
    le2.fit(y_test['fname'])
    totalfs = len(le2.classes_)
    print totalfs
    true_results = 0
    for fname in le2.classes_:
        bar = y_test['fname']==fname
        probc = dist[bar,:]
        probc[probc == 0.] = 1e-500
        pidc = y_test.loc[y_test['fname']==fname]['id'].iloc[0]
        plist = np.prod(probc,axis=0)
        print pidc
        print plist
        if plist[pidc] == max(plist):
            true_results = true_results + 1
    print(true_results,totalfs)
    print("Accuracy is {}%".format(true_results*100/totalfs))
    
def train_MLE(df):
    chlist = ["chroma{}".format(i) for i in range(12)]
    log_chroma_list = ["lc{}".format(i) for i in range(12)]
    df[log_chroma_list] = np.log(df[chlist])

#     lclist = ["mfcc{}".format(i) for i in range(14)] + log_chroma_list
    lclist = log_chroma_list

    df_train,df_test,df_cross = data_split(df,15,8,8)

    x_train = df_train[["id","fname"] + lclist]
#     y_train = df_train[["id","fname"]]

    x_cross = df_cross[["id","fname"] + lclist]
#     y_cross = df_cross[["id","fname"]]
    
    ## Mean, standard deviation Scaling 
    scaler = preprocessing.StandardScaler().fit(x_train[lclist])
    x_train.loc[:,lclist] = scaler.transform(x_train[lclist])
    x_cross.loc[:,lclist] = scaler.transform(x_cross[lclist])
    
    mm, covm = gfit(lclist, x_train, NUM_PEOPLE)
    predict(lclist, x_cross, mm, covm, x_cross)

In [232]:
train_MLE(df)

92
0
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
0
[  2.16213511e-130   3.78136505e-180   1.21105314e-169   8.48512387e-151
   9.36569116e-158   3.05499512e-201   2.52491565e-149   8.63128072e-177
   6.96100018e-163   4.97485396e-154]
0
[  3.90107051e-63   1.17503046e-82   1.24335699e-77   7.80195298e-66
   5.50103971e-71   1.44763742e-92   1.03158615e-66   5.40538467e-78
   1.62631543e-72   1.05051084e-64]
0
[  2.76120190e-114   9.84256245e-143   2.62660961e-142   2.45123565e-130
   8.74226556e-133   4.18866837e-171   3.83835738e-126   2.02867450e-142
   4.38670357e-135   5.53956113e-133]
0
[  1.84756043e-119   6.41002657e-167   1.25437182e-159   3.50559936e-138
   1.33744574e-142   2.65490242e-187   2.00879739e-137   6.63928720e-159
   2.26914310e-153   1.26323843e-143]
0
[  3.72336935e-071   1.73192547e-097   7.34932860e-086   1.35113031e-078
   7.20040059e-077   1.68116930e-107   2.95086179e-079   1.71360333e-096
   1.07886171e-082   3.34326592e-083]
0
[  6.23017760e-097   5.38833795