In [105]:
#!/usr/bin/env python
#Andrea Burns, Machine Learning Project
import os, sys, csv
import time
import scipy
import numpy as np
import _pickle as pickle
import getpass

In [106]:
def term_frequency_mat(codebook_lengh, wordids, wordcnts):
    term_freq = []
    for counter, (ids, cnts) in enumerate(zip(wordids, wordcnts)):
        vec = np.array([0] * codebook_lengh)
        for i, cnt in zip(ids, cnts):
            vec[i] = cnt
        term_freq.append(vec)
        
    feature_space = np.vstack(term_freq)
    return feature_space

In [107]:
def load_term_frequency():

    # ****************************************************************************************************
    # load word counts
    # ****************************************************************************************************
    directory = './'

    with open("codebook_data.p", 'rb') as f:
        loaded_data = pickle.load(f)
        
    code_book = loaded_data[0]
    graphlets = loaded_data[1]
    codebook_lengh = len(code_book)

    uuids, wordids, wordcts = [], [], []
    video_uuids, true_labels = [], []
    num_of_vids = 493
    
    for task in range(1, num_of_vids):
        file = open("vid" + str(task) + ".p",'rb')
        (uuid, label, ids, histogram) = pickle.load(file, encoding='latin1')
        wordids.append(ids)
        wordcts.append(histogram)
        video_uuids.append(uuid)
        true_labels.append(label)
    
    online_data = wordids, wordcts, true_labels, video_uuids
    term_freq = term_frequency_mat(codebook_lengh, wordids, wordcts)
    return term_freq, true_labels, code_book, graphlets #online_data

In [108]:
text_labels = ['microwaving_food','openning_double_doors','printing_interface','printing_take_printout','take_from_fridge','take_paper_towel','take_tea_bag', 'throw_trash','use_kettle','use_water_cooler','washing_up']

In [109]:
def high_instance_code_words(term_frequency, code_book, graphlets, low_instance):
    """This essentially takes the feature space created over all events, and removes any
    feature that is not witnessed in a minimum number of observations (low_instance param).
    """
    ## Number of rows with non zero element :
    keep_rows = np.where((term_frequency != 0).sum(axis=0) > low_instance)[0]   # removes code words if they dont appear in a minimum number of videos
    # keep_rows = np.where(term_frequency.sum(axis=0) > low_instance)[0]        # removes code words if they dont appear a minimum number of times across all videos

    ## Sum of the whole column: term_frequency.sum(axis=0) > low_instance
    remove_inds = np.where((term_frequency != 0).sum(axis=0) <= low_instance)[0]

    print("orig feature space: %s. remove: %s. new space: %s." % (len(term_frequency.sum(axis=0)), len(remove_inds), len(keep_rows)))

    #keep only the columns of the feature space which have more than low_instance number of occurances.
    selected_features = term_frequency.T[keep_rows]
    new_term_frequency = selected_features.T
    print("new feature space shape: ", new_term_frequency.shape)

    # # Code Book (1d np array of hash values)
    new_code_book = code_book[keep_rows]
    print("  new code book len: ", len(new_code_book))

    # # Graphlets book (1d np array of igraphs)
    new_graphlets = graphlets[keep_rows]
    print("  new graphlet book len: ", len(new_graphlets))

    print("removed low (%s) instance graphlets" % low_instance)
    print("shape = ", new_term_frequency.shape)

    return new_term_frequency, new_code_book, new_graphlets

In [110]:
(tfreq, tlabels, cbook, glets) = load_term_frequency()

In [111]:
(final_tfreq,final_cbook,final_glets) = high_instance_code_words(tfreq,cbook,glets,1)

orig feature space: 22829. remove: 12640. new space: 10189.
new feature space shape:  (492, 10189)
  new code book len:  10189
  new graphlet book len:  10189
removed low (1) instance graphlets
shape =  (492, 10189)


In [112]:
np.unique(tlabels)

array(['microwaving_food', 'openning_double_doors', 'printing_interface',
       'printing_take_printout', 'take_from_fridge', 'take_paper_towel',
       'take_tea_bag', 'throw_trash', 'use_kettle', 'use_water_cooler',
       'washing_up'], 
      dtype='<U22')

In [113]:
class_labels = []
for i in range(0,len(tlabels)):
    if tlabels[i] == 'microwaving_food':
        class_labels.append(0)
    if tlabels[i] == 'openning_double_doors':
        class_labels.append(1)
    if tlabels[i] == 'printing_interface':
        class_labels.append(2)
    if tlabels[i] == 'printing_take_printout':
        class_labels.append(3)
    if tlabels[i] == 'take_from_fridge':
        class_labels.append(4)
    if tlabels[i] == 'take_paper_towel':
        class_labels.append(5)
    if tlabels[i] == 'take_tea_bag':
        class_labels.append(6)
    if tlabels[i] == 'throw_trash':
        class_labels.append(7)
    if tlabels[i] == 'use_kettle':
        class_labels.append(8)
    if tlabels[i] == 'use_water_cooler':
        class_labels.append(9)
    if tlabels[i] == 'washing_up':
        class_labels.append(10)

In [114]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_tfreq, class_labels, test_size=0.2, random_state=42)

In [115]:
from sklearn.model_selection import cross_val_score
c_values = [.0001,.001,.01,.1,1]
for i in range(0,5):
    c = c_values[i]
    clf = svm.SVC(kernel='linear', C=c)
    scores = cross_val_score(clf, final_tfreq, class_labels, cv=5)
    print(scores)
    print(sum(scores)/5)
'''
I used cross validation to see the best C value, which is .1, as was correctly in the authors code too
I just wanted to double check because they don't show how they figure out their C value
'''

[ 0.43137255  0.56122449  0.52040816  0.59183673  0.61458333]
0.543885054022
[ 0.60784314  0.76530612  0.74489796  0.79591837  0.8125    ]
0.745293117247
[ 0.71568627  0.80612245  0.78571429  0.7755102   0.83333333]
0.783273309324
[ 0.73529412  0.83673469  0.79591837  0.78571429  0.85416667]
0.801565626251
[ 0.70588235  0.81632653  0.79591837  0.75510204  0.8125    ]
0.777145858343


"\nI used cross validation to see the best C value, which is .1, as was correctly in the authors code too\nI just wanted to double check because they don't show how they figure out their C value\n"

In [116]:
gammas = [.00001,.0001,.001,.01,.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
for i in range(0,len(gammas)):
    clf_rbf = svm.SVC(kernel='rbf', C=.1,gamma=gammas[i])
    scores = cross_val_score(clf_rbf, final_tfreq, class_labels, cv=5)
    #print(scores)
    print(sum(scores)/5)
'''
I used cross validation to see the best C value, which is .1, as was correctly in the authors code too
I just wanted to double check because they don't show how they figure out their C value
'''

0.263675470188
0.288915566226
0.286427070828
0.194965486194
0.172629051621
0.172629051621
0.172629051621
0.172629051621
0.172629051621
0.172629051621
0.172629051621
0.172629051621
0.172629051621
0.172629051621
0.172629051621


"\nI used cross validation to see the best C value, which is .1, as was correctly in the authors code too\nI just wanted to double check because they don't show how they figure out their C value\n"

In [117]:
from sklearn import svm
classifier = svm.SVC(kernel='linear', C=.1).fit(X_train, y_train)
predicted = classifier.predict(X_test)

classifier_nonlinear = svm.SVC(kernel='rbf').fit(X_train, y_train)
predicted_nonlinear = classifier_nonlinear.predict(X_test)

In [118]:
pred_train = classifier.predict(X_train)
pred_train_nonlinear = classifier.predict(X_train)

train_accuracy = 0
train_accuracy_nonlinear = 0
for i in range(0,len(pred_train)):
    if pred_train[i] == y_train[i]:
        train_accuracy += 1
    if pred_train_nonlinear[i] == y_train[i]:
        train_accuracy_nonlinear += 1
train_accuracy = train_accuracy/len(pred_train)
train_accuracy_nonlinear = train_accuracy_nonlinear/len(pred_train_nonlinear)
print('Train accuracy for linear kernel: ' + str(train_accuracy))
print('Train accuracy for RBF kernel: ' + str(train_accuracy_nonlinear))

Train accuracy for linear kernel: 0.9949109414758269
Train accuracy for RBF kernel: 0.9949109414758269


In [119]:
accuracy = 0
accuracy_nonlinear = 0
for i in range(0,len(predicted)):
    if predicted[i] == y_test[i]:
        accuracy += 1
    if predicted_nonlinear[i] == y_test[i]:
        accuracy_nonlinear += 1
accuracy = accuracy/len(predicted)
accuracy_nonlinear = accuracy_nonlinear/len(predicted_nonlinear)
print('Test accuracy for linear kernel: ' + str(accuracy))
print('Test accuracy for RBF kernel: ' + str(accuracy_nonlinear))

Test accuracy for linear kernel: 0.797979797979798
Test accuracy for RBF kernel: 0.5656565656565656


In [120]:
from sklearn.metrics import confusion_matrix
conf1 = confusion_matrix(y_test, predicted)
conf2 = confusion_matrix(y_test, predicted_nonlinear)

In [121]:
conf1

array([[ 4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  7,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 11,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  6,  0,  3,  1,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  4,  4,  0,  9,  0,  0,  1],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  8,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0, 17]])

In [122]:
conf2

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  3,  3,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0, 11,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  6,  0,  0,  2,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  1],
       [ 0,  0,  0,  0, 12,  0,  0,  4,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 18]])

In [123]:
from sklearn import metrics
vscore = metrics.v_measure_score(y_test, predicted)
hscore = metrics.homogeneity_score(y_test, predicted)
cscore = metrics.completeness_score(y_test, predicted)
mscore = metrics.mutual_info_score(y_test, predicted)
nscore = metrics.normalized_mutual_info_score(y_test, predicted)

vscore_nonlin = metrics.v_measure_score(y_test, predicted_nonlinear)
hscore_nonlin = metrics.homogeneity_score(y_test, predicted_nonlinear)
cscore_nonlin = metrics.completeness_score(y_test, predicted_nonlinear)
mscore_nonlin = metrics.mutual_info_score(y_test, predicted_nonlinear)
nscore_nonlin = metrics.normalized_mutual_info_score(y_test, predicted_nonlinear)

print('V measure score for linear kernel: ' + str(vscore))
print('Homogeneity score for linear kernel: ' + str(hscore))
print('Completeness measure score for linear kernel: ' + str(cscore))
print('Mutual information shared score for linear kernel: ' + str(mscore))
print('Normalized mutual information shared  score for linear kernel: ' + str(nscore))
print()
print('V measure score for nonlinear kernel: ' + str(vscore_nonlin))
print('Homogeneity score for nonlinear kernel: ' + str(hscore_nonlin))
print('Completeness measure score for nonlinear kernel: ' + str(cscore_nonlin))
print('Mutual information shared score for nonlinear kernel: ' + str(mscore_nonlin))
print('Normalized mutual information shared  score for nonlinear kernel: ' + str(nscore_nonlin))

V measure score for linear kernel: 0.773796709736
Homogeneity score for linear kernel: 0.773916944847
Completeness measure score for linear kernel: 0.77367651198
Mutual information shared score for linear kernel: 1.73005097406
Normalized mutual information shared  score for linear kernel: 0.773796719075

V measure score for nonlinear kernel: 0.608339048659
Homogeneity score for nonlinear kernel: 0.538942894946
Completeness measure score for nonlinear kernel: 0.698247861612
Mutual information shared score for nonlinear kernel: 1.20477873831
Normalized mutual information shared  score for nonlinear kernel: 0.613445779126


In [124]:
#precision per class is the diagonal value for that row over the sum of rows
precisions = []
precisions_nonlinear = []
for i in range(0,11):
    precisions.append(conf1[i][i]/np.sum(conf1[i][:]))
    precisions_nonlinear.append(conf2[i][i]/np.sum(conf2[i][:]))

In [125]:
for z in range(0,11):
    print('Precision with linear kernel of class ' + text_labels[z] + ' is: ' + str(precisions[z]))
    print('Precision with rbf kernel of class ' + text_labels[z] + ' is: ' + str(precisions_nonlinear[z]))
    print()

Precision with linear kernel of class microwaving_food is: 1.0
Precision with rbf kernel of class microwaving_food is: 0.0

Precision with linear kernel of class openning_double_doors is: 1.0
Precision with rbf kernel of class openning_double_doors is: 0.0

Precision with linear kernel of class printing_interface is: 0.875
Precision with rbf kernel of class printing_interface is: 0.375

Precision with linear kernel of class printing_take_printout is: 1.0
Precision with rbf kernel of class printing_take_printout is: 0.5

Precision with linear kernel of class take_from_fridge is: 0.916666666667
Precision with rbf kernel of class take_from_fridge is: 0.916666666667

Precision with linear kernel of class take_paper_towel is: 0.6
Precision with rbf kernel of class take_paper_towel is: 0.0

Precision with linear kernel of class take_tea_bag is: 0.888888888889
Precision with rbf kernel of class take_tea_bag is: 0.888888888889

Precision with linear kernel of class throw_trash is: 0.5
Precisio

In [126]:
#recall per class is the diagonal value for that row over the sum of columns
recalls = []
sum_column = 0
for i in range(0,11):
    for j in range(0,11):
        sum_column += conf1[j][i]
    recalls.append(conf1[i][i]/sum_column)
    sum_column=0

In [127]:
recalls_nonlinear = []
sum_column2 = 0
for i in range(0,11):
    for j in range(0,11):
        sum_column2 += conf2[j][i]
    if(sum_column2==0):
        recalls_nonlinear.append(0)
    else:
        recalls_nonlinear.append(conf2[i][i]/sum_column2)
    sum_column2=0

In [128]:
for z in range(0,11):
    print('Recall with linear kernel of class ' + text_labels[z] + ' is: ' + str(recalls[z]))
    print('Recall with rbf kernel of class ' + text_labels[z] + ' is: ' + str(recalls_nonlinear[z]))
    print()

Recall with linear kernel of class microwaving_food is: 0.666666666667
Recall with rbf kernel of class microwaving_food is: 0

Recall with linear kernel of class openning_double_doors is: 1.0
Recall with rbf kernel of class openning_double_doors is: 0

Recall with linear kernel of class printing_interface is: 1.0
Recall with rbf kernel of class printing_interface is: 1.0

Recall with linear kernel of class printing_take_printout is: 0.8
Recall with rbf kernel of class printing_take_printout is: 0.4

Recall with linear kernel of class take_from_fridge is: 0.6875
Recall with rbf kernel of class take_from_fridge is: 0.379310344828

Recall with linear kernel of class take_paper_towel is: 0.545454545455
Recall with rbf kernel of class take_paper_towel is: 0

Recall with linear kernel of class take_tea_bag is: 1.0
Recall with rbf kernel of class take_tea_bag is: 1.0

Recall with linear kernel of class throw_trash is: 0.692307692308
Recall with rbf kernel of class throw_trash is: 0.6666666666

In [129]:
f_values = []
f_values_nonlin = []
for i in range(0,11):
    f_values.append(2*precisions[i]*recalls[i]/(precisions[i]+recalls[i]))
    print('F1 value with nonlinear kernel of class ' + text_labels[i] + ' is: ' + str(f_values[i]))
    if(precisions_nonlinear[i]+recalls_nonlinear[i]==0):
            f_values_nonlin.append(0)
    else:
        f_values_nonlin.append(2*precisions_nonlinear[i]*recalls_nonlinear[i]/(precisions_nonlinear[i]+recalls_nonlinear[i]))
    print('F1 value with nonlinear kernel of class ' + text_labels[i] + ' is: ' + str(f_values_nonlin[i]))
    print()

F1 value with nonlinear kernel of class microwaving_food is: 0.8
F1 value with nonlinear kernel of class microwaving_food is: 0

F1 value with nonlinear kernel of class openning_double_doors is: 1.0
F1 value with nonlinear kernel of class openning_double_doors is: 0

F1 value with nonlinear kernel of class printing_interface is: 0.933333333333
F1 value with nonlinear kernel of class printing_interface is: 0.545454545455

F1 value with nonlinear kernel of class printing_take_printout is: 0.888888888889
F1 value with nonlinear kernel of class printing_take_printout is: 0.444444444444

F1 value with nonlinear kernel of class take_from_fridge is: 0.785714285714
F1 value with nonlinear kernel of class take_from_fridge is: 0.536585365854

F1 value with nonlinear kernel of class take_paper_towel is: 0.571428571429
F1 value with nonlinear kernel of class take_paper_towel is: 0

F1 value with nonlinear kernel of class take_tea_bag is: 0.941176470588
F1 value with nonlinear kernel of class take_

In [130]:
'''
Wanted to attempt unsupervised clustering, similar to what others did. For some reason they used 10 clusters,
but I think 11 make more sense because there are 11 activity classes. I'll try it with both 10 and 11. I also 
now rerun the clustering and average the metrics over 10 runs so that they are more telling of the clustering,
just as the authors did.
'''
from sklearn import metrics
from sklearn.cluster import KMeans

vscores = []
hscores = []
cscores = []
mscores = []
nscores = []

for i in range(0,10):
    kmeans = KMeans(n_clusters=11,init='k-means++')
    kmeans.fit(final_tfreq)

    pred_labels = kmeans.labels_
    
    vscore = metrics.v_measure_score(class_labels, pred_labels)
    hscore = metrics.homogeneity_score(class_labels, pred_labels)
    cscore = metrics.completeness_score(class_labels, pred_labels)
    mscore = metrics.mutual_info_score(class_labels, pred_labels)
    nscore = metrics.normalized_mutual_info_score(class_labels, pred_labels)
    
    vscores.append(vscore)
    hscores.append(hscore)
    cscores.append(cscore)
    mscores.append(mscore)
    nscores.append(nscore)
    
'''
these are five different evaluation metrics typically used for k-means or unsupervised learning paradigms
v_measure score is supposed to be the same as the normalized mutual info score but they're slightly different-
not sure why. v measure/normalized mutual info score is the measure of similarity between the two labels of the same
data, but normalized to be between 0 and 1, with 1 being a perfect correlation adn 0 being no mutual information.
mutual_info_score is just not normalized yet.

Homogeneity: if all of its clusters contain only data points which are members of a single class
Completeness: if all the data points that are members of a given class are elements of the same cluster
'''
overall_v = sum(vscores)/len(vscores)
overall_h = sum(hscores)/len(hscores)
overall_c = sum(cscores)/len(cscores)
overall_m = sum(mscores)/len(mscores)
overall_n = sum(nscores)/len(nscores)

print(overall_v)
print(overall_h)
print(overall_c)
print(overall_m)
print(overall_n)

0.39863452183
0.347435298619
0.46895771842
0.780699656933
0.403381017711


In [131]:
'''
I'm now going to try MeanShift clustering because it doesn't necessarily assign all data points to a cluster,
and doesn't ask for a specific number of clusters as input. I think this data is probably pretty noisy 
based off the results with kmeans, so I'd like to see how different the results end up being
'''
from sklearn.cluster import MeanShift
mean_cluster = MeanShift()
mean_cluster.fit(final_tfreq)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [132]:
mc_labels = mean_cluster.labels_

In [133]:
mc_vscore = metrics.v_measure_score(class_labels, mc_labels)
mc_hscore = metrics.homogeneity_score(class_labels, mc_labels)
mc_cscore = metrics.completeness_score(class_labels, mc_labels)
mc_mscore = metrics.mutual_info_score(class_labels, mc_labels)
mc_nscore = metrics.normalized_mutual_info_score(class_labels, mc_labels)

print(mc_vscore)
print(mc_hscore)
print(mc_cscore)
print(mc_mscore)
print(mc_nscore)

0.22257772949
0.171868799082
0.31573298559
0.386195395268
0.232947738911


In [134]:
'''
It appears that meanshift actually performs significantly worse because it just doens't account for
a lot of the noise, and a large amount of the data is noisy so this actually doesn't help. I'll try affinity 
propagation clustering below which does account for the noise and assigns each data point to a cluster.
'''
from sklearn.cluster import AffinityPropagation
af_cluster = AffinityPropagation()
af_cluster.fit(final_tfreq)
af_labels = af_cluster.labels_

af_vscore = metrics.v_measure_score(class_labels, af_labels)
af_hscore = metrics.homogeneity_score(class_labels, af_labels)
af_cscore = metrics.completeness_score(class_labels, af_labels)
af_mscore = metrics.mutual_info_score(class_labels, af_labels)
af_nscore = metrics.normalized_mutual_info_score(class_labels, af_labels)

print(af_vscore)
print(af_hscore)
print(af_cscore)
print(af_mscore)
print(af_nscore)
#seems to do a much better job!

0.48200689336
0.583819239325
0.410431642691
1.31186290415
0.489507803238


In [135]:
#trying with 10 clusters now
vscores_ten = []
hscores_ten = []
cscores_ten = []
mscores_ten = []
nscores_ten = []

for i in range(0,10):
    kmeans = KMeans(n_clusters=10,init='k-means++')
    kmeans.fit(final_tfreq)

    pred_labels = kmeans.labels_
    
    vscore_ten = metrics.v_measure_score(class_labels, pred_labels_ten)
    hscore_ten = metrics.homogeneity_score(class_labels, pred_labels_ten)
    cscore_ten = metrics.completeness_score(class_labels, pred_labels_ten)
    mscore_ten = metrics.mutual_info_score(class_labels, pred_labels_ten)
    nscore_ten = metrics.normalized_mutual_info_score(class_labels, pred_labels_ten)
    
    vscores_ten.append(vscore_ten)
    hscores_ten.append(hscore_ten)
    cscores_ten.append(cscore_ten)
    mscores_ten.append(mscore_ten)
    nscores_ten.append(nscore_ten)
    
overall_v_ten = sum(vscores_ten)/len(vscores_ten)
overall_h_ten = sum(hscores_ten)/len(hscores_ten)
overall_c_ten = sum(cscores_ten)/len(cscores_ten)
overall_m_ten = sum(mscores_ten)/len(mscores_ten)
overall_n_ten = sum(nscores_ten)/len(nscores_ten)

print(overall_v_ten)
print(overall_h_ten)
print(overall_c_ten)
print(overall_m_ten)
print(overall_n_ten)

0.394215876887
0.339945494957
0.46910600636
0.763869797752
0.399337543338
