In [4]:
#!/usr/bin/env python
#Andrea Burns, Machine Learning Project
import os, sys, csv
import time
import scipy
import numpy as np
import _pickle as pickle
import getpass

In [5]:
def term_frequency_mat(codebook_lengh, wordids, wordcnts):
    term_freq = []
    for counter, (ids, cnts) in enumerate(zip(wordids, wordcnts)):
        vec = np.array([0] * codebook_lengh)
        for i, cnt in zip(ids, cnts):
            vec[i] = cnt
        term_freq.append(vec)
        
    feature_space = np.vstack(term_freq)
    return feature_space

In [13]:
def load_term_frequency():

    # ****************************************************************************************************
    # load word counts
    # ****************************************************************************************************
    directory = './'

    with open("codebook_data.p", 'rb') as f:
        loaded_data = pickle.load(f)
        
    code_book = loaded_data[0]
    graphlets = loaded_data[1]
    codebook_lengh = len(code_book)

    uuids, wordids, wordcts = [], [], []
    video_uuids, true_labels = [], []
    num_of_vids = 493
    
    for task in range(1, num_of_vids):
        file = open("vid" + str(task) + ".p",'rb')
        (uuid, label, ids, histogram) = pickle.load(file, encoding='latin1')
        wordids.append(ids)
        wordcts.append(histogram)
        video_uuids.append(uuid)
        true_labels.append(label)
    
    online_data = wordids, wordcts, true_labels, video_uuids
    term_freq = term_frequency_mat(codebook_lengh, wordids, wordcts)
    return term_freq, true_labels, code_book, graphlets #online_data

In [51]:
text_labels = ['microwaving_food','openning_double_doors','printing_interface','printing_take_printout','take_from_fridge','take_paper_towel','take_tea_bag', 'throw_trash','use_kettle','use_water_cooler','washing_up']

In [14]:
class_labels = []
for i in range(0,len(b)):
    if b[i] == 'microwaving_food':
        class_labels.append(0)
    if b[i] == 'openning_double_doors':
        class_labels.append(1)
    if b[i] == 'printing_interface':
        class_labels.append(2)
    if b[i] == 'printing_take_printout':
        class_labels.append(3)
    if b[i] == 'take_from_fridge':
        class_labels.append(4)
    if b[i] == 'take_paper_towel':
        class_labels.append(5)
    if b[i] == 'take_tea_bag':
        class_labels.append(6)
    if b[i] == 'throw_trash':
        class_labels.append(7)
    if b[i] == 'use_kettle':
        class_labels.append(8)
    if b[i] == 'use_water_cooler':
        class_labels.append(9)
    if b[i] == 'washing_up':
        class_labels.append(10)

In [15]:
def high_instance_code_words(term_frequency, code_book, graphlets, low_instance):
    """This essentially takes the feature space created over all events, and removes any
    feature that is not witnessed in a minimum number of observations (low_instance param).
    """
    ## Number of rows with non zero element :
    keep_rows = np.where((term_frequency != 0).sum(axis=0) > low_instance)[0]   # removes code words if they dont appear in a minimum number of videos
    # keep_rows = np.where(term_frequency.sum(axis=0) > low_instance)[0]        # removes code words if they dont appear a minimum number of times across all videos

    ## Sum of the whole column: term_frequency.sum(axis=0) > low_instance
    remove_inds = np.where((term_frequency != 0).sum(axis=0) <= low_instance)[0]

    print("orig feature space: %s. remove: %s. new space: %s." % (len(term_frequency.sum(axis=0)), len(remove_inds), len(keep_rows)))

    #keep only the columns of the feature space which have more than low_instance number of occurances.
    selected_features = term_frequency.T[keep_rows]
    new_term_frequency = selected_features.T
    print("new feature space shape: ", new_term_frequency.shape)

    # # Code Book (1d np array of hash values)
    new_code_book = code_book[keep_rows]
    print("  new code book len: ", len(new_code_book))

    # # Graphlets book (1d np array of igraphs)
    new_graphlets = graphlets[keep_rows]
    print("  new graphlet book len: ", len(new_graphlets))

    print("removed low (%s) instance graphlets" % low_instance)
    print("shape = ", new_term_frequency.shape)

    return new_term_frequency, new_code_book, new_graphlets

In [16]:
(tfreq, tlabels, cbook, glets) = load_term_frequency()

In [17]:
(final_tfreq,final_cbook,final_glets) = high_instance_code_words(tfreq,cbook,glets,1)

orig feature space: 22829. remove: 12640. new space: 10189.
new feature space shape:  (492, 10189)
  new code book len:  10189
  new graphlet book len:  10189
removed low (1) instance graphlets
shape =  (492, 10189)


In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_tfreq, class_labels, test_size=0.2, random_state=42)

In [19]:
from sklearn import svm
classifier = svm.SVC(kernel='linear', C=.1).fit(X_train, y_train)
predicted = classifier.predict(X_test)

In [33]:
pred_train = classifier.predict(X_train)
train_accuracy = 0
for i in range(0,len(pred_train)):
    if pred_train[i] == y_train[i]:
        train_accuracy += 1
train_accuracy = train_accuracy/len(pred_train)
print('Train accuracy: ' + str(train_accuracy))

Train accuracy: 0.9949109414758269


In [20]:
accuracy = 0
for i in range(0,len(predicted)):
    if predicted[i] == y_test[i]:
        accuracy += 1
accuracy = accuracy/len(predicted)
print('Test accuracy: ' + str(accuracy))

0.797979797979798


In [37]:
from sklearn.metrics import confusion_matrix
conf = confusion_matrix(y_test, predicted)

In [38]:
conf

array([[ 4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  7,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 11,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  6,  0,  3,  1,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  4,  4,  0,  9,  0,  0,  1],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  8,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0, 17]])

In [96]:
from sklearn import metrics
vscore = metrics.v_measure_score(y_test, predicted)
hscore = metrics.homogeneity_score(y_test, predicted)
cscore = metrics.completeness_score(y_test, predicted)
mscore = metrics.mutual_info_score(y_test, predicted)
nscore = metrics.normalized_mutual_info_score(y_test, predicted)

print(vscore)
print(hscore)
print(cscore)
print(mscore)
print(nscore)

0.773796709736
0.773916944847
0.77367651198
1.73005097406
0.773796719075


In [55]:
#precision per class is the diagonal value for that row over the sum of rows
precisions = []
for i in range(0,11):
    precisions.append(conf[i][i]/np.sum(conf[i][:]))

4
2
8
4
12
10
9
18
9
5
18


In [52]:
for z in range(0,11):
    print('Precision of class ' + text_labels[z] + ' is: ' + str(precisions[z]))

Precision of class microwaving_food is: 1.0
Precision of class openning_double_doors is: 1.0
Precision of class printing_interface is: 0.875
Precision of class printing_take_printout is: 1.0
Precision of class take_from_fridge is: 0.916666666667
Precision of class take_paper_towel is: 0.6
Precision of class take_tea_bag is: 0.888888888889
Precision of class throw_trash is: 0.5
Precision of class use_kettle is: 0.888888888889
Precision of class use_water_cooler is: 0.6
Precision of class washing_up is: 0.944444444444


In [67]:
#recall per class is the diagonal value for that row over the sum of columns
recalls = []
sum_column = 0
for i in range(0,11):
    for j in range(0,11):
        sum_column += conf[j][i]
    recalls.append(conf[i][i]/sum_column)
    sum_column=0

In [68]:
for z in range(0,11):
    print('Recall of class ' + text_labels[z] + ' is: ' + str(recalls[z]))

Recall of class microwaving_food is: 0.666666666667
Recall of class openning_double_doors is: 1.0
Recall of class printing_interface is: 1.0
Recall of class printing_take_printout is: 0.8
Recall of class take_from_fridge is: 0.6875
Recall of class take_paper_towel is: 0.545454545455
Recall of class take_tea_bag is: 1.0
Recall of class throw_trash is: 0.692307692308
Recall of class use_kettle is: 0.888888888889
Recall of class use_water_cooler is: 1.0
Recall of class washing_up is: 0.894736842105


In [72]:
f_values = []
for i in range(0,11):
    f_values.append(2*precisions[i]*recalls[i]/(precisions[i]+recalls[i]))
    print('F1 value of class ' + text_labels[i] + ' is: ' + str(f_values[i]))

F1 value of class microwaving_food is: 0.8
F1 value of class openning_double_doors is: 1.0
F1 value of class printing_interface is: 0.933333333333
F1 value of class printing_take_printout is: 0.888888888889
F1 value of class take_from_fridge is: 0.785714285714
F1 value of class take_paper_towel is: 0.571428571429
F1 value of class take_tea_bag is: 0.941176470588
F1 value of class throw_trash is: 0.58064516129
F1 value of class use_kettle is: 0.888888888889
F1 value of class use_water_cooler is: 0.75
F1 value of class washing_up is: 0.918918918919


In [98]:
'''
Wanted to attempt unsupervised clustering, similar to what others did. For some reason they used 10 clusters,
but I think 11 make more sense because there are 11 activity classes.
'''

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=11,init='k-means++')
kmeans.fit(final_tfreq)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=11, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [102]:
pred_labels = kmeans.labels_

In [100]:
np.transpose(class_labels)

array([ 2,  2,  9,  9,  9,  2,  9,  0,  9,  9,  0,  9,  2,  3,  0,  9,  6,
        2,  3,  2,  3,  9,  9,  6,  9,  2,  2,  3,  2,  0,  9,  0,  6,  0,
        0,  9,  8,  5,  7,  9,  6,  4,  4,  9,  4,  5, 10,  4,  8,  6,  8,
       10,  4,  9, 10,  7, 10,  8,  5,  7,  7, 10,  5,  7,  8,  5,  7, 10,
        5,  5,  7,  4, 10,  7,  7,  7, 10,  5,  7,  4, 10,  8,  6, 10,  6,
        7,  8, 10,  4,  4,  6,  8,  4,  4,  8, 10,  4,  4,  4,  5, 10,  7,
       10,  4, 10, 10,  7, 10,  8,  6,  8,  4,  4,  5, 10,  6,  8,  5,  4,
        4,  7,  6,  4,  4,  5,  7,  4, 10,  6,  4,  4,  7, 10, 10,  5,  7,
        7,  4,  4,  7,  8,  7, 10,  4,  6,  8,  4,  2,  6,  4,  8,  4,  8,
       10, 10,  6,  8,  4,  4,  7,  7, 10,  2, 10,  0, 10,  5, 10,  7,  8,
        9,  0,  8, 10,  5,  7,  2, 10,  6,  8,  4,  4,  2,  3,  0,  2,  3,
        4, 10,  7, 10,  8, 10,  2,  3,  0,  8, 10,  5,  9,  9,  2,  3,  2,
        1,  3,  3, 10,  0,  9, 10,  5,  0,  7, 10,  7,  8,  5,  0,  9, 10,
        5, 10,  7,  5,  7

In [101]:
'''
This is my attempt to look at the break down of classes within each cluster, but it wasn't really a sucess because,
as you can see in the results from this cell, a lot of the clusters have a majority of 5, and two only have 5. This
ultimately makes it difficult to decipher the correct mapping when there aren't even unique cluster labels when
taking the second most frequent of them all.
'''

kmeans_labels_0 = []
kmeans_labels_1 = []
kmeans_labels_2 = []
kmeans_labels_3 = []
kmeans_labels_4 = []
kmeans_labels_5 = []
kmeans_labels_6 = []
kmeans_labels_7 = []
kmeans_labels_8 = []
kmeans_labels_9 = []
kmeans_labels_10 = []

def freq_of_label(lst):
    for i in range(0,len(np.unique(lst))):
        print('The number of labels of ' + str(np.unique(lst)[i]) + ' is ' + str(lst.count(np.unique(lst)[i])))
        
def most_common(lst):
    return 'Most common element is: ' + str(max(set(lst), key=lst.count))

for i in range(0,len(y_train)):
    if y_train[i] == 0:
        kmeans_labels_0.append(kmeans.labels_[i])
    if y_train[i] == 1:
        kmeans_labels_1.append(kmeans.labels_[i])
    if y_train[i] == 2:
        kmeans_labels_2.append(kmeans.labels_[i])
    if y_train[i] == 3:
        kmeans_labels_3.append(kmeans.labels_[i])
    if y_train[i] == 4:
        kmeans_labels_4.append(kmeans.labels_[i])
    if y_train[i] == 5:
        kmeans_labels_5.append(kmeans.labels_[i])
    if y_train[i] == 6:
        kmeans_labels_6.append(kmeans.labels_[i])
    if y_train[i] == 7:
        kmeans_labels_7.append(kmeans.labels_[i])
    if y_train[i] == 8:
        kmeans_labels_8.append(kmeans.labels_[i])
    if y_train[i] == 9:
        kmeans_labels_9.append(kmeans.labels_[i])
    if y_train[i] == 10:
        kmeans_labels_10.append(kmeans.labels_[i])
            

num_0 = y_train.count(0)
print('Number of training examples in class 0: ' + str(num_0))
print(kmeans_labels_0)
print(most_common(kmeans_labels_0))
freq_of_label(kmeans_labels_0)

print()
num_1 = y_train.count(1)
print('Number of training examples in class 1: ' + str(num_1))
print(kmeans_labels_1)  
print(most_common(kmeans_labels_1))
freq_of_label(kmeans_labels_1)

print()
num_2 = y_train.count(2)
print('Number of training examples in class 2: ' + str(num_2))
print(kmeans_labels_2) 
print(most_common(kmeans_labels_2))
freq_of_label(kmeans_labels_2)

print()
num_3 = y_train.count(3)
print('Number of training examples in class 3: ' + str(num_3))
print(kmeans_labels_3)   
print(most_common(kmeans_labels_3))
freq_of_label(kmeans_labels_3)

print()
num_4 = y_train.count(4)
print('Number of training examples in class 4: ' + str(num_4))
print(kmeans_labels_4) 
print(most_common(kmeans_labels_4))
freq_of_label(kmeans_labels_4)

print()
num_5 = y_train.count(5)
print('Number of training examples in class 5: ' + str(num_5))
print(kmeans_labels_5) 
print(most_common(kmeans_labels_5))
freq_of_label(kmeans_labels_5)

print()
num_6 = y_train.count(6)
print('Number of training examples in class 6: ' + str(num_6))
print(kmeans_labels_6)
print(most_common(kmeans_labels_6))
freq_of_label(kmeans_labels_6)

print()
num_7 = y_train.count(7)
print('Number of training examples in class 7: ' + str(num_7))
print(kmeans_labels_7) 
print(most_common(kmeans_labels_7))
freq_of_label(kmeans_labels_7)

print()
num_8 = y_train.count(8)
print('Number of training examples in class 8: ' + str(num_8))
print(kmeans_labels_8) 
print(most_common(kmeans_labels_8))
freq_of_label(kmeans_labels_8)

print()
num_9 = y_train.count(9)
print('Number of training examples in class 9: ' + str(num_9))
print(kmeans_labels_9)
print(most_common(kmeans_labels_9))
freq_of_label(kmeans_labels_9)

print()
num_10 = y_train.count(10)
print('Number of training examples in class 10: ' + str(num_10))  
print(kmeans_labels_10)   
print(most_common(kmeans_labels_10))
freq_of_label(kmeans_labels_10)


Number of training examples in class 0: 15
[4, 4, 4, 4, 10, 4, 4, 0, 4, 7, 0, 4, 7, 10, 5]
Most common element is: 4
The number of labels of 0 is 2
The number of labels of 4 is 8
The number of labels of 5 is 1
The number of labels of 7 is 2
The number of labels of 10 is 2

Number of training examples in class 1: 9
[4, 10, 10, 4, 10, 3, 4, 4, 5]
Most common element is: 4
The number of labels of 3 is 1
The number of labels of 4 is 4
The number of labels of 5 is 1
The number of labels of 10 is 3

Number of training examples in class 2: 27
[4, 4, 4, 4, 4, 4, 4, 4, 7, 6, 7, 10, 7, 0, 4, 4, 9, 7, 7, 10, 4, 0, 7, 8, 4, 10, 7]
Most common element is: 4
The number of labels of 0 is 2
The number of labels of 4 is 12
The number of labels of 6 is 1
The number of labels of 7 is 7
The number of labels of 8 is 1
The number of labels of 9 is 1
The number of labels of 10 is 3

Number of training examples in class 3: 20
[4, 4, 6, 4, 4, 4, 7, 2, 10, 8, 4, 1, 5, 4, 4, 4, 4, 10, 7, 10]
Most common element 

In [103]:
from sklearn import metrics
'''
these are five different evaluation metrics typically used for k-means or unsupervised learning paradigms
v_measure score is supposed to be the same as the normalized mutual info score but they're slightly different-
not sure why. v measure/normalized mutual info score is the measure of similarity between the two labels of the same
data, but normalized to be between 0 and 1, with 1 being a perfect correlation adn 0 being no mutual information.
mutual_info_score is just not normalized yet.

Homogeneity: if all of its clusters contain only data points which are members of a single class
Completeness: if all the data points that are members of a given class are elements of the same cluster

'''
vscore = metrics.v_measure_score(class_labels, pred_labels)
hscore = metrics.homogeneity_score(class_labels, pred_labels)
cscore = metrics.completeness_score(class_labels, pred_labels)
mscore = metrics.mutual_info_score(class_labels, pred_labels)
nscore = metrics.normalized_mutual_info_score(class_labels, pred_labels)

print(vscore)
print(hscore)
print(cscore)
print(mscore)
print(nscore)

0.414145808575
0.369038243286
0.471815891616
0.829242253373
0.417274619162


In [75]:
np.transpose(y_test)

array([ 7,  7,  7,  6,  7,  2,  9,  5,  5,  1,  9, 10,  4,  7,  8,  7,  4,
        4,  7,  7, 10,  0, 10,  0,  5,  7,  5, 10,  4,  5,  6,  8,  4,  7,
        0,  2,  2,  9, 10,  5,  2,  8, 10, 10,  6,  5,  3,  7, 10,  8,  5,
        6,  6,  9,  8,  6,  6, 10,  7,  6, 10,  7,  8,  4,  2,  5, 10,  7,
        8,  4, 10,  2,  1,  8,  4, 10,  5,  4,  3, 10,  7,  7,  7,  3,  6,
        4,  3,  8,  7, 10,  9,  4, 10,  2,  2,  4, 10, 10,  0])

In [26]:
#just wanted to take a look at what the centers are like
kmeans.cluster_centers_

array([[  2.77555756e-17,  -2.22044605e-16,  -1.11022302e-16, ...,
         -1.73472348e-18,  -1.73472348e-18,  -1.73472348e-18],
       [  4.00000000e-02,   8.00000000e-02,   4.00000000e-02, ...,
         -1.73472348e-18,  -1.73472348e-18,  -1.73472348e-18],
       [  2.77555756e-17,   0.00000000e+00,  -5.55111512e-17, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  2.77555756e-17,   1.25000000e-01,  -5.55111512e-17, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.77555756e-17,   0.00000000e+00,  -5.55111512e-17, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.14285714e-01,   2.92857143e+00,   2.85714286e-01, ...,
         -1.73472348e-18,  -1.73472348e-18,  -1.73472348e-18]])

In [104]:
'''
I'm now going to try MeanShift clustering because it doesn't necessarily assign all data points to a cluster,
and doesn't ask for a specific number of clusters as input. I think this data is probably pretty noisy 
based off the results with kmeans, so I'd like to see how different the results end up being
'''
from sklearn.cluster import MeanShift
mean_cluster = MeanShift()
mean_cluster.fit(final_tfreq)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [106]:
mc_labels = mean_cluster.labels_

In [107]:
mc_vscore = metrics.v_measure_score(class_labels, mc_labels)
mc_hscore = metrics.homogeneity_score(class_labels, mc_labels)
mc_cscore = metrics.completeness_score(class_labels, mc_labels)
mc_mscore = metrics.mutual_info_score(class_labels, mc_labels)
mc_nscore = metrics.normalized_mutual_info_score(class_labels, mc_labels)

print(mc_vscore)
print(mc_hscore)
print(mc_cscore)
print(mc_mscore)
print(mc_nscore)

0.22257772949
0.171868799082
0.31573298559
0.386195395268
0.232947738911


In [111]:
'''
It appears that meanshift actually performs significantly worse because it just doens't account for
a lot of the noise, and a large amount of the data is noisy so this actually doesn't help. I'll try affinity 
propagation clustering below which does account for the noise and assigns each data point to a cluster.
'''
from sklearn.cluster import AffinityPropagation
af_cluster = AffinityPropagation()
af_cluster.fit(final_tfreq)
af_labels = af_cluster.labels_

af_vscore = metrics.v_measure_score(class_labels, af_labels)
af_hscore = metrics.homogeneity_score(class_labels, af_labels)
af_cscore = metrics.completeness_score(class_labels, af_labels)
af_mscore = metrics.mutual_info_score(class_labels, af_labels)
af_nscore = metrics.normalized_mutual_info_score(class_labels, af_labels)

print(af_vscore)
print(af_hscore)
print(af_cscore)
print(af_mscore)
print(af_nscore)
#seems to do a much better job!

0.48200689336
0.583819239325
0.410431642691
1.31186290415
0.489507803238


In [113]:
af_labels

array([ 0, 48,  8,  1,  1, 47,  8,  1,  8,  8,  1,  1,  8, 48,  2,  8,  8,
        8, 48, 48,  8,  8,  3,  8, 58, 48,  4,  8,  8,  1,  8,  1, 14,  1,
        1,  8,  5,  6, 15,  1, 58, 15,  6,  8,  8,  6, 46,  7, 46, 58,  8,
        8,  6,  9, 46,  6, 10, 46,  6, 15,  6,  8,  6, 15,  8,  8,  6,  8,
        6,  8,  6,  6,  8, 57, 15, 15, 11, 15,  6, 17, 12,  8, 58, 13, 14,
       77, 74,  8, 15, 15, 14,  8, 15, 15,  8, 46, 16, 17,  6,  6, 18, 74,
        8, 57,  8, 19, 15, 46, 43, 14, 74, 15,  6,  6, 46, 58,  6,  8, 33,
        8, 15,  8, 15, 15,  6, 15, 15,  8, 58,  6,  6,  6, 46,  8,  6, 15,
        6, 15, 15,  6, 63, 57, 46,  6, 58, 33,  8,  8, 58, 20, 43, 21, 33,
       33, 46, 58, 63, 57, 22,  8, 57, 46, 25,  8,  8,  8,  6, 23,  8, 63,
       24, 24, 46, 46,  6,  6, 25, 46, 58, 46, 15, 45, 48, 48,  2, 25, 48,
        6,  8, 15, 46, 43, 26, 47, 48, 27,  8, 28, 46,  8, 29,  8, 48,  8,
        8,  8, 48, 46,  1,  1, 30, 45,  2,  6, 58, 15, 63, 31,  1,  1, 46,
       15,  8,  6, 15, 15