### Stage 2: App behaviour is normally distributed

    1) The activity vectors are sampled from a Gaussian distribution
    2) The clusters with normal distributions represent samples generated from a hidden Markov process
    3) To test this, we must first observe whether the data can be split into clusters with Gaussian distributions at a significance level of 0.001
    4) This test will be done via the G-Means algorithm [1]
    
[1] Hamerly, G., & Elkan, C. (n.d.). Learning the k in k -means.


In [1]:
import plotly 

In [28]:
import numpy as np
import itertools

from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.mixture import GaussianMixture

from utils import *
from gmm_means_mahalanobis2 import *
import pandas as pd

import plotly.plotly as py
import plotly.graph_objs as go

from sklearn.cluster import KMeans
import pickle
plotly.tools.set_credentials_file(username='asturkmani', api_key='bR0Ez8evmzcHY6m4XG7B')

In [3]:
# time_percentage = 0.95
# explained_variance = 0.9
# df = pd.read_csv("data/rescuetime_data-ac-min.csv")
# data_pd = Clean_DF(df)
# data_pd.clean_data(time_percentage=time_percentage)
# data_pd.clean_df = data_pd.clean_df.reset_index()
# data_pd.get_pca(explained_variance=explained_variance)
# data_pd.get_day_time()

with open('data_pd.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    data_pd = pickle.load(f)

In [4]:
data_pd.clean_df.columns.values

array(['Date', 'Time Spent (seconds)', 'Activity', 'Category',
       'Productivity', 'Activity Vector', 'Productivity Score', 'Day',
       'Time'], dtype=object)

In [5]:
# from sklearn.decomposition import KernelPCA
# kpca = KernelPCA(n_components=15, kernel='rbf', degree=5)
# kpca.fit(data_pd.activity_vector)
# pca_kernel = kpca.transform(data_pd.activity_vector)

gmm = GaussianMixture(n_components = 4)
gmm.fit(data_pd.pca_data)

gmm_labels = gmm.predict(data_pd.pca_data)

In [9]:
c = gmm_labels
x = data_pd.pca_data[:,2]
y = data_pd.pca_data[:,3]
z = data_pd.pca_data[:,4]
t = data_pd.clean_df['Activity']
t = data_pd.clean_df['Activity'].tolist()
t = ['-'.join(x) for x in t]
for i in range(0,len(t)):
    t[i] = str(gmm_labels[i]) + '---' + t[i]
    
trace1 = go.Scatter3d(x=x,y=y,z=z,text=t,mode='markers',marker=dict(size=12,color=c, colorscale='Viridis',opacity=0.8))
data = [trace1]
layout = go.Layout(margin=dict(l=0,r=0,b=0,t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM PCA unscaled data')

In [7]:
from sklearn import preprocessing

X = preprocessing.scale(data_pd.pca_data)
gmm_scaled = GaussianMixture(n_components = 4)
gmm_scaled.fit(X)
gmm_scaled_labels = gmm_scaled.predict(X)

In [10]:
c = gmm_scaled_labels
x = data_pd.pca_data[:,2]
y = data_pd.pca_data[:,3]
z = data_pd.pca_data[:,4]
t = data_pd.clean_df['Activity']
t = data_pd.clean_df['Activity'].tolist()
t = ['-'.join(x) for x in t]
for i in range(0,len(t)):
    t[i] = str(gmm_labels[i]) + '---' + t[i]
    
trace1 = go.Scatter3d(x=x,y=y,z=z,text=t,mode='markers',marker=dict(size=12,color=c, colorscale='Viridis',opacity=0.8))
data = [trace1]
layout = go.Layout(margin=dict(l=0,r=0,b=0,t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM PCA Scaled data')

In [11]:
data = [go.Scatter(x=data_pd.clean_df.Date, y=gmm_labels)]

py.iplot(data)

In [12]:
unique, counts = np.unique(gmm_labels, return_counts=True)
dict(zip(unique, counts))

{0: 4695, 1: 4618, 2: 2482, 3: 4909}

In [13]:
with open('tsne_results_pca.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    tsne_results_pca = pickle.load(f)

### Here GMM-Means is able to find the main clusters, but also distinguishes more between the nuances

In [18]:
gmmm_means_unscaled = GMMMeans(critical_value=1.8692, verbose=0, recalculate_points=False)
gmmm_means_unscaled.fit(data_pd.activity_vector)
print("GMM-Means with unscaled data finds: ", len(gmmm_means_unscaled.cluster_centers_)," clusters")

GMM-Means with unscaled data finds:  4  clusters


In [22]:
c = gmmm_means_unscaled.labels_
x = data_pd.pca_data[:,2]
y = data_pd.pca_data[:,3]
z = data_pd.pca_data[:,4]
t = data_pd.clean_df['Activity']
t = data_pd.clean_df['Activity'].tolist()
t = ['-'.join(x) for x in t]
for i in range(0,len(t)):
    t[i] = str(gmm_labels[i]) + '---' + t[i]
    
trace1 = go.Scatter3d(x=x,y=y,z=z,text=t,mode='markers',marker=dict(size=12,color=c,colorscale='Viridis',opacity=0.8))
data = [trace1]
layout = go.Layout(margin=dict(l=0,r=0,b=0,t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means with unscaled data')

In [20]:
gmmm_means_scaled = GMMMeans(critical_value=1.8692, verbose=0, recalculate_points=False)
gmmm_means_scaled.fit(data_pd.activity_vector)
print("GMM-Means with scaled data finds: ", len(gmmm_means_scaled.cluster_centers_)," clusters")

GMM-Means with scaled data finds:  10  clusters


In [49]:
unique, counts = np.unique(gmmm_means_scaled.labels_, return_counts=True)
dict(zip(unique, counts))

{1: 5010, 2: 2835, 3: 8858, 5: 1}

In [24]:
c = gmmm_means_scaled.labels_
x = data_pd.pca_data[:,2]
y = data_pd.pca_data[:,3]
z = data_pd.pca_data[:,4]
t = data_pd.clean_df['Activity']
t = data_pd.clean_df['Activity'].tolist()
t = ['-'.join(x) for x in t]
for i in range(0,len(t)):
    t[i] = str(gmm_labels[i]) + '---' + t[i]
    
trace1 = go.Scatter3d(x=x,y=y,z=z,text=t,mode='markers',marker=dict(size=12,color=c, colorscale='Viridis', opacity=0.8))
data = [trace1]
layout = go.Layout(margin=dict(l=0,r=0,b=0,t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means with scaled data')

In [36]:
gmmpp_means_unscaled = GMMMeans(critical_value=1.1692, verbose=0, recalculate_points=True)
gmmpp_means_unscaled.fit(data_pd.activity_vector)
print("GMM-Means with scaled data finds: ", len(gmmpp_means_unscaled.cluster_centers_)," clusters")


invalid value encountered in less



GMM-Means with scaled data finds:  2  clusters


In [37]:
unique, counts = np.unique(gmmpp_means_unscaled.labels_, return_counts=True)
dict(zip(unique, counts))

{0: 10899, 1: 5805}

In [38]:
c = gmmpp_means_unscaled.labels_
x = data_pd.pca_data[:,2]
y = data_pd.pca_data[:,3]
z = data_pd.pca_data[:,4]
t = data_pd.clean_df['Activity']
t = data_pd.clean_df['Activity'].tolist()
t = ['-'.join(x) for x in t]
for i in range(0,len(t)):
    t[i] = str(gmm_labels[i]) + '---' + t[i]
    
trace1 = go.Scatter3d(x=x,y=y,z=z,text=t,mode='markers',marker=dict(size=12,color=c, colorscale='Viridis', opacity=0.8))
data = [trace1]
layout = go.Layout(margin=dict(l=0,r=0,b=0,t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means++ with unscaled data')

In [47]:
gmmpp_means_scaled = GMMMeans(critical_value=0.5692, verbose=0, recalculate_points=True)
gmmpp_means_scaled.fit(scale(data_pd.activity_vector))
print("GMM-Means with scaled data finds: ", len(gmmpp_means_scaled.cluster_centers_)," clusters")

GMM-Means with scaled data finds:  2  clusters


In [48]:
c = gmmpp_means_scaled.labels_
x = data_pd.pca_data[:,2]
y = data_pd.pca_data[:,3]
z = data_pd.pca_data[:,4]
t = data_pd.clean_df['Activity']
t = data_pd.clean_df['Activity'].tolist()
t = ['-'.join(x) for x in t]
for i in range(0,len(t)):
    t[i] = str(gmm_labels[i]) + '---' + t[i]
    
trace1 = go.Scatter3d(x=x,y=y,z=z,text=t,mode='markers',marker=dict(size=12,color=c, colorscale='Viridis', opacity=0.8))
data = [trace1]
layout = go.Layout(margin=dict(l=0,r=0,b=0,t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM-Means++ with scaled data')