### Stage 2: App behaviour is normally distributed

    1) The activity vectors are sampled from a Gaussian distribution
    2) The clusters with normal distributions represent samples generated from a hidden Markov process
    3) To test this, we must first observe whether the data can be split into clusters with Gaussian distributions at a significance level of 0.001
    4) This test will be done via the G-Means algorithm [1]
    
[1] Hamerly, G., & Elkan, C. (2003). Learning the k in k-means. In Advances in Neural Information Processing Systems 16 (NIPS 2003).


In [1]:
import plotly 

In [2]:
import numpy as np
import itertools

from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.mixture import GaussianMixture

from utils import *
import pandas as pd

import plotly.plotly as py
import plotly.graph_objs as go

from sklearn.cluster import KMeans
import pickle

In [3]:
# Preprocessing pipeline that builds `data_pd` from the raw CSV export.
# It is disabled here; the pickled object loaded below is used instead
# (presumably the result of running these steps once -- confirm).
# time_percentage = 0.95
# explained_variance = 0.9
# df = pd.read_csv("data/rescuetime_data-ac-min.csv")
# data_pd = Clean_DF(df)
# data_pd.clean_data(time_percentage=time_percentage)
# data_pd.clean_df = data_pd.clean_df.reset_index()
# data_pd.get_pca(explained_variance=explained_variance)
# data_pd.get_day_time()

# Restore the pickled `data_pd` object from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
pickle_path = 'data_pd.pickle'
with open(pickle_path, mode='rb') as pickle_file:
    data_pd = pickle.load(pickle_file)

In [4]:
# Display the column names of `data_pd.clean_df` (as an array) to check what is available.
data_pd.clean_df.columns.values

array(['Date', 'Time Spent (seconds)', 'Activity', 'Category',
       'Productivity', 'Activity Vector', 'Productivity Score', 'Day',
       'Time'], dtype=object)

In [11]:
# Disabled experiment: kernel PCA on the activity vectors (kept for reference).
# from sklearn.decomposition import KernelPCA
# kpca = KernelPCA(n_components=15, kernel='rbf', degree=5)
# kpca.fit(data_pd.activity_vector)
# pca_kernel = kpca.transform(data_pd.activity_vector)

# Fit a 4-component Gaussian mixture on `data_pd.pca_data`.
# EM initialisation is random, so the seed is fixed: without it the cluster
# ids and sizes change on every re-run and downstream plots are not reproducible.
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(data_pd.pca_data)

# Most likely mixture component for every row of `data_pd.pca_data`.
gmm_labels = gmm.predict(data_pd.pca_data)

In [12]:
# 3-D scatter of columns 0, 1 and 2 of `data_pd.pca_data`, coloured by GMM label.
c = gmm_labels
x, y, z = (data_pd.pca_data[:, axis] for axis in (0, 1, 2))

# Hover text per point: "<gmm label>---<Activity entries joined with '-'>".
t = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]
t = [str(gmm_labels[i]) + '---' + name for i, name in enumerate(t)]

marker_style = dict(size=12, color=c, colorscale='Viridis', opacity=0.8)
trace1 = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    text=t,
    mode='markers',
    marker=marker_style,
)
data = [trace1]
# Zero margins so the 3-D scene uses the whole output area.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='GMM visualisation through PCA data')

In [9]:
from sklearn import preprocessing

# Standardise every column of `data_pd.pca_data` (zero mean, unit variance).
X = preprocessing.scale(data_pd.pca_data)

# Fit AND predict on the same standardised matrix. Previously the mixture was
# fitted on the unscaled data but asked to label the scaled data, so the
# labels came from a model evaluated in a different feature space.
# The seed is fixed so the cluster ids are reproducible across re-runs.
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(X)

gmm_labels = gmm.predict(X)


In [10]:
# 3-D scatter of columns 0, 1 and 2 of `data_pd.pca_data`, coloured by the
# current `gmm_labels`. The coordinates are the unscaled PCA values.
c = gmm_labels
pca_points = data_pd.pca_data
x = pca_points[:, 0]
y = pca_points[:, 1]
z = pca_points[:, 2]

# Hover text per point: "<gmm label>---<Activity entries joined with '-'>".
activity_names = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]
t = [
    str(gmm_labels[i]) + '---' + name
    for i, name in enumerate(activity_names)
]

trace1 = go.Scatter3d(
    x=x, y=y, z=z,
    text=t,
    mode='markers',
    marker=dict(size=12, color=c, colorscale='Viridis', opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Normalized')

In [13]:
# Cluster label of every row plotted against its 'Date' value.
label_trace = go.Scatter(x=data_pd.clean_df['Date'], y=gmm_labels)
data = [label_trace]

py.iplot(data)

In [14]:
# Number of samples assigned to each cluster label.
unique, counts = np.unique(gmm_labels, return_counts=True)
{label: size for label, size in zip(unique, counts)}

{0: 3524, 1: 6421, 2: 2652, 3: 4107}

In [173]:
# 3-D scatter of columns 6, 5 and 4 of `data_pd.pca_data` (x, y, z in that
# order), coloured by `gmm_labels` with plotly's default colour scale.
c = gmm_labels
x, y, z = (data_pd.pca_data[:, axis] for axis in (6, 5, 4))

# Hover text per point: "<gmm label>---<Activity entries joined with '-'>".
t = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]
t = [str(gmm_labels[i]) + '---' + name for i, name in enumerate(t)]

marker_style = dict(size=12, color=c, opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='PCA-654 Visualization with G-Means')

In [80]:
# 3-D scatter of columns 0, 1 and 2 of `data_pd.pca_data`, coloured by `gmm_labels2`.
# NOTE(review): `gmm_labels2` is not defined in the visible cells -- confirm
# which cell produces it before relying on Restart & Run All.
c = gmm_labels2
x = data_pd.pca_data[:, 0]
y = data_pd.pca_data[:, 1]
z = data_pd.pca_data[:, 2]

# Hover text per point: "<label>---<Activity entries joined with '-'>".
# The prefix uses the same labels as the marker colour (`c`); it previously
# used `gmm_labels`, so the hover id could disagree with the colour shown.
t = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]
t = [str(c[i]) + '---' + name for i, name in enumerate(t)]

trace1 = go.Scatter3d(
    x=x, y=y, z=z,
    text=t,
    mode='markers',
    marker=dict(size=12, color=c, opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
# NOTE(review): the filename says "PCA-654" although columns 0/1/2 are plotted,
# and other cells use the same filename, so they likely overwrite each other's
# hosted plot -- consider a distinct name.
py.iplot(fig, filename='PCA-654 Visualization with G-Means')

In [86]:
# 3-D scatter of columns 8, 3 and 6 of `data_pd.pca_data` (x, y, z in that
# order), coloured by `gmm_labels2`.
# NOTE(review): `gmm_labels2` is not defined in the visible cells -- confirm
# which cell produces it before relying on Restart & Run All.
c = gmm_labels2
x = data_pd.pca_data[:, 8]
y = data_pd.pca_data[:, 3]
z = data_pd.pca_data[:, 6]

# Hover text per point: "<label>---<Activity entries joined with '-'>".
# The prefix uses the same labels as the marker colour (`c`); it previously
# used `gmm_labels`, so the hover id could disagree with the colour shown.
t = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]
t = [str(c[i]) + '---' + name for i, name in enumerate(t)]

trace1 = go.Scatter3d(
    x=x, y=y, z=z,
    text=t,
    mode='markers',
    marker=dict(size=12, color=c, opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
# NOTE(review): the filename says "PCA-654" although columns 8/3/6 are plotted,
# and other cells use the same filename, so they likely overwrite each other's
# hosted plot -- consider a distinct name.
py.iplot(fig, filename='PCA-654 Visualization with G-Means')

In [25]:
import pickle

# How the cache file was written (disabled; note it dumps `tsne_results_pca`):
# with open('t-sne30_2D.pickle', 'wb') as f:
#     pickle.dump(tsne_results_pca, f)

# Load the stored t-SNE result into `tsne_results_pca_2D`.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
tsne_cache_path = 't-sne30_2D.pickle'
with open(tsne_cache_path, mode='rb') as cache_file:
    tsne_results_pca_2D = pickle.load(cache_file)


In [28]:
# 3-D scatter of the first three columns of `tsne_results_pca`, coloured by GMM label.
# NOTE(review): `tsne_results_pca` is not assigned in the visible cells (only
# `tsne_results_pca_2D` is loaded) -- confirm where it comes from.
c = gmm_labels
x, y, z = (tsne_results_pca[:, axis] for axis in (0, 1, 2))

# Hover text per point: the Activity entries joined with '-' (no label prefix here).
t = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]

marker_style = dict(size=12, color=c, colorscale='Viridis', opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='t-SNE PCA 90% variance Visualization')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~AbdelWahabTurkmani/0 or inside your plot.ly account where it is named 't-SNE PCA 90% variance Visualization'


In [31]:
# 2-D scatter of the first two columns of `tsne_results_pca_2D`, coloured by GMM label.
c = gmm_labels
x = tsne_results_pca_2D[:, 0]
y = tsne_results_pca_2D[:, 1]

# Hover text per point: the Activity entries joined with '-'.
# Built here explicitly; the cell previously used whatever `t` was left in
# the kernel by an earlier cell, which fails on a fresh kernel and can
# silently pick up label-prefixed text from a different plot.
t = ['-'.join(parts) for parts in data_pd.clean_df['Activity'].tolist()]

trace1 = go.Scatter(
    x=x,
    y=y,
    text=t,
    mode='markers',
    marker=dict(size=12, color=c, colorscale='Viridis', opacity=0.8),
)
data = [trace1]
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
# NOTE(review): same filename as the 3-D t-SNE plot, so this upload likely
# replaces it on the plotly account -- consider a distinct name.
py.iplot(fig, filename='t-SNE PCA 90% variance Visualization')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~AbdelWahabTurkmani/0 or inside your plot.ly account where it is named 't-SNE PCA 90% variance Visualization'
