## Goal

#### This thesis will aim to examine temporal app-usage data and achieve two goals:       
    1) Model the data by clustering groups of activities into states        
    2) Make predictions on the next state given the current state  

To do this, a baseline model will first be created; improved models will then be built that aim to surpass the baseline's quality of modelling and prediction.  
        
##### Baseline:      
    1) Dimensionality reduction with PCA: Done
    2) HMM for next-state/app usage prediction
    
    
##### Advanced models:
    1) RNN
    2) Sequence-to-sequence RNN
    3) S-to-S RNN with attention
    4) VAE for dimensionality reduction followed by RNN for prediction

### Get cleaned data

In [1]:
from importlib import reload
from utils import *
import pandas as pd

# Share of total usage time that the retained apps must account for
# (the summary cell reports it as "apps that consume X % of all users time").
time_percentage = 0.9
# Share of variance that the retained principal components must explain
# (the summary cell reports it as "components that explain X % of the data").
explained_variance = 0.9
# Input CSV (RescueTime data, judging by the file name); the path is relative
# to the notebook's working directory.
df = pd.read_csv("data/rescuetime_data-ac-min.csv")
# Clean_DF is not defined in this notebook -- presumably it comes from `from utils import *`.
data_pd = Clean_DF(df)
# The calls below work through state stored on data_pd and appear to be
# order-dependent; later cells read data_pd.popular_apps, data_pd.clean_df
# and data_pd.pca_data.
data_pd.clean_data(time_percentage=time_percentage)
# Swap the current index for a fresh 0..n-1 range; the old index is moved into the columns.
data_pd.clean_df = data_pd.clean_df.reset_index()
data_pd.get_pca(explained_variance=explained_variance)
# NOTE(review): presumably derives the 'Day' and 'Time' columns shown in the
# summary output -- confirm against utils.
data_pd.get_day_time()

In [5]:
# Report the outcome of the cleaning / PCA step.  Every report is printed with
# a trailing '\n' argument, so consecutive reports are separated by a blank line.
cleaned = data_pd.clean_df
summary_reports = (
    ("Dataset size:", cleaned.shape),
    ("Number of apps that consume", time_percentage*100, "% of all users time: ", len(data_pd.popular_apps)),
    ("Cleaned dataset columns:", '\n', cleaned.columns.values),
    ("Number of components that explain", explained_variance*100, "% of the data: ", data_pd.pca_data.shape[1]),
)
for report in summary_reports:
    print(*report, '\n')

Dataset size: (10983, 9) 

Number of apps that consume 90.0 % of all users time:  91 

Cleaned dataset columns: 
 ['Date' 'Time Spent (seconds)' 'Activity' 'Category' 'Productivity'
 'Activity Vector' 'Productivity Score' 'Day' 'Time'] 

Number of components that explain 90.0 % of the data:  31 



### Compute Principal Components and visualize top-3 modes

In [6]:
import plotly.plotly as py
import plotly.graph_objs as go

# Same get_pca call as in the data-loading cell, repeated before plotting.
data_pd.get_pca(explained_variance=explained_variance)

# The first three PCA columns become the 3-D coordinates.
x, y, z = (data_pd.pca_data[:, axis] for axis in range(3))
# Marker colour follows the 'Productivity Score' column.
c = data_pd.clean_df['Productivity Score']
# Hover text: the entries of each 'Activity' value joined with '-'.
t = ['-'.join(activity) for activity in data_pd.clean_df['Activity'].tolist()]

marker_style = dict(size=12, color=c, colorscale='RdYlGn', opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
# Zero margins on all four sides of the figure.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='PCA-3 Visualization')

### Visualize entire PCA dimensional space using t-SNE

In [7]:
from sklearn.manifold import TSNE
# Embed the PCA-reduced data into three dimensions with t-SNE for plotting.
# t-SNE is stochastic: without a seed every kernel restart yields a different
# embedding, so the figures built from `tsne_results30` could not be
# reproduced.  The seed value matches the one the G-Means cell already uses.
tsne30 = TSNE(n_components=3, verbose=0, perplexity=30, n_iter=5000, random_state=1010)
tsne_results30 = tsne30.fit_transform(data_pd.pca_data)

In [8]:
# The three t-SNE output columns become the 3-D coordinates.
x, y, z = (tsne_results30[:, axis] for axis in range(3))
# Marker colour follows the 'Productivity Score' column.
c = data_pd.clean_df['Productivity Score']
# Hover text: the entries of each 'Activity' value joined with '-'.
t = ['-'.join(activity) for activity in data_pd.clean_df['Activity'].tolist()]

marker_style = dict(size=12, color=c, colorscale='RdYlGn', opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
# Zero margins on all four sides of the figure.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='t-SNE PCA 90% variance Visualization')

## Clustering data in PCA dimensions

### Clustering data using the elbow method

    1) Run k-means once for each candidate number of clusters k (here k = 2, ..., 25)
    2) For each k, record the error (inertia) of the fitted model
    3) Plot the errors against k and pick the k that sits at the elbow of the error curve


In [9]:
from sklearn.cluster import KMeans
# Elbow method: fit k-means for every candidate cluster count and keep the
# inertia reported by each fitted model.
cluster_counts = range(2, 26)
# One slot per fitted model.  The previous fixed size of 25 left a never-written
# trailing 0.0 (only 24 models are fitted), which showed up in the elbow plot
# as a bogus zero-inertia point.
inertia = np.zeros(len(cluster_counts))
for slot, n_clusters in enumerate(cluster_counts):
    # Seeded so the curve is reproducible; k-means initialisation is random.
    # The seed value matches the one the G-Means cell already uses.
    kmeans = KMeans(n_clusters=n_clusters, random_state=1010)
    kmeans.fit(data_pd.pca_data)
    inertia[slot] = kmeans.inertia_

In [10]:
# Elbow plot.  The inertia array stores the model for k clusters at position
# k - 2 (k = 2..25), so the x axis must be the cluster count, not the array
# index; plotting the index shifts the elbow by two.
first_k, last_k = 2, 25
cluster_counts = list(range(first_k, last_k + 1))
# Slice to the number of fitted models so an unfilled trailing slot in
# `inertia` is not drawn as a zero-inertia point.
trace = go.Scatter(x=cluster_counts, y=inertia[:len(cluster_counts)], mode='markers')
data = [trace]
elbow_layout = go.Layout(xaxis=dict(title='Number of clusters (k)'),
                         yaxis=dict(title='Inertia'))
elbow_fig = go.Figure(data=data, layout=elbow_layout)
py.iplot(elbow_fig, filename='K-Means inerta')

### Visualize top-3 PCA data with k-Means labels

In [16]:
# Number of clusters to fit; named once so the model and the size report agree.
n_clusters = 9
# Seeded so cluster ids stay stable across re-runs: the plots below colour and
# label points by these ids.  The seed value matches the one the G-Means cell
# already uses.
kmeans = KMeans(n_clusters=n_clusters, random_state=1010)
kmeans.fit(data_pd.pca_data)
print(set(kmeans.labels_))
# Cluster sizes in a single pass over the labels (np is already relied on by
# the elbow cell).  minlength guarantees one line per cluster id.
for cluster_size in np.bincount(kmeans.labels_, minlength=n_clusters):
    print(cluster_size)

{0, 1, 2, 3, 4, 5, 6, 7, 8}
3113
4854
276
507
993
373
258
313
296


In [17]:
# Marker colour is the k-means cluster id of each row.
c = kmeans.labels_
# The first three PCA columns become the 3-D coordinates.
x, y, z = (data_pd.pca_data[:, axis] for axis in range(3))
# Hover text: "<cluster id>---<entries of the 'Activity' value joined with '-'>".
t = [
    str(label) + '---' + '-'.join(activity)
    for label, activity in zip(kmeans.labels_, data_pd.clean_df['Activity'].tolist())
]

marker_style = dict(size=12, color=c, colorscale='Viridis', opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
# Zero margins on all four sides of the figure.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='PCA-3 Visualization with k-Means')

### Visualize PCA data using t-SNE  with k-Means labels

In [18]:
# Marker colour is the k-means cluster id of each row.
c = kmeans.labels_
# The three t-SNE output columns become the 3-D coordinates.
x, y, z = (tsne_results30[:, axis] for axis in range(3))
# Hover text: "<cluster id>---<entries of the 'Activity' value joined with '-'>".
t = [
    str(label) + '---' + '-'.join(activity)
    for label, activity in zip(kmeans.labels_, data_pd.clean_df['Activity'].tolist())
]

marker_style = dict(size=12, color=c, colorscale='Viridis', opacity=0.8)
trace1 = go.Scatter3d(x=x, y=y, z=z, text=t, mode='markers', marker=marker_style)
data = [trace1]
# Zero margins on all four sides of the figure.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='t-SNE PCA with k-Means')


## G-Means clustering

In [2]:
from gmeans import *
from scipy.cluster.vq import whiten
# Whitened copy of the PCA features (scipy.cluster.vq.whiten).
# NOTE(review): `whitened` is never used below -- the model is fitted on the
# un-whitened `data_pd.pca_data`.  Confirm whether fitting on `whitened` was
# intended.
whitened = whiten(data_pd.pca_data)

# G-Means settings, gathered in one place.
# NOTE(review): the meaning of min_obs / max_depth / strictness is defined by
# the local `gmeans` module, which is not visible here.
gmeans_params = dict(min_obs=100, max_depth=500, random_state=1010, strictness=3)
gmeans = GMeans(**gmeans_params)
gmeans.fit(data_pd.pca_data)
# Bare expression so the notebook displays the fitted labels.
gmeans.labels_

  w = (y - xbar) / s
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0
  cond2 = cond0 & (x <= self.a)


array([65142166818, 65142166818, 71689898406, ..., 87124497404,
       12376507826, 22562532922])

In [None]:
# print(data_pd.clean_df['Day'], data_pd.clean_df['Time'], pd.Series(c))