# Load Data

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import random

from sklearn.datasets import load_digits
from sklearn.manifold import SpectralEmbedding, TSNE, MDS
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

# Notebook-wide pandas display settings: allow up to 200 characters per
# cell, show at most 20 columns, and render floats with three decimals.
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 20
pd.options.display.float_format = '{:.3f}'.format

In [2]:
# Read the vectorized usage log.
usage = pd.read_csv('dataset/vectorized_usage.csv')

# Keep only the rows with uid 942, then discard the uid column and the
# leftover CSV index column. Filtering and dropping in one chained expression
# builds a new frame instead of mutating a filtered slice in place, which can
# trigger pandas' SettingWithCopyWarning.
usage = usage[usage['uid'] == 942].drop(columns=['uid', 'Unnamed: 0'])

# Remove the dominating social network app usage
# NOTE(review): assumes app_cat == 4 is the social-network category - confirm.
usage_no_social = usage[usage['app_cat'] != 4]

# One-hot encode the app category. The prefix gives the indicator columns
# string labels ('app_cat_<value>'); without it get_dummies labels them with
# the raw numeric category values, and scikit-learn >= 1.2 raises a TypeError
# when a DataFrame mixes string and non-string column labels.
usage = pd.concat(
    [usage, pd.get_dummies(usage['app_cat'], prefix='app_cat')],
    axis=1,
).drop(columns=['app_id', 'app_cat'])

usage_no_social = pd.concat(
    [usage_no_social, pd.get_dummies(usage_no_social['app_cat'], prefix='app_cat')],
    axis=1,
).drop(columns=['app_id', 'app_cat'])



# Read the second usage file (rm_vectorized_usage.csv) and discard the uid
# column together with the leftover CSV index column.
rm_usage = pd.read_csv('dataset/rm_vectorized_usage.csv').drop(
    ['uid', 'Unnamed: 0'], axis=1
)
# Variant without the rows whose app category is 4 (the dominating
# social network app usage).
rm_usage_no_social = rm_usage.loc[rm_usage['app_cat'] != 4]


# Report the (rows, columns) shape of every table prepared above.
print("Checking the shape of of the files", "-"*30, sep="\n")
print("usage:", usage.shape)
# The doubled newline reproduces the blank separator line between the groups.
print("usage_no_social", usage_no_social.shape, end="\n\n")
print("rm_usage:", rm_usage.shape)
print("rm_usage_no_social", rm_usage_no_social.shape)

Checking the shape of of the files
------------------------------
usage: (1098748, 45)
usage_no_social (577817, 44)

rm_usage: (4171949, 27)
rm_usage_no_social (2316044, 27)


# PCA

In [3]:
# Standardize the three tables, each with its own freshly fitted scaler.
# The names are rebound to the scaler output, so the pre-scaling tables are
# no longer reachable under these names in the cells below.
from sklearn.preprocessing import StandardScaler

usage = StandardScaler().fit(usage).transform(usage)
rm_usage = StandardScaler().fit(rm_usage).transform(rm_usage)
usage_no_social = StandardScaler().fit(usage_no_social).transform(usage_no_social)

In [None]:
# Wrap the scaled usage matrix in a DataFrame with generic column labels
# ('feature0', 'feature1', ...) so it can be inspected as a table.
feat_cols = [f'feature{idx}' for idx in range(usage.shape[1])]
# Same style of labels, sized for rm_usage.
feat_cols2 = [f'feature{idx}' for idx in range(rm_usage.shape[1])]
normalized_usage_table = pd.DataFrame(data=usage, columns=feat_cols)

normalized_usage_table

In [None]:
def show_PCA(df, sample, n_components=15, seed=None):
    """Fit a PCA on ``df`` and plot its explained variance and a 2-D projection.

    Prints the per-component explained-variance ratios, then shows a figure
    with two panels: the cumulative explained-variance ratio (left) and a
    scatter of randomly chosen rows projected on the first two principal
    components (right).

    Parameters
    ----------
    df : 2-D array-like with a ``shape`` attribute
        Data handed to ``PCA.fit_transform``.
    sample : int
        Number of rows to draw for the scatter plot. Values above the number
        of rows are reduced to the number of rows; negative values become 0.
    n_components : int, default 15
        Requested number of principal components. An integer larger than
        what the data supports is reduced to ``min(df.shape)``.
    seed : int or None, default None
        When given, seeds both the PCA solver and the row sampling so that
        repeated calls produce the same figure. ``None`` leaves both
        unseeded, as before.

    Returns
    -------
    None
        The function only prints and plots.
    """
    n_rows, n_features = df.shape

    # PCA rejects more components than min(n_samples, n_features); clamp so
    # narrow or short inputs do not fail on the default of 15.
    if isinstance(n_components, int):
        n_components = min(n_components, n_rows, n_features)

    # Run PCA on the supplied data
    pca = PCA(n_components=n_components, random_state=seed)
    transformed = pca.fit_transform(df)

    print("Finish PCA")
    print("-"*30)
    print(pca.explained_variance_ratio_)

    # Get the cumulative values of the explained variance
    cumulative = np.cumsum(pca.explained_variance_ratio_)

    # Create the subplot
    fig, axs = plt.subplots(1, 2, figsize=(16,3))

    # Plot the cumulative function (x axis is the zero-based component index)
    axs[0].plot(range(len(cumulative)), cumulative)
    axs[0].set_xlabel('Component index')
    axs[0].set_ylabel('Cumulative explained variance ratio')

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the number of rows.
    sample = max(0, min(sample, n_rows))
    sampler = random if seed is None else random.Random(seed)
    sample_usage = sampler.sample(range(n_rows), sample)

    # Plot the sampled rows after dimension reduction
    if transformed.shape[1] >= 2:
        axs[1].scatter(transformed[sample_usage, 0], transformed[sample_usage, 1])
        axs[1].set_xlabel('PC 1')
        axs[1].set_ylabel('PC 2')
    else:
        # A 2-D projection needs two components; say so instead of crashing.
        axs[1].set_title('Fewer than two components: no 2-D projection')

    # Decorate the plots
    plt.suptitle('PCA')
    plt.show()

In [None]:
# PCA overview of the full usage matrix, plotting at most 50000 rows.
show_PCA(usage, min(len(usage), 50000))

In [None]:
# PCA overview of the usage matrix without category 4. The sample size is
# capped by this table's own row count, because the rows are sampled from
# usage_no_social and not from usage.
show_PCA(usage_no_social, min(usage_no_social.shape[0], 50000))

In [None]:
# PCA overview of the rm_usage matrix. The sample size is capped by this
# table's own row count, because the rows are sampled from rm_usage and not
# from usage.
show_PCA(rm_usage, min(rm_usage.shape[0], 50000))

In [4]:
from sklearn.decomposition import PCA

# PCA Analysis Usage
# A fixed random_state keeps the component scores reproducible across runs
# when scikit-learn selects its randomized SVD solver; the DBSCAN clustering
# further down is computed on these scores.
pca = PCA(n_components=15, random_state=0)
usage_pca = pca.fit_transform(usage)

In [5]:
# PCA Analysis RM Usage
# A fixed random_state keeps the component scores reproducible across runs
# when scikit-learn selects its randomized SVD solver.
pca_rm = PCA(n_components=15, random_state=0)
rm_usage_pca = pca_rm.fit_transform(rm_usage)

In [6]:
# Tabular view of the PCA scores of usage; preview the first five rows.
usage_PC = pd.DataFrame(usage_pca)
usage_PC.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.033,-0.054,-0.17,-0.999,1.095,-0.448,3.481,-3.469,2.017,0.152,-0.015,1.617,0.95,0.301,-0.866
1,0.033,-0.05,-0.176,-0.996,1.099,-0.444,3.514,-3.461,2.035,0.184,0.018,1.63,0.957,0.318,-0.797
2,0.033,-0.05,-0.176,-0.996,1.099,-0.444,3.514,-3.461,2.035,0.184,0.018,1.63,0.957,0.318,-0.797
3,0.033,-0.052,-0.173,-0.997,1.097,-0.446,3.497,-3.465,2.027,0.167,0.001,1.622,0.953,0.31,-0.835
4,0.033,-0.051,-0.174,-0.993,1.096,-0.446,3.503,-3.465,2.036,0.171,0.006,1.618,0.955,0.316,-0.834


In [7]:
# Tabular view of the PCA scores of rm_usage; preview the first five rows.
rm_usage_pc = pd.DataFrame(rm_usage_pca)
rm_usage_pc.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-0.216,3.92,-0.729,1.107,-0.302,0.62,-0.536,-0.837,-0.789,0.078,0.079,-0.998,-0.592,0.824,-0.282
1,-0.215,3.926,-0.728,1.105,-0.294,0.626,-0.57,-0.751,-0.744,0.098,0.075,-1.098,-0.602,0.82,-0.28
2,-0.208,4.032,-0.771,1.112,-0.207,0.646,-1.167,0.458,-0.253,-0.014,0.015,0.216,-0.389,0.846,-0.299
3,-0.208,4.032,-0.771,1.112,-0.207,0.646,-1.166,0.46,-0.255,-0.015,0.026,0.216,-0.389,0.846,-0.299
4,-0.216,-0.169,-0.499,1.47,0.054,0.818,-1.514,1.438,-0.41,0.008,-0.177,0.776,0.055,1.1,0.116


In [None]:
# Share of the total variance captured by each component of the usage PCA.
print(f'Variance explanation per PC: {pca.explained_variance_ratio_}')

In [None]:
import matplotlib.pyplot as plt

# plot PCA visualization: first vs. second principal component of usage
plt.scatter(usage_pca.T[0], usage_pca.T[1])
plt.title('PCA Analysis')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
# plt.show must be called; the bare name is a no-op expression that only
# echoes the function object as the cell output.
plt.show()

# DBSCAN Clustering

In [8]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

In [10]:
# Cluster the PCA scores of usage with DBSCAN (eps 0.2, min_samples 10).
usage_clust = DBSCAN(min_samples=10, eps=0.2).fit(usage_pca)

# Cluster label assigned to every sample by DBSCAN
labels = usage_clust.labels_

# Boolean mask marking the samples DBSCAN reported as core samples
core_samples = np.zeros(labels.shape, dtype=bool)
core_samples[usage_clust.core_sample_indices_] = True
print(core_samples)

# Number of clusters: distinct labels, not counting the label -1
n_clusters_ = len(set(labels) - {-1})
print(n_clusters_)

# Silhouette score over all samples, including those labelled -1
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(usage_pca, labels))

[ True  True  True ...  True  True  True]
8374
Silhouette Coefficient: 0.411
