In [None]:
from pyspark.sql import Row, DataFrame, HiveContext
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer

# Hive-enabled SQL context used by every query and table registration below.
# NOTE(review): `sc` is not defined in the visible cells; presumably it is the
# SparkContext injected by the PySpark kernel — confirm.
sqlContext_H = HiveContext(sc)

from scipy.sparse import lil_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
# Display settings: no cap on the number of columns or rows shown, and up to
# 250 characters per cell. With the row cap removed, any frame displayed in
# full is rendered in full, so previews should stay small (e.g. via limit()).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 250)

import numpy as np
import copy
from collections import OrderedDict
from dateutil.parser import parse

from matplotlib.collections import LineCollection
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [None]:
# Load the pipe-delimited employee file and expose it to SQL as the `users` table.
# NOTE(review): `HOST` is not defined in the visible cells — presumably the
# storage URI prefix set elsewhere; confirm.
#
# The intermediate RDDs are intentionally NOT persisted: Spark evaluates lazily,
# so a persist() followed by unpersist() before any action reuses the cache just
# stores and discards blocks. Only the final `users` frame is cached.
secure_rdd = sc.textFile(HOST + "/users_info/SecureHealEmpTest.csv")

# The first line carries the column names.
first = secure_rdd.first()
header = first.split("|")

# Drop every line equal to the header line, split the rest on "|" and build one
# Row per line keyed by the header names (a line with fewer fields than the
# header raises IndexError when the RDD is evaluated).
row_data = (
    secure_rdd
    .filter(lambda x: x != first)
    .map(lambda x: x.split("|"))
    .map(lambda p: Row(**{name: p[i] for i, name in enumerate(header)}))
)

users = (
    sqlContext_H.createDataFrame(row_data)
    .select("Employee", "ProcDesc", "DeptName", "Dept")
    .persist()
)
sqlContext_H.registerDataFrameAsTable(users, "users")

In [None]:
# Preview two rows of the employee table.
# NOTE(review): the output may contain real employee identifiers — consider
# clearing it before sharing the notebook.
users.limit(2).toPandas()

In [None]:
# Columns of the access log that the rest of the notebook relies on.
fields_list = ("ACCESS_DTTM", "USER_ID", "WORKSTATION", "ACCESS_WEEK")

# Read the parquet files under /parquet3/08, keep only those columns and mark
# the frame for caching before exposing it to SQL as the `data` table.
data = (
    sqlContext_H.read
    .parquet(HOST + "/parquet3/08/*")
    .select(*fields_list)
    .persist()
)

sqlContext_H.registerDataFrameAsTable(data, 'data')

In [None]:
# Preview five rows of the selected access-log columns.
data.limit(5).toPandas()

In [None]:
%%time
# Restrict the access log to rows whose ACCESS_WEEK equals '31' and register the
# result as the `data_f` table. Both statements are lazy in Spark (no job runs
# here), so the `%%time` above only measures building the plan.
data_f = data.filter("ACCESS_WEEK = '31'")
sqlContext_H.registerDataFrameAsTable(data_f, 'data_f')

In [None]:
# Build the SQL that scores every (USER_ID, WORKSTATION) pair of `table`:
#   TF     = accesses of the pair / all accesses of that user
#   IDF    = log10(1 + distinct users in the table / distinct workstations of that user)
#   TF_IDF = TF * IDF
# NOTE(review): IDF is grouped by USER_ID, so it is one factor per user rather
# than per workstation — confirm this is the intended weighting.
# The table name is interpolated with str.format, so pass only trusted names.
query = lambda table: """
    SELECT mainT1.USER_ID, mainT1.WORKSTATION, mainT1.TF * mainT2.IDF AS TF_IDF
    FROM (SELECT T1.USER_ID, T1.WORKSTATION, T1.AMOUNT*1.0 / T2.TOTAL AS TF 
        FROM (SELECT USER_ID, WORKSTATION, COUNT(ACCESS_DTTM) AS AMOUNT
            FROM {table_name}
            GROUP BY USER_ID, WORKSTATION
        ) AS T1
        JOIN (SELECT USER_ID, COUNT(ACCESS_DTTM) AS TOTAL
            FROM {table_name}
            GROUP BY USER_ID
        ) AS T2
        ON T1.USER_ID = T2.USER_ID
    ) AS mainT1 JOIN (
        SELECT T.USER_ID, 
               log10( 1 + (SELECT COUNT(DISTINCT(USER_ID)) AS count FROM {table_name}) * 1.0 / COUNT(T.WORKSTATION) ) AS IDF
        FROM (SELECT USER_ID, WORKSTATION
            FROM {table_name}
            GROUP BY USER_ID, WORKSTATION
        ) AS T
        GROUP BY T.USER_ID
    ) AS mainT2 ON mainT1.USER_ID = mainT2.USER_ID
""".format(table_name=table)

# Score the week-filtered table, mark the result for caching and register it as `df`.
df = sqlContext_H.sql(query("data_f")).persist()
sqlContext_H.registerDataFrameAsTable(df, 'df')

# `query` is rebound here: this second builder attaches department and hospital
# columns from the `users` table. It is a LEFT JOIN, so rows whose USER_ID has
# no matching Employee keep NULL in the three added columns.
query = lambda table: " ".join([
                "SELECT t.USER_ID, t.WORKSTATION, t.TF_IDF",
                ", users.DeptName AS DEPT_NAME, users.Dept AS DEPT_ID, users.ProcDesc AS HOSPITAL",
                        "FROM {table_name} AS t",
                        "LEFT JOIN users ON t.USER_ID = users.Employee",
                    ]).format(table_name=table)

# Scored rows with department information, marked for caching.
res = sqlContext_H.sql(query('df')).persist()

In [None]:
%%time
# Preview the joined TF-IDF rows. No earlier visible cell runs an action on
# `res`, so the timing here presumably includes executing the queries behind it.
res.limit(5).toPandas()
# Optional check for users without a match in `users` (the left join leaves DEPT_ID NULL):
#res.filter("DEPT_ID IS NULL").limit(10).toPandas()

# ...

---
# One day

In [None]:
%%time

# Embed the rows of `main1` in 2-D with metric and non-metric MDS, then plot the
# points listed under the `None` key of `dict_with_dept`.
# NOTE(review): `main1`, `dict_with_dept` and `colors` come from cells outside
# this excerpt. Presumably each dict value is a sequence whose first item indexes
# `colors` and whose last item holds row indices of `main1` — confirm.
seed = np.random.RandomState(seed=3)
similarities = euclidean_distances(main1)

# Metric MDS on the precomputed distance matrix; its embedding seeds the
# non-metric run below.
mds = manifold.MDS(n_components=2, max_iter=300, eps=1e-9, random_state=seed,
                   dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(similarities).embedding_

nmds = manifold.MDS(n_components=2, metric=False, max_iter=300, eps=1e-12,
                    dissimilarity="precomputed", random_state=seed, n_jobs=1,
                    n_init=1)
npos = nmds.fit_transform(similarities, init=pos)

X_true = main1

# Rescale both embeddings to the overall norm of the input data.
data_norm = np.sqrt((X_true ** 2).sum())
pos *= data_norm / np.sqrt((pos ** 2).sum())
npos *= data_norm / np.sqrt((npos ** 2).sum())

# Rotate the data and both embeddings onto their principal axes.
clf = PCA(n_components=2)
X_true = clf.fit_transform(X_true)
pos = clf.fit_transform(pos)
npos = clf.fit_transform(npos)

fig = plt.figure(figsize=(14, 14))
ax = plt.axes([0., 0., 1., 1.])

s = 200
max_ind = 0  # not used in this cell; kept in case a later cell reads it

# Only entries keyed by None are plotted here. Pick them out once (identity test
# instead of `== None`) rather than rescanning the whole dict for every point.
none_groups = [group for dept, group in dict_with_dept.items() if dept is None]
for ind, (x, y) in enumerate(zip(npos[:, 0], npos[:, 1])):
    for group in none_groups:
        if ind in group[-1]:
            ax.scatter(x, y, color=colors[group[0]], s=s, lw=0, label="None")

# Turn distances into closeness scores for the line colouring. Zero distances
# (the diagonal, duplicate rows) would divide by zero, so silence the warning
# and map every non-finite result (inf, or nan when all distances are 0) to 0.
with np.errstate(divide="ignore", invalid="ignore"):
    similarities = similarities.max() / similarities * 100
similarities[~np.isfinite(similarities)] = 0

# One segment per ordered pair of points, in the PCA-rotated input space.
n_points = len(pos)
segments = [[X_true[i, :], X_true[j, :]]
            for i in range(n_points) for j in range(n_points)]
values = np.abs(similarities)
lc = LineCollection(segments,
                    zorder=0, cmap=plt.cm.Blues,
                    norm=plt.Normalize(0, values.max()))
lc.set_array(similarities.flatten())
# Zero line width: the segments are added to the axes but effectively not drawn.
lc.set_linewidths(np.zeros(len(segments)))
ax.add_collection(lc)

plt.show()

In [None]:
%%time
# Departments with 5-10 members: embed the rows of `main1` in 2-D with metric
# and non-metric MDS, then plot and label the points of those departments.
# NOTE(review): this repeats the MDS/PCA pipeline of the previous cell; a shared
# helper taking the department filter as a parameter would remove the copy.
# NOTE(review): `main1`, `dict_with_dept` and `colors` come from cells outside
# this excerpt. Presumably each dict value is a sequence whose first item indexes
# `colors` and whose last item holds row indices of `main1` — confirm.
seed = np.random.RandomState(seed=3)
similarities = euclidean_distances(main1)

# Metric MDS on the precomputed distance matrix; its embedding seeds the
# non-metric run below.
mds = manifold.MDS(n_components=2, max_iter=300, eps=1e-9, random_state=seed,
                   dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(similarities).embedding_

nmds = manifold.MDS(n_components=2, metric=False, max_iter=300, eps=1e-12,
                    dissimilarity="precomputed", random_state=seed, n_jobs=1,
                    n_init=1)
npos = nmds.fit_transform(similarities, init=pos)

X_true = main1

# Rescale both embeddings to the overall norm of the input data.
data_norm = np.sqrt((X_true ** 2).sum())
pos *= data_norm / np.sqrt((pos ** 2).sum())
npos *= data_norm / np.sqrt((npos ** 2).sum())

# Rotate the data and both embeddings onto their principal axes.
clf = PCA(n_components=2)
X_true = clf.fit_transform(X_true)
pos = clf.fit_transform(pos)
npos = clf.fit_transform(npos)

fig = plt.figure(figsize=(14, 14))
ax = plt.axes([0., 0., 1., 1.])

s = 200
max_ind = 0  # not used in this cell; kept in case a later cell reads it

# Departments eligible for this plot: a real (non-None) key and 5 to 10 members.
# Computed once instead of re-testing every department for every point.
eligible = [(dept, group) for dept, group in dict_with_dept.items()
            if dept is not None and 5 <= len(group[-1]) <= 10]

# Department -> scatter handle of its most recently drawn point. The handles are
# kept in this dict directly instead of being stashed in globals() under
# synthetic "<index>_plt" names.
lables_d = {}
for ind, (x, y) in enumerate(zip(npos[:, 0], npos[:, 1])):
    for dept, group in eligible:
        if ind in group[-1]:
            lables_d[dept] = ax.scatter(x, y, color=colors[group[0]], s=s,
                                        lw=0, label=dept)

# One legend entry per plotted department.
l_lable = list(lables_d.keys())
l_colors = list(lables_d.values())
ax.legend(l_colors, l_lable, scatterpoints=1, loc='best', shadow=False)

# Turn distances into closeness scores for the line colouring. Zero distances
# (the diagonal, duplicate rows) would divide by zero, so silence the warning
# and map every non-finite result (inf, or nan when all distances are 0) to 0.
with np.errstate(divide="ignore", invalid="ignore"):
    similarities = similarities.max() / similarities * 100
similarities[~np.isfinite(similarities)] = 0

# One segment per ordered pair of points, in the PCA-rotated input space.
n_points = len(pos)
segments = [[X_true[i, :], X_true[j, :]]
            for i in range(n_points) for j in range(n_points)]
values = np.abs(similarities)
lc = LineCollection(segments,
                    zorder=0, cmap=plt.cm.Blues,
                    norm=plt.Normalize(0, values.max()))
lc.set_array(similarities.flatten())
# Zero line width: the segments are added to the axes but effectively not drawn.
lc.set_linewidths(np.zeros(len(segments)))
ax.add_collection(lc)

plt.show()