In [None]:
from pyspark.sql import Row, DataFrame, HiveContext
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer

# Hive-enabled SQL context used for every parquet read and SQL query in this notebook.
# NOTE(review): `sc` is not defined in the notebook itself — presumably the SparkContext
# injected by the PySpark kernel; confirm before running on a plain Python kernel.
sqlContext_H = HiveContext(sc)

from scipy.sparse import lil_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
# Remove the column/row display limits of pandas frames and widen the per-cell text limit to 250.
# With max_rows=None a displayed frame is rendered in full, so keep displayed frames small (e.g. via limit()).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 250)

import numpy as np
import copy
from collections import OrderedDict

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

---

---

# Factorization

### Read data of the 7th month

In [None]:
# The only columns the TF computation needs: access timestamp, module and user.
fields_list = ("ACCESS_DTTM", "MODULE", "USER_ID")

# Load month 07, keep just those columns and cache the frame for the queries below.
data = (
    sqlContext_H.read
    .parquet(HOST + "/parquet2/07/*")
    .select(*fields_list)
    .persist()
)

# Make the frame addressable from SQL under the name `data`.
sqlContext_H.registerDataFrameAsTable(data, 'data')

# Preview the first five rows.
data.limit(5).toPandas()

In [None]:
%%time

# 1. Count all unique users of the defined month
# NOTE(review): this count is only handed to `query` below, and the SQL text contains no
# `{total_users}` replacement field, so the value does not influence `res` — confirm it is still needed.
total_users = sc.textFile(HOST + "/csv/unique_users_7.csv").count()


# 2. Calculate tf
# For every (USER_ID, MODULE) pair of the temp table `data`:
#   TF = number of accesses of that module by the user / total number of accesses of the user.
# The trailing .format(...) is a no-op because the string has no replacement fields.
query = lambda total_users: """
    SELECT T1.USER_ID, T1.MODULE, T1.AMOUNT*1.0 / T2.TOTAL AS TF 
    FROM (SELECT USER_ID, MODULE, COUNT(ACCESS_DTTM) AS AMOUNT
        FROM data
        GROUP BY USER_ID, MODULE
    ) AS T1
    JOIN (SELECT USER_ID, COUNT(ACCESS_DTTM) AS TOTAL
        FROM data
        GROUP BY USER_ID
    ) AS T2
    ON T1.USER_ID = T2.USER_ID
""".format(total_users=total_users)

# Cached because it feeds the indexer transforms that follow.
res = sqlContext_H.sql(query(total_users)).persist()
# 3. Encode a string column of labels to a column of label indices
all_users = sqlContext_H.createDataFrame( 
        sc.textFile(HOST + "/csv/unique_users.csv").map(lambda p: Row(USER_ID=p)) 
    ).union( sqlContext_H.createDataFrame([Row(USER_ID='')]) ).persist()

try:
    all_modules = sqlContext_H.createDataFrame( 
            sc.textFile(HOST + "/csv/unique_modules.csv").map(lambda p: Row(MODULE=p)) 
        ).union( sqlContext_H.createDataFrame([Row(MODULE='')]) ).persist()
except:
    all_modules = sqlContext_H.read.parquet( 
            HOST + "/parquet2/07/*",
            HOST + "/parquet2/08/*",
            HOST + "/parquet2/09/*"
        ).select("MODULE").distinct().persist()
    all_modules.write.format("com.databricks.spark.csv").mode('overwrite').save(HOST + "/csv/unique_modules.csv")
    all_modules = all_modules.union( sqlContext_H.createDataFrame([Row(MODULE='')]) )
    print "MODULEs = ", all_modules.count()

# Fit one string indexer per key column on the full label universes built above.
indexerU = StringIndexer(
    inputCol="USER_ID",
    outputCol="USER_ID_Index",
).fit(all_users)
indexerM = StringIndexer(
    inputCol="MODULE",
    outputCol="MODULE_Index",
).fit(all_modules)

# The label frames were needed for fitting only.
all_users.unpersist()
all_modules.unpersist()

# Attach the user index (cast to int) to every TF row ...
indexedU_df = (
    indexerU.transform(res)
    .withColumn("USER_ID_Index", F.col("USER_ID_Index").cast(IntegerType()))
    .persist()
)

# ... and then the module index, giving the month-07 (user, module, TF) table.
table = (
    indexerM.transform(indexedU_df)
    .withColumn("MODULE_Index", F.col("MODULE_Index").cast(IntegerType()))
    .persist()
)

# Release the intermediate frames.
res.unpersist()
indexedU_df.unpersist()

# SVD

In [None]:
%%time 

# Bring the indexed (user, module, TF) triples of month 07 to the driver as a pandas frame.
t = table.select("USER_ID_Index", "MODULE_Index", "TF").toPandas()

# Dimensions of the user x module matrix: the line counts of the two label CSVs.
# Note that `total_users` is re-assigned here (it was previously counted from unique_users_7.csv).
total_users = sc.textFile(HOST + "/csv/unique_users.csv").count()
total_modules = sc.textFile(HOST + "/csv/unique_modules.csv").count()

def sparse_df_to_array(df, shape):
    """Convert a long-format TF frame into the sparse CSR matrix used by scikit-learn.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per matrix cell, with the columns "USER_ID_Index",
        "MODULE_Index" and "TF".
    shape : tuple
        (n_rows, n_cols) of the matrix to build.

    Returns
    -------
    scipy.sparse.csr_matrix
        float32 matrix whose cell [USER_ID_Index - 1, MODULE_Index - 1] holds TF.
        If the same cell occurs more than once, the last row of `df` wins.
    """
    arr = lil_matrix(shape, dtype=np.float32)
    # Read the columns positionally as arrays: `DataFrame.ix` no longer exists in
    # current pandas, was label-based (so it broke on any index other than 0..n-1)
    # and cost three indexer look-ups per row.
    rows = df["USER_ID_Index"].values - 1
    cols = df["MODULE_Index"].values - 1
    tf_values = df["TF"].values
    for r, c, v in zip(rows, cols, tf_values):
        # An index of 0 becomes -1, which lil_matrix resolves to the last row/column,
        # exactly as the previous implementation did.
        arr[int(r), int(c)] = v
    return arr.tocsr()

# Month-07 user x module TF matrix (CSR) that the SVD models below are fitted on.
m = sparse_df_to_array(t, (total_users, total_modules))

In [None]:
%%time 
for i in range(1, total_modules):
    svd = TruncatedSVD(n_components=i, n_iter=25, random_state=42)
    svd.fit(m) 
    print i, "svd.explained_variance_ratio_.sum() =", svd.explained_variance_ratio_.sum()

In [None]:
svd = TruncatedSVD(n_components=10, n_iter=25, random_state=42)
svd.fit(m) 
print "svd.explained_variance_ratio_.sum() =", svd.explained_variance_ratio_.sum()

main1 = svd.transform(m)
main2 = svd.inverse_transform(main1)

m_idx = np.where(m.todense() > 0)
m_not_0 = m.todense()[m_idx].T
main2_not_0 = main2[m_idx].reshape(m_not_0.shape[0], 1)
print "RMSE:", np.sqrt(np.square(m_not_0 - main2_not_0).mean())

### Calculate cosine similarities

Prepare TF dataframes for each week (31–38) of the 8th and 9th months

In [None]:
%%time

query = """
    SELECT T1.USER_ID, MODULE, T1.AMOUNT*1.0 / T2.TOTAL AS TF 
    FROM (SELECT USER_ID, MODULE, COUNT(ACCESS_DTTM) AS AMOUNT
        FROM df
        GROUP BY USER_ID, MODULE
    ) AS T1
    JOIN (SELECT USER_ID, COUNT(ACCESS_DTTM) AS TOTAL
        FROM df
        GROUP BY USER_ID
    ) AS T2
    ON T1.USER_ID = T2.USER_ID
"""

data_8_9 = sqlContext_H.read.parquet( 
        HOST + "/parquet2/08/*",
        HOST + "/parquet2/09/*"
    ).persist()
sqlContext_H.registerDataFrameAsTable(data_8_9, 'data_8_9')

df_names = []
for w in range(31, 39):
    try: sqlContext_H.dropTempTable("df")
    except: pass
    print w
    df = data_8_9.filter("ACCESS_WEEK = {}".format(w)).select(*fields_list).persist()
    sqlContext_H.registerDataFrameAsTable(df, 'df')
    df_names.append('df_table_' + str(w))
    globals()['df_table_' + str(w)] = sqlContext_H.sql(query).persist()
    df.unpersist()

In [None]:
%%time

cos = {}

for dn in df_names:
    d = int(dn.split("_")[-1])
    df_indexedU = indexerU.transform(globals()[dn]).withColumn(
                "USER_ID_Index", F.col("USER_ID_Index").cast(IntegerType())
            ).persist()
    df_table = indexerM.transform(df_indexedU).withColumn(
            "MODULE_Index", F.col("MODULE_Index").cast(IntegerType())
        ).select( "USER_ID_Index", "MODULE_Index", "TF" ).toPandas()
    df_indexedU.unpersist()

    df_m = sparse_df_to_array(df_table, (total_users, total_modules)).todense()

    print dn
    x1 = svd.transform(df_m)
    
    vals = []
    for i in range(df_m.shape[0]):
        vals.append(cosine_similarity(main1[i].reshape(1, -1), x1[i].reshape(1, -1))[0][0])    
    cos.update({d:vals})

# Investigate the behaviour of users with different values of cosine similarities

In [None]:
def get_user_index_by_id(id):
    """Return the index of user `id`: its position in `indexerU.labels` minus one.

    `id` is converted with str() before the look-up; list.index raises ValueError
    when the label is not present.
    """
    label_position = indexerU.labels.index(str(id))
    return label_position - 1

def get_user_id_by_index(idx):
    """Return the user id for index `idx`: the label stored at position idx + 1 of `indexerU.labels`."""
    label_position = idx + 1
    return indexerU.labels[label_position]

In [None]:
user_indices = set()
for k,v in cos.iteritems():
    v = np.array(v)
    user_indices |= set(np.where((v > 0.2) & (v < 0.8))[0])
    
print len(user_indices)
print list(user_indices)[:25]

In [None]:
%%time

# Bar colours for draw3D, indexed by the position of the plotted period.
# The 17 base colours are repeated twice, giving 34 entries.
colors = (
    'r', 'b', 'g', 'm', 'y', 'c', 'indigo', 'gold', 'pink', 'palegreen', 
    'grey', 'lime', 'darkred', 'salmon', 'navy', 'brown', 'coral'
) * 2

def draw3D(user_id, table, show=False, show3D=True):
    total_modules = sc.textFile(HOST + "/csv/unique_modules.csv").collect()
    total_modules.append("")
    default = OrderedDict({i:0 for i in total_modules})
    plot_data = OrderedDict({})
    
    u_table = table.filter("USER_ID = {}".format(user_id)).persist()
    all_data = list(map(lambda x: (x[0],x[1]), u_table.select("MODULE","TF").collect()))
    if show: print "07 months workstations usage:\n", all_data
    dict_i = copy.deepcopy(default)
    for workst_i, tfidf_i in all_data:
        dict_i[workst_i] = tfidf_i
    plot_data.update({0: dict_i})  
    u_table.unpersist()
    
    for d in range(31, 39):
        d_data = list(map(lambda x: (x[0],x[1]), 
                globals()['df_table_' + str(d)].filter("USER_ID = {}".format(user_id)).select("MODULE","TF").collect()
            ))
        if show: print "workstations usage for {} day:".format(d), d_data
        dict_d = copy.deepcopy(default)
        for workst_i, tfidf_i in d_data:
            dict_d[workst_i] = tfidf_i
        plot_data.update({d: dict_d}) 
        
    if show3D:
        fig = plt.figure(figsize=(16, 16))
        ax = fig.add_subplot(111, projection='3d')
        Y = range(len(total_modules))
        c = 0
        for key, val in plot_data.iteritems():
            ax.bar(Y, np.array(val.values()), zs=c, zdir='y', color=colors[c], alpha=0.7)
            c += 1
        ax.set_xlabel('MODULE')
        plt.xticks(Y, Y)
        ax.set_ylabel('Week')
        plt.yticks(range(len(range(31, 39)) + 1), [0] + list(range(31, 39)))
        ax.set_zlabel('TF-IDF')
        plt.show()
    
    fig = plt.figure(figsize=(16,4))
    axes = plt.gca()
    cos_sims_factors = [cos[i][idx] for i in range(31, 39)]
    cos_sims_tfidf = [
        cosine_similarity(np.array(plot_data[0].values()).reshape(1, -1), np.array(vals.values()).reshape(1, -1))[0][0] 
        for k,vals in plot_data.iteritems() if k!=0
    ]
    X = np.arange(len(cos_sims_tfidf))
    plt.bar(X + 0.05, cos_sims_factors, width=0.9, alpha=0.7, color='r', label="Factors")
    plt.bar(X + 0.15, cos_sims_tfidf, width=0.7, alpha=0.7, color='g', label="TF-IDF")
    plt.xlabel("Weeks", fontsize=12)
    plt.ylabel('Cosine similarity', fontsize=12)
    plt.xticks(range(len(range(31, 39))), list(range(31, 39)))
    plt.title("Cosine similarities", fontsize=12)
    plt.legend()
    plt.grid(True)
    plt.show()
    
# All lines of unique_workstations.csv plus one extra empty-string entry.
# NOTE(review): `all_workstations` is not referenced anywhere else in the visible notebook — confirm it is still needed.
all_workstations = sc.textFile(HOST + "/csv/unique_workstations.csv").collect()
all_workstations.append("")

In [None]:
# Resolve index 4 to its user id and plot that user (3D TF chart and cosine-similarity bars).
idx = 4
user_id = get_user_id_by_index(idx)
print "user_id =", user_id

draw3D(user_id, table, show=False)

In [None]:
%%time

# Resolve index 4105 to its user id and plot that user (3D TF chart and cosine-similarity bars).
idx = 4105
user_id = get_user_id_by_index(idx)
print "user_id =", user_id
    
draw3D(user_id, table, show=False)

In [None]:
%%time

# Resolve index 14 to its user id and plot that user (3D TF chart and cosine-similarity bars).
idx = 14
user_id = get_user_id_by_index(idx)
print "user_id =", user_id
    
draw3D(user_id, table, show=False)

In [None]:
%%time

# Resolve index 15 to its user id and plot that user (3D TF chart and cosine-similarity bars).
idx = 15
user_id = get_user_id_by_index(idx)
print "user_id =", user_id
    
draw3D(user_id, table, show=False)

# A different number of principal components

In [None]:
%%time

svd = TruncatedSVD(n_components=4, n_iter=25, random_state=42)
svd.fit(m) 
print "svd.explained_variance_ratio_.sum() =", svd.explained_variance_ratio_.sum()

main1 = svd.transform(m)
main2 = svd.inverse_transform(main1)

m_idx = np.where(m.todense() > 0)
m_not_0 = m.todense()[m_idx].T
main2_not_0 = main2[m_idx].reshape(m_not_0.shape[0], 1)
print "RMSE:", np.sqrt(np.square(m_not_0 - main2_not_0).mean())

cos = {}

for dn in df_names:
    d = int(dn.split("_")[-1])
    df_indexedU = indexerU.transform(globals()[dn]).withColumn(
                "USER_ID_Index", F.col("USER_ID_Index").cast(IntegerType())
            ).persist()
    df_table = indexerM.transform(df_indexedU).withColumn(
            "MODULE_Index", F.col("MODULE_Index").cast(IntegerType())
        ).select( "USER_ID_Index", "MODULE_Index", "TF" ).toPandas()
    df_indexedU.unpersist()

    df_m = sparse_df_to_array(df_table, (total_users, total_modules)).todense()

    print dn
    x1 = svd.transform(df_m)
    
    vals = []
    for i in range(df_m.shape[0]):
        vals.append(cosine_similarity(main1[i].reshape(1, -1), x1[i].reshape(1, -1))[0][0])    
    cos.update({d:vals})

In [None]:
# Resolve index 4 to its user id and plot only the cosine-similarity bars (3D chart disabled).
idx = 4
user_id = get_user_id_by_index(idx)
print "user_id =", user_id

draw3D(user_id, table, show3D=False)

In [None]:
# Resolve index 14 to its user id and plot only the cosine-similarity bars (3D chart disabled).
idx = 14
user_id = get_user_id_by_index(idx)
print "user_id =", user_id

draw3D(user_id, table, show3D=False)

In [None]:
# Resolve index 15 to its user id and plot only the cosine-similarity bars (3D chart disabled).
idx = 15
user_id = get_user_id_by_index(idx)
print "user_id =", user_id

draw3D(user_id, table, show3D=False)

In [None]:
# Resolve index 44 to its user id and plot only the cosine-similarity bars (3D chart disabled).
idx = 44
user_id = get_user_id_by_index(idx)
print "user_id =", user_id

draw3D(user_id, table, show3D=False)