In [None]:
# --- Spark SQL / ML ---
from pyspark.sql import Row, DataFrame, HiveContext
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer

# Hive-enabled SQL context built on `sc`. `sc` is not created in this notebook;
# presumably it is injected by the PySpark kernel -- TODO confirm.
sqlContext_H = HiveContext(sc)

# --- Sparse matrices, decomposition and similarity ---
from scipy.sparse import lil_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# --- pandas: lift the display limits so wide/long frames are shown in full ---
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 250)

import numpy as np
import copy
from collections import OrderedDict

# --- Plotting (figures are rendered inline in the notebook) ---
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [None]:
%%time

# Columns needed for the term-frequency (TF) computation.
fields_list = ( "ACCESS_DTTM", "METRIC_ID", "USER_ID" )
# Month-07 records, cached and exposed to SQL as the temp table 'data'.
# `HOST` is not defined in this notebook -- it must already exist in the kernel.
data = sqlContext_H.read.parquet( HOST + "/parquet2/07/*" ).select(*fields_list).persist()
sqlContext_H.registerDataFrameAsTable(data, 'data')

# 1. Count all unique users of the defined month
# NOTE(review): this count is not used by the TF query below, and `total_users`
# is reassigned in a later cell from unique_users.csv -- confirm whether it is still needed.
total_users = sc.textFile(HOST + "/csv/unique_users_7.csv").count()

# 2. Calculate tf
# TF(user, metric) = accesses of `metric` by `user` / all accesses by `user`.
# This is a plain string: the statement has no placeholders, so the former
# `lambda total_users: "...".format(total_users=...)` wrapper did nothing.
query = """
    SELECT T1.USER_ID, T1.METRIC_ID, T1.AMOUNT*1.0 / T2.TOTAL AS TF 
    FROM (
        SELECT USER_ID, METRIC_ID, COUNT(ACCESS_DTTM) AS AMOUNT
        FROM data
        GROUP BY USER_ID, METRIC_ID
    ) AS T1
    JOIN (
        SELECT USER_ID, COUNT(ACCESS_DTTM) AS TOTAL
        FROM data
        GROUP BY USER_ID
    ) AS T2
    ON T1.USER_ID = T2.USER_ID
"""

# One row per (USER_ID, METRIC_ID) with its TF; marked for caching (lazy until an action runs).
res = sqlContext_H.sql(query).persist()
# 3. Encode a string column of labels to a column of label indices
all_users = sqlContext_H.createDataFrame( 
        sc.textFile(HOST + "/csv/unique_users.csv").map(lambda p: Row(USER_ID=p)) 
    ).union( sqlContext_H.createDataFrame([Row(USER_ID='')]) ).persist()
print "all_users =", all_users.count()

try:
    all_metrics = sqlContext_H.createDataFrame( 
            sc.textFile(HOST + "/csv/unique_metrics.csv").map(lambda p: Row(METRIC_ID=p)) 
        ).union( sqlContext_H.createDataFrame([Row(METRIC_ID='')]) ).persist()
except:
    all_metrics = sqlContext_H.read.parquet( 
            HOST + "/parquet2/07/*",
            HOST + "/parquet2/08/*",
            HOST + "/parquet2/09/*"
        ).select("METRIC_ID").distinct().persist()
    all_metrics.write.format("com.databricks.spark.csv").mode('overwrite').save(HOST + "/csv/unique_metrics.csv")
    all_metrics = all_metrics.union( sqlContext_H.createDataFrame([Row(METRIC_ID='')]) )
print "METRIC_IDs = ", all_metrics.count()

def _with_int_column(frame, column):
    """Return `frame` with `column` cast to IntegerType."""
    return frame.withColumn(column, F.col(column).cast(IntegerType()))


# Fit one label -> index model per key column on the full vocabularies.
user_indexer = StringIndexer(inputCol="USER_ID", outputCol="USER_ID_Index")
indexerU = user_indexer.fit(all_users)
metric_indexer = StringIndexer(inputCol="METRIC_ID", outputCol="METRIC_ID_Index")
indexerM = metric_indexer.fit(all_metrics)

# The vocabularies were only needed for fitting.
all_users.unpersist()
all_metrics.unpersist()

# Attach integer user indices, then integer metric indices, to the TF rows.
indexedU_df = _with_int_column(indexerU.transform(res), "USER_ID_Index").persist()
table = _with_int_column(indexerM.transform(indexedU_df), "METRIC_ID_Index").persist()

# Release the intermediates and the temp table registered for the TF query.
res.unpersist()
indexedU_df.unpersist()
sqlContext_H.dropTempTable('data')

In [None]:
%%time 

# Pull the indexed (user, metric, TF) triples of month 7 to the driver as a pandas frame.
t = table.select("USER_ID_Index", "METRIC_ID_Index", "TF").toPandas()

# Matrix dimensions = line counts of the two vocabulary CSVs.
# NOTE(review): the indexers were fitted on these vocabularies plus one extra '' label,
# so the index range may be one larger than these counts -- confirm.
total_users = sc.textFile(HOST + "/csv/unique_users.csv").count()
total_metrics = sc.textFile(HOST + "/csv/unique_metrics.csv").count()

def sparse_df_to_array(df, shape):
    """Convert a long-format (user, metric, TF) frame to a scipy CSR matrix for scikit-learn.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per matrix cell, with the columns "USER_ID_Index",
        "METRIC_ID_Index" (integer-valued) and "TF" (numeric).
    shape : tuple of int
        (n_rows, n_cols) of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        float32 matrix where cell [USER_ID_Index - 1, METRIC_ID_Index - 1] holds TF.

    Notes
    -----
    Rows are read by position, so `df` may carry any index (the former `df.ix[i, ...]`
    lookup was label-based and `.ix` no longer exists in current pandas).
    NOTE(review): both indices are shifted by -1, so an index of 0 becomes -1, which
    scipy treats as the last row/column -- confirm this is the intended convention.
    """
    arr = lil_matrix(shape, dtype=np.float32)
    if len(df) == 0:
        # Nothing to fill in; return an all-zero matrix of the requested shape.
        return arr.tocsr()

    rows = df["USER_ID_Index"].astype(np.int64).values - 1
    cols = df["METRIC_ID_Index"].astype(np.int64).values - 1
    vals = df["TF"].astype(np.float32).values

    # One vectorised assignment instead of a Python-level loop over every row.
    arr[rows, cols] = vals
    return arr.tocsr()

m = sparse_df_to_array(t, (total_users, total_metrics))

svd = TruncatedSVD(n_components=35, n_iter=25, random_state=42)
svd.fit(m)
print "svd.explained_variance_ratio_.sum() =", svd.explained_variance_ratio_.sum()

#factors = svd.transform(m)
#remanufactured = svd.inverse_transform(factors)

"""m_idx = np.where(m.todense() > 0)
m_not_0 = m.todense()[m_idx].T
remanufactured_not_0 = remanufactured[m_idx].reshape(m_not_0.shape[0], 1)
print "RMSE:", np.sqrt(np.square(m_not_0 - remanufactured_not_0).mean())""";

table.unpersist()
del m

In [None]:
%%time

# Per-period containers. Key 0 holds month 7 (the SVD training data);
# the weekly loop below adds one entry per week number.
week_talbes = {0: t}
week_unique_metrics = {0: t["METRIC_ID_Index"].unique()}
del t


# Same TF query as for month 7, but reading from the temp table 'df',
# which the weekly loop re-registers for every week.
query = """
    SELECT T1.USER_ID, T1.METRIC_ID, T1.AMOUNT*1.0 / T2.TOTAL AS TF 
    FROM (
        SELECT USER_ID, METRIC_ID, COUNT(ACCESS_DTTM) AS AMOUNT
        FROM df
        GROUP BY USER_ID, METRIC_ID
    ) AS T1
    JOIN (
        SELECT USER_ID, COUNT(ACCESS_DTTM) AS TOTAL
        FROM df
        GROUP BY USER_ID
    ) AS T2
    ON T1.USER_ID = T2.USER_ID
"""

# Months 08 and 09, cached with all their columns and registered as 'data_8_9'.
data_8_9 = sqlContext_H.read.parquet( 
        HOST + "/parquet2/08/*",
        HOST + "/parquet2/09/*"
    ).persist()
sqlContext_H.registerDataFrameAsTable(data_8_9, 'data_8_9')

# week -> SVD reconstruction (dense) and week -> original TF matrix (dense).
#week_factors = {}
week_remanufactured = {}
week_matrices = {}

for w in range(31, 39):
    try: sqlContext_H.dropTempTable("df")
    except: pass
    print w
    df = data_8_9.filter("ACCESS_WEEK = {}".format(w)).select(*fields_list).persist()
    sqlContext_H.registerDataFrameAsTable(df, 'df')
    df_w = sqlContext_H.sql(query).persist()
    df.unpersist()
    df_indexedU = indexerU.transform(df_w).withColumn(
            "USER_ID_Index", F.col("USER_ID_Index").cast(IntegerType())
        ).persist()
    df_table = indexerM.transform(df_indexedU).withColumn(
            "METRIC_ID_Index", F.col("METRIC_ID_Index").cast(IntegerType())
        ).select( "USER_ID_Index", "METRIC_ID_Index", "TF" ).toPandas()
    df_w.unpersist()
    df_indexedU.unpersist()
    
    week_talbes[w] = df_table
    week_unique_metrics[w] = df_table["METRIC_ID_Index"].unique()
    
    df_m = sparse_df_to_array(df_table, (total_users, total_metrics))
    del df_table
    week_matrices[w] = df_m.todense()
    #week_factors[w] = svd.transform(df_m)
    week_factors = svd.transform(df_m)
    week_remanufactured[w] = svd.inverse_transform(week_factors)
    del df_m
    del week_factors
    

data_8_9.unpersist()
sqlContext_H.dropTempTable('data_8_9')
del total_users
del total_metrics

# 1. Remove unused metrics

In [None]:
%%time
# Disabled by default: recomputes the residuals (reconstruction - original) restricted to
# metrics seen in the given week or in month 7, and writes one text file per week.
# The next cell reads those files back, so flip the guard only to regenerate them.
if False:
    residuals = {}
    init_shape = week_remanufactured[31].shape
    print "Initial shape = {}".format(init_shape)
    diff = {}  # week -> number of matrix cells removed by the column filter
    for w in range(31, 39):
        # Column positions (index - 1) of metrics used in week w or in month 7 (key 0).
        indices = np.array(list(set(week_unique_metrics[w]) | set(week_unique_metrics[0]))) - 1
        #residuals[w] = np.take(week_remanufactured[w], indices) - np.take(week_matrices[w], indices)
        residuals[w] = week_remanufactured[w][:, indices] - week_matrices[w][:, indices]
        print "Week = {} (shape = {}): AVG = {}, STD = {}".format(w, residuals[w].shape, np.average(residuals[w]), np.std(residuals[w]))
        np.savetxt("fm7/metrics_residuals_{}_common_values.txt".format(w), residuals[w], delimiter=",")
        diff[w] = (init_shape[1] - indices.size) * init_shape[0]
    # Average number (and share) of cells dropped across the weeks.
    avg_diff = np.average(diff.values()) 
    print "Difference: {0} ({1:.02f}%)".format(avg_diff, avg_diff / float(init_shape[0] * init_shape[1]) * 100)

In [None]:
%%time
residuals = {}
for w in range(31, 39):
    residuals[w] = np.loadtxt("fm7/metrics_residuals_{}_common_values.txt".format(w), delimiter=",")
    print "Week = {}: AVG = {}, STD = {}".format(w, np.average(residuals[w]), np.std(residuals[w]))

In [None]:
import matplotlib.mlab as mlab

b = 1000
#z = np.absolute(residuals[31].reshape(-1,))
z = residuals[31].reshape(-1,)
#z = z[(abs(z) > 0.05) & (abs(z) < 0.5)]
print "mean =", np.average(z), "variance =", np.std(z)
pdf, bins, patches = plt.hist(z, bins=b, alpha=0.5, normed=1)

x, dx = np.linspace(-1.5, 1.5, b, retstep=True)
y = mlab.normpdf(x, np.average(z), np.std(z))

print "Area formula:", np.trapz(y, dx=dx)
print "Area real:", np.sum(pdf * np.diff(bins))

plt.plot(x, y, 'r-')
axes = plt.gca()
axes.set_xlim([-1.1,1.1])
plt.show()

plt.hist(z, bins=b, alpha=0.5, normed=1)
plt.plot(x, y, 'r-')

axes.set_ylim([0,0.5])
axes.set_xlim([-1.1,1.1])
plt.show()

plt.hist(z, bins=b, alpha=0.5, normed=1)
plt.plot(x, y, 'r-')
axes = plt.gca()
axes.set_ylim([0,0.01])
axes.set_xlim([-1.1,1.1])
plt.show()

# 2. Remove unused metrics and inactive users

In [None]:
%%time
# Disabled by default: like the previous variant, but additionally drops users whose
# row in the week's matrix is all zeros, and writes one text file per week.
# The next cell reads those files back, so flip the guard only to regenerate them.
if False:
    residuals = {}
    init_shape = week_remanufactured[31].shape
    print "Initial shape = {}".format(init_shape)
    diff = {}  # week -> number of matrix cells removed by the row + column filters
    for w in range(31, 39):
        # Column positions (index - 1) of metrics used in week w or in month 7 (key 0).
        indices = np.array(list(set(week_unique_metrics[w]) | set(week_unique_metrics[0]))) - 1
        # Rows that are entirely zero in week w ...
        rows_0, _ = np.where( (week_matrices[w] == 0).all(axis=1) )
        # ... and their complement: users with at least one non-zero entry.
        rows = np.array(list(set(range(init_shape[0])) - set(rows_0)))
        residuals[w] = week_remanufactured[w][:, indices][rows, :] - week_matrices[w][:, indices][rows, :]
        print "Week = {} (shape = {}): AVG = {}, STD = {}".format(w, residuals[w].shape, np.average(residuals[w]), np.std(residuals[w]))
        np.savetxt("fm7/metrics_residuals_{}_common_values_without_non_active_users.txt".format(w), residuals[w], delimiter=",")
        diff[w] = init_shape[1] * init_shape[0] - indices.size * rows.size
    # Average number (and share) of cells dropped across the weeks.
    avg_diff = np.average(diff.values()) 
    print "Difference: {0} ({1:.02f}%)".format(avg_diff, avg_diff / float(init_shape[0] * init_shape[1]) * 100)

In [None]:
%%time
residuals = {}
for w in range(31, 39):
    residuals[w] = np.loadtxt("fm7/metrics_residuals_{}_common_values_without_non_active_users.txt".format(w), delimiter=",")
    print "Week = {}: AVG = {}, STD = {}".format(w, np.average(residuals[w]), np.std(residuals[w]))

In [None]:
# Size check for the week-31 residuals.
# NOTE(review): the literals are presumably rows x columns of the matrix before
# (12866 x 689) and after (9710 x 668) filtering -- TODO confirm and derive them from data.
print 12866 * 689
print 9710 * 668
# Number of residual values actually loaded for week 31.
print residuals[31].reshape(-1,).size

In [None]:
import matplotlib.mlab as mlab

b = 1000
#z = np.absolute(residuals[31].reshape(-1,))
z = residuals[31].reshape(-1,)
#z = z[(abs(z) > 0.05) & (abs(z) < 0.5)]
print "mean =", np.average(z), "variance =", np.std(z)
pdf, bins, patches = plt.hist(z, bins=b, alpha=0.5, normed=1)

x, dx = np.linspace(-1.5, 1.5, b, retstep=True)
y = mlab.normpdf(x, np.average(z), np.std(z))

print "Area formula:", np.trapz(y, dx=dx)
print "Area real:", np.sum(pdf * np.diff(bins))

plt.plot(x, y, 'r-')
axes = plt.gca()
axes.set_xlim([-1.1,1.1])
plt.show()

plt.hist(z, bins=b, alpha=0.5, normed=1)
plt.plot(x, y, 'r-')

axes.set_ylim([0,0.5])
axes.set_xlim([-1.1,1.1])
plt.show()

plt.hist(z, bins=b, alpha=0.5, normed=1)
plt.plot(x, y, 'r-')
axes = plt.gca()
axes.set_ylim([0,0.01])
axes.set_xlim([-1.1,1.1])
plt.show()

# 3. Remove unused (metric, user) pairs

In [None]:
# Per user: the metric indices seen in month 7 (key 0), joined into one ", "-separated string.
w0 = week_talbes[0].groupby("USER_ID_Index")['METRIC_ID_Index'].apply(lambda x: ', '.join(map(str, x))).to_frame()
w0.reset_index(level=0, inplace=True)
# Same for week 31.
w31 = week_talbes[31].groupby("USER_ID_Index")['METRIC_ID_Index'].apply(lambda x: ', '.join(map(str, x))).to_frame()
w31.reset_index(level=0, inplace=True)
# Merge on the USER_ID_Index *column*. `DataFrame.join(..., on=...)` matches the left
# column against the right frame's index, which is a plain 0..n-1 range after
# reset_index, so it paired users with unrelated rows.
w = w0.merge(w31, on='USER_ID_Index', how='outer', suffixes=('_left', '_right'))
# Union of both metric lists as ints. If a user is missing on either side the
# concatenation is NaN and the result is [-1].
w["metrics"] = (w["METRIC_ID_Index_left"] + ", " + w["METRIC_ID_Index_right"]).apply(
        lambda x: list(set(map(lambda y: int(y.strip()) if y != 'nan' else -1, str(x).split(","))))
    )
w.drop(["METRIC_ID_Index_left", "METRIC_ID_Index_right"], inplace=True, axis=1)
w.head()

In [None]:
#%%time
residuals = {}
init_shape = week_remanufactured[31].shape
print "Initial shape = {}".format(init_shape)

metrics_7 = week_talbes[0].groupby("USER_ID_Index")['METRIC_ID_Index'].apply(lambda x: "%s" % ', '.join(map(lambda y: str(y), x))).to_frame()
metrics_7.reset_index(level=0, inplace=True)
diff = {}
for w in range(31, 39):
    metrics_w = week_talbes[w].groupby("USER_ID_Index")['METRIC_ID_Index'].apply(lambda x: "%s" % ', '.join(map(lambda y: str(y), x))).to_frame()
    metrics_w.reset_index(level=0, inplace=True)
    joined = w0.join(metrics_w, on='USER_ID_Index', how='outer', lsuffix='_left', rsuffix='_right')
    joined["metrics"] = (joined["METRIC_ID_Index_left"] + ", " + joined["METRIC_ID_Index_right"]).apply(
            lambda x: list(set(map(lambda y: int(y.strip()) if y != 'nan' else -1, str(x).split(","))))
        )
    joined.drop(["USER_ID_Index_left", "USER_ID_Index_right", "METRIC_ID_Index_left", "METRIC_ID_Index_right"], inplace=True, axis=1)
    residuals[w] = np.array([[]])
    for i, vals in joined.iterrows():
        ind = [x-1 for x in vals["metrics"] if x>0]
        u = vals["USER_ID_Index"]-1
        residuals[w] = np.append(residuals[w], week_remanufactured[w][u, ind] - week_matrices[w][u, ind])
    print "Week = {} (shape = {}): AVG = {}, STD = {}".format(w, residuals[w].shape, np.average(residuals[w]), np.std(residuals[w]))
    np.savetxt("fm7/metrics_residuals_{}_only_present_metric_user_pairs.txt".format(w), residuals[w], delimiter=",")
    diff[w] = init_shape[0] * init_shape[1] - residuals[w].size
    del metrics_w
    del joined
avg_diff = np.average(diff.values()) 
print "Difference: {0} ({1:.02f}%)".format(avg_diff, avg_diff / float(init_shape[0] * init_shape[1]) * 100)

In [None]:
import matplotlib.mlab as mlab

b = 1000
#z = np.absolute(residuals[31].reshape(-1,))
z = residuals[31].reshape(-1,)
#z = z[(abs(z) > 0.05) & (abs(z) < 0.5)]
print "mean =", np.average(z), "variance =", np.std(z)
pdf, bins, patches = plt.hist(z, bins=b, alpha=0.5, normed=1)

x, dx = np.linspace(-1.5, 1.5, b, retstep=True)
y = mlab.normpdf(x, np.average(z), np.std(z))

print "Area formula:", np.trapz(y, dx=dx)
print "Area real:", np.sum(pdf * np.diff(bins))

plt.plot(x, y, 'r-')
axes = plt.gca()
axes.set_xlim([-1.1,1.1])
plt.show()

plt.hist(z, bins=b, alpha=0.5, normed=1)
plt.plot(x, y, 'r-')
axes = plt.gca()
axes.set_ylim([0,0.5])
axes.set_xlim([-1.1,1.1])
plt.show()

plt.hist(z, bins=b, alpha=0.5, normed=1)
plt.plot(x, y, 'r-')
axes = plt.gca()
axes.set_ylim([0,0.01])
axes.set_xlim([-1.1,1.1])
plt.show()

# SET

In [None]:
# One normalised histogram per week (31..38) on a 4x4 grid; the title shows the histogram area.
plt.figure(figsize=(16, 10))
for panel, week in enumerate(range(31, 39), 1):
    axes = plt.subplot(4, 4, panel)
    flat_residuals = residuals[week].reshape(-1,)
    pdf, bins, patches = plt.hist(flat_residuals, bins=100, alpha=0.75, normed=1)
    axes.set_ylim([0, 0.2])
    area = np.sum(pdf * np.diff(bins))
    plt.title("week = {0}, area = {1:0.3f}".format(week, area))
plt.show()

In [None]:
# One raw-count histogram per week (31..38) on a 4x4 grid, titled with the week number.
plt.figure(figsize=(16, 10))
for panel, week in enumerate(range(31, 39), 1):
    plt.subplot(4, 4, panel)
    flat_residuals = residuals[week].reshape(-1,)
    plt.hist(flat_residuals, bins=250, alpha=0.75)
    plt.title(week)
plt.show()

In [None]:
# All eight weekly histograms overlaid in one figure, one colour per week.
plt.figure(figsize=(16, 10))
colors = ('r', 'g', 'b', 'y', 'c', 'm', 'grey', 'pink')
for week_color, week in zip(colors, range(31, 39)):
    flat_residuals = residuals[week].reshape(-1,)
    plt.hist(flat_residuals, bins=100, alpha=0.4, color=week_color)
axes = plt.gca()
# Clip the y-axis so the tails stay visible.
axes.set_ylim([0, 2500])
plt.show()