In [None]:
# load in dicts
import os
import pyodbc
import pandas as pd
import time
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path
from keras import backend as K
import datetime
import itertools

%matplotlib inline

# Dark plotting theme: white text, ticks and lines on a black canvas, bold 18pt font.
plt.rcParams.update({
    "lines.color": "white",
    "patch.edgecolor": "white",
    "text.color": "white",
    "axes.facecolor": "black",
    "axes.edgecolor": "lightgray",
    "axes.labelcolor": "white",
    "xtick.color": "white",
    "ytick.color": "white",
    "grid.color": "lightgray",
    "figure.facecolor": "black",
    "figure.edgecolor": "black",
    "savefig.facecolor": "black",
    "savefig.edgecolor": "black",
    "font.weight": 'bold',
    "font.size": 18})


# Read both loss logs first, so a failed read does not leave an empty figure open.
ae_loss = pd.read_csv('../../outputs/run_gd-tag-ae_loss.csv')
time_loss = pd.read_csv('../../outputs/run_gd-tag-time_loss.csv')
print(ae_loss.shape, time_loss.shape)
print(ae_loss.head())

# Two curves with unrelated scales share the x axis: AE loss on the left y axis
# (white), time loss on a twinned right y axis (orange).
fig, ax1 = plt.subplots(figsize=(12, 8))
ax1.plot(ae_loss['Value'], color='w', linewidth=2.5)  # linewidth is a float, not a string
ax1.set_xlabel('Epochs')
ax1.set_ylabel('AE Loss', color='w')
ax1.tick_params('y', colors='w')

ax2 = ax1.twinx()
ax2.plot(time_loss['Value'], color='orange', linewidth=2.5)
ax2.tick_params('y', colors='orange')
ax2.set_ylabel('Time Loss', color='orange')

fig.tight_layout()
plt.show()

In [None]:
def get_data(path):
    '''Return a dataframe listing the CSV files directly inside ``path``.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to scan (not recursive; only ``*.csv`` entries are kept).

    Returns
    -------
    pandas.DataFrame
        Columns ``'path'`` (the file path as a string) and ``'word'`` (the
        file name, extension included), one row per file. Rows are sorted by
        path so the order is reproducible between runs; the frame is empty
        when no CSV file is found.
    '''
    datadir = Path(path)
    # glob() yields entries in arbitrary, filesystem-dependent order, so sort them.
    files = [(str(f), f.name) for f in sorted(datadir.glob('*.csv'))]
    return pd.DataFrame(files, columns=['path', 'word'])

# @TODO
def time_score():
    """Placeholder: the time score is not written yet, so this only says so on stdout."""
    message = 'not implemented'
    print(message)

def determine_nearest_neighbors(encodings, labels):
    """Stub: both arguments are currently ignored and only a marker line is printed."""
    marker = 'test'
    print(marker)

# Index the encoded-dictionary CSV files: one row per file, columns 'path' and 'word'.
data_df = get_data(path='../../outputs/encoded_dicts/')
print(data_df)

In [None]:
# Read every CSV listed in data_df. The 'gd_hidden' converter strips the
# surrounding brackets and splits on whitespace, so that column holds a list of
# string tokens per row.
frames = [
    pd.read_csv(csv_path, converters={"gd_hidden": lambda x: x.strip("[]").split()})
    for csv_path in data_df['path']
]

# Concatenate once (growing a frame inside the loop copies it on every pass).
# ignore_index=True gives every row a unique 0..n-1 label; otherwise each file
# restarts at 0 and index labels can no longer be used as row positions.
encode_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(encode_df.shape)
print(encode_df.columns)
# Uncomment to work on a subset:
# encode_df = encode_df[:5000]

In [None]:
from sklearn.decomposition import PCA

# Collect the per-row 'gd_hidden' entries and stack them into a float matrix
# (one row per entry of encode_df); y holds the matching 'label' values.
gd_hidden_list = encode_df['gd_hidden'].tolist()
X = np.asarray(gd_hidden_list).astype(float)
y = encode_df['label'].values

print(X[:5])

In [None]:
# Quick look at the feature matrix: shape, container type and row type, one per line.
print(X.shape, type(X), type(X[0]), sep='\n')

# Project the features onto their first two principal components and report
# the share of variance each component explains.
pca = PCA(n_components=2)
X_transformed = pca.fit_transform(X)
print(pca.explained_variance_ratio_)

In [None]:
print(X_transformed.shape, y.shape)

# Pass the size to subplots() itself: a separate plt.figure(figsize=...) call
# opens its own empty figure and the size never reaches the axes drawn on here.
fig, ax = plt.subplots(figsize=(12, 8))

# Plot a random subset of 500 rows (randint draws with replacement, so a row
# can appear more than once).
indices = np.random.randint(0, X_transformed.shape[0], 500)

ax.scatter(X_transformed[indices, 0], X_transformed[indices, 1],
           c=y[indices], cmap='ocean')
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
plt.show()

In [None]:
from sklearn.manifold import TSNE

# Draw 2000 row positions at random (with replacement) and embed only those
# rows in two dimensions with t-SNE.
indices = np.random.randint(low=0, high=X.shape[0], size=2000)

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X[indices])

In [None]:
# Report the shapes that are actually plotted: the t-SNE embedding and the
# labels of the sampled rows.
print(X_tsne.shape, y[indices].shape)

# Pass the size to subplots() itself: a separate plt.figure(figsize=...) call
# opens its own empty figure and the size never reaches the axes drawn on here.
fig, ax = plt.subplots(figsize=(12, 8))

# X_tsne rows follow `indices`, so the colours must be taken from y[indices].
ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y[indices], cmap='Set1_r')
plt.show()

In [None]:
print(encode_df.shape)

# Absolute difference between the 'gd_time' and 'y_time' columns for every row
# (presumably predicted vs. target time; units are not visible here).
time_error = (encode_df['gd_time'] - encode_df['y_time']).abs()

# Cumulative counts: a row within 30 is also within 60, 90 and 180.
# Rows whose difference is NaN compare False and are counted nowhere.
sums = {str(limit): int((time_error <= limit).sum()) for limit in (30, 60, 90, 180)}

# Fraction of rows inside each tolerance; NaN rather than a crash on an empty frame.
n_rows = encode_df.shape[0]
print(*(sums[key] / n_rows if n_rows else float('nan')
        for key in ('30', '60', '90', '180')))

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import paired_distances, pairwise_distances

# 3 compare distance to other sequences from same person
# 4 % of nearest neighbors that are same class
# 5 fit linear regression from hidden layer to time from index
# 6 scale hidden features by linear regression 
# 7 compare distances to other sequences from same person
# 8 % of nn that are same class

def avg_euclidean_dist(enc_list):
    """Mean Euclidean distance over every unordered pair of encodings.

    ``enc_list`` is any sequence of equal-length numeric vectors. With fewer
    than two vectors there are no pairs and the final division raises
    ``ZeroDivisionError``.
    """
    points = np.asarray(enc_list)
    total = 0
    n_pairs = 0
    for first, second in itertools.combinations(points, 2):
        total += np.linalg.norm(first - second)
        n_pairs += 1
    return total / n_pairs

def _parse_hidden(value):
    """Return one hidden-state encoding as a 1-D float array.

    Accepts the raw CSV text (numbers separated by whitespace, optionally
    wrapped in brackets), a list of numeric tokens, or an existing array, so
    the cell can be re-run without failing on already-converted values.
    """
    if isinstance(value, str):
        value = value.replace('[', '').replace(']', '').split()
    return np.asarray(value, dtype=float)


# 'gd_hidden' may already be a token list (when read through a converter)
# while 'ae_hidden' is still raw text; normalise both to float arrays.
encode_df['ae_hidden'] = encode_df['ae_hidden'].apply(_parse_hidden)
encode_df['gd_hidden'] = encode_df['gd_hidden'].apply(_parse_hidden)

gd_hidden_list = encode_df['gd_hidden'].tolist()
ae_hidden_list = encode_df['ae_hidden'].tolist()

gd_hidden_matrix = np.asarray(gd_hidden_list)
ae_hidden_matrix = np.asarray(ae_hidden_list)

# Full sample-by-sample distance matrices (pairwise_distances default metric),
# computed on all available cores.
gd_distances = pairwise_distances(gd_hidden_matrix, n_jobs=-1)
ae_distances = pairwise_distances(ae_hidden_matrix, n_jobs=-1)

avg_gd_pre = []
avg_ae_pre = []
members = {}
member_ids = encode_df['memberNum'].to_numpy()
for memberNum in encode_df['memberNum'].unique():
    # Row *positions* of this member: they index the distance matrices and
    # .iloc correctly even when the frame's index labels are not 0..n-1.
    positions = np.flatnonzero(member_ids == memberNum)
    if positions.size == 0:
        # e.g. a NaN member id, which never compares equal to itself
        continue
    start, end = int(positions[0]), int(positions[-1])
    # NOTE(review): start/end describe a contiguous block only if each
    # member's rows are adjacent in encode_df - confirm against the input files.
    members[memberNum] = {'start': start,
                          'end': end,
                          'label': encode_df['label'].iloc[start]}
    if positions.size < 2:
        # A single sequence has no pair to measure; skip it explicitly
        # instead of swallowing the resulting ZeroDivisionError.
        continue
    member = encode_df.iloc[positions]
    avg_gd_pre.append(avg_euclidean_dist(member['gd_hidden'].tolist()))
    avg_ae_pre.append(avg_euclidean_dist(member['ae_hidden'].tolist()))

print(members.keys())
print(len(avg_gd_pre), len(avg_ae_pre))
if avg_gd_pre:
    print(sum(avg_gd_pre)/len(avg_gd_pre),
          sum(avg_ae_pre)/len(avg_ae_pre))
else:
    print('no member has at least two sequences; average distance undefined')

# Total per-dimension variance of each encoding.
print(sum(abs(np.var(gd_hidden_matrix, axis=0))), sum(abs(np.var(ae_hidden_matrix, axis=0))))

In [None]:
N = 10
# argsort on the negated distances orders each column from farthest to nearest;
# reversing the rows and keeping the first N leaves, for every column, the
# indices of its N closest rows, nearest first.
gd_neighbors = np.argsort(-gd_distances, axis=0)[::-1][:N]
ae_neighbors = np.argsort(-ae_distances, axis=0)[::-1][:N]

In [None]:
# Show the gd neighbour-index array computed above.
print(gd_neighbors)

In [None]:
print(type(gd_neighbors), gd_neighbors.shape)

N = 10


def _count_label_matches(neighbors, members, labels, n_neighbors):
    """Count neighbours from other members that share / do not share a member's label.

    Parameters
    ----------
    neighbors : ndarray, shape (k, n_samples)
        Column ``i`` lists the row positions closest to sample ``i``, nearest
        first. Row 0 is skipped on the assumption that it is the sample itself.
    members : dict
        Maps a member id to ``{'start', 'end', 'label'}``, where start..end
        (both inclusive) are the row positions of that member's samples.
    labels : ndarray, shape (n_samples,)
        Label of every sample, addressed by row position.
    n_neighbors : int
        Maximum number of other-member neighbours counted per sample.

    Returns
    -------
    (int, int)
        Number of counted neighbours whose label equals / differs from the
        member's label.
    """
    match = 0
    mismatch = 0
    for info in members.values():
        start, end = info['start'], info['end']
        # Only this member's own samples are queried; `end` is inclusive.
        for col in range(start, end + 1):
            ranked = neighbors[1:, col]
            # Drop neighbours that belong to the same member, then cap the count.
            outside = ranked[(ranked < start) | (ranked > end)][:n_neighbors]
            same = int(np.count_nonzero(labels[outside] == info['label']))
            match += same
            mismatch += int(outside.size) - same
    return match, mismatch


# Look the labels up once as an array instead of calling .iloc in the inner loop.
sample_labels = encode_df['label'].to_numpy()

gd_label_match, gd_label_mismatch = _count_label_matches(
    gd_neighbors, members, sample_labels, N)
# The AE counts are derived from the AE neighbour ranking only.
ae_label_match, ae_label_mismatch = _count_label_matches(
    ae_neighbors, members, sample_labels, N)

print(gd_label_match, gd_label_mismatch)
print(ae_label_match, ae_label_mismatch)

In [None]:
import seaborn as sns
# Mean of the per-member average distances, gd encoding first, then AE.
print(sum(avg_gd_pre) / len(avg_gd_pre))
print(sum(avg_ae_pre) / len(avg_ae_pre))

# One column per model, one row per member.
df = pd.DataFrame({'TASeq2Seq': avg_gd_pre, 'RecurrentAE': avg_ae_pre})

# Dark theme with white outlines: the colour keys are grouped by value.
plt.rcParams.update({
    **dict.fromkeys(
        ("lines.color", "patch.edgecolor", "text.color", "axes.edgecolor",
         "axes.labelcolor", "xtick.color", "ytick.color", "grid.color",
         "figure.edgecolor"),
        "white"),
    **dict.fromkeys(
        ("axes.facecolor", "figure.facecolor", "savefig.facecolor",
         "savefig.edgecolor"),
        "black"),
    "font.weight": 'bold',
    "font.size": 18,
})

# Distribution per model as a violin, with the individual members overlaid as white dots.
plt.figure(figsize=(12,8))
sns.violinplot(data=df)
sns.swarmplot(data=df, color='white')

In [None]:
# redo for corrected encoding

# NOTE(review): assumes 'gd_hidden', 'ae_hidden' and 'gd_time_hidden' already
# hold numeric vectors at this point - confirm where 'gd_time_hidden' is created.
gd_hidden_list = encode_df['gd_hidden'].tolist()
ae_hidden_list = encode_df['ae_hidden'].tolist()
gd_corrected_list = encode_df['gd_time_hidden'].tolist()

gd_corrected_matrix = np.asarray(gd_corrected_list)
gd_hidden_matrix = np.asarray(gd_hidden_list)
ae_hidden_matrix = np.asarray(ae_hidden_list)

# Full sample-by-sample distance matrices for the three encodings.
gd_distances = pairwise_distances(gd_hidden_matrix, n_jobs=-1)
gd_corrected_distances = pairwise_distances(gd_corrected_matrix, n_jobs=-1)
ae_distances = pairwise_distances(ae_hidden_matrix, n_jobs=-1)

avg_gd_pre = []
avg_ae_pre = []
avg_corrected_pre = []

members = {}
member_ids = encode_df['memberNum'].to_numpy()
for memberNum in encode_df['memberNum'].unique():
    # Row *positions* of this member: they index the distance matrices and
    # .iloc correctly even when the frame's index labels are not 0..n-1.
    positions = np.flatnonzero(member_ids == memberNum)
    if positions.size == 0:
        # e.g. a NaN member id, which never compares equal to itself
        continue
    start, end = int(positions[0]), int(positions[-1])
    members[memberNum] = {'start': start,
                          'end': end,
                          'label': encode_df['label'].iloc[start]}
    if positions.size < 2:
        # A single sequence has no pair to average; avg_euclidean_dist would
        # divide by zero, so such members are left out of the averages.
        continue
    member = encode_df.iloc[positions]
    avg_gd_pre.append(avg_euclidean_dist(member['gd_hidden'].tolist()))
    avg_corrected_pre.append(avg_euclidean_dist(member['gd_time_hidden'].tolist()))
    avg_ae_pre.append(avg_euclidean_dist(member['ae_hidden'].tolist()))

print(members.keys())
print(len(avg_gd_pre), len(avg_ae_pre), len(avg_corrected_pre))
if avg_gd_pre:
    print(sum(avg_gd_pre)/len(avg_gd_pre),
          sum(avg_ae_pre)/len(avg_ae_pre),
          sum(avg_corrected_pre)/len(avg_corrected_pre))
else:
    print('no member has at least two sequences; average distance undefined')

# Total per-dimension variance of each encoding: gd, AE, corrected gd.
print(sum(abs(np.var(gd_hidden_matrix, axis=0))),
      sum(abs(np.var(ae_hidden_matrix, axis=0))),
      sum(abs(np.var(gd_corrected_matrix, axis=0))))

N = 50
# argsort on the negated distances orders each column from farthest to nearest;
# the reversed tail slice keeps the N closest rows per column, nearest first.
gd_neighbors = np.argsort(-gd_distances, axis=0)[-1:-1-N:-1]
ae_neighbors = np.argsort(-ae_distances, axis=0)[-1:-1-N:-1]
corrected_neighbors = np.argsort(-gd_corrected_distances, axis=0)[-1:-1-N:-1]