In [None]:
import cudf
import numpy as np
import pandas as pd
import plotly.express as px
import semlib
import tools

# Paths to the input data, the semantic model and the output directory.
DATA_PATH = '/data/'

SEM_MODEL_NAME = 'sm_wv3_ru_norm'
# Used both as the model sub-directory name and as the length of the
# per-post semantic vectors built further down.
OPT_SEM = 70
SEM_MODEL_PATH = DATA_PATH + f'models/{SEM_MODEL_NAME}/{OPT_SEM}/'
IDS_PATH = DATA_PATH + 'psql/spb_posts_28_02_2020.csv'
POST_PATH = DATA_PATH + 'captions/lem/spb_posts_2020.csv'
TMP_PATH = f'{DATA_PATH}tmp/one_day/28_02_2020/'

# Grid of semantic-model sizes passed to semlib.find_best_score / score_plot_3d.
# This definition used to be commented out, which made every later cell that
# reads `n_semantic` fail with NameError on a fresh kernel.
# NOTE(review): values restored from the commented-out line - confirm that
# score files exist under TMP_PATH for each of these sizes.
n_semantic = list(range(40, 130, 10)) + [150, 250, 500, 750, 1000]

# Candidate numbers of clusters (5..150 inclusive) passed to
# semlib.agglomerative_list and semlib.score_plot_3d.
n_clusters = list(range(5, 151))

In [None]:
# Ids of the posts listed in the single-day export.
ids = set(pd.read_csv(IDS_PATH)['id'].tolist())
print(f'posts in day: {len(ids)}')

# Captions for the whole year (presumably lemmatised, judging by the
# 'captions/lem/' path - confirm); only the selected day's rows are kept.
df = pd.read_csv(POST_PATH)
print(f'posts in year: {len(df)}')

df = df.loc[df['id'].isin(ids)]
print(f'non empty posts: {len(df)}')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
    
import os

# Create the output directory together with any missing parents. Unlike
# `!mkdir`, this is silent when the directory already exists, so the cell can
# be re-run safely.
os.makedirs(TMP_PATH, exist_ok=True)

# Parse `<word>,<label>` lines of the model's labels file. Lines that do not
# split into exactly two comma-separated fields are skipped.
labels = []
with open(f'{SEM_MODEL_PATH}labels.txt') as f:
    for line in f:
        fields = line.split(',')
        if len(fields) != 2:
            continue
        # int() ignores surrounding whitespace, so the trailing newline needs
        # no manual slicing. The former `[:-1]` chopped the last digit (or
        # raised on an empty string) when the final line had no newline.
        labels.append([fields[0], int(fields[1])])

# word -> label; for a repeated word the last occurrence wins
w2l = {word: label for word, label in labels}

# Build one semantic vector per post: for every label, the share of the
# caption's known words (those present in w2l) that map to that label.
# Posts without any known word get no vector and are dropped from df.
X = []
is_not_empty = []

for caption in df['caption']:
    known_labels = [w2l[token] for token in caption.split() if token in w2l]
    if not known_labels:
        is_not_empty.append(False)
        continue

    vec = np.zeros(OPT_SEM)
    for label in known_labels:
        vec[label] += 1

    X.append(vec / len(known_labels))
    is_not_empty.append(True)

X = np.array(X)
df = df[is_not_empty]  # keep rows aligned with the rows of X
print(f'non empty after filtering words: {len(df)}')

# Standardise the semantic vectors with StandardScaler's default settings.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Compute a 2-D embedding of the scaled vectors for plotting; random_state is
# fixed so repeated runs use the same seed.
tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
X_2d = tsne.fit_transform(X)
x_vals, y_vals = zip(*X_2d)

# `df` is the result of boolean filtering, so assigning columns in place
# triggers pandas' SettingWithCopyWarning. assign() returns a new frame with
# the coordinates attached instead.
df = df.assign(x=x_vals, y=y_vals)

# Persist the posts together with their 2-D coordinates.
df.to_csv(f'{TMP_PATH}df.csv', index=False)

In [None]:
max_row_len = 80


def _wrap_caption(caption, limit):
    """Re-join the caption's words, each followed by a space, inserting '<br>'
    before any word that would push the current row past `limit` characters."""
    pieces = []
    row_len = 0
    for word in caption.split():
        if row_len + len(word) > limit:
            pieces.append('<br>')
            row_len = 0
        pieces.append(word + ' ')
        row_len += len(word) + 1
    return ''.join(pieces)


# Hover text for the scatter plots: the caption wrapped with HTML line breaks.
rows_hover_name = [_wrap_caption(caption, max_row_len) for caption in df['caption']]
df['hover_name'] = rows_hover_name

In [None]:
# Hover text holding only the caption's words that start with '#' or '@',
# each followed by '<br>'; posts without such words get an empty string.
rows_hover_tags = [
    ''.join(token + '<br>' for token in caption.split() if token.startswith(('#', '@')))
    for caption in df['caption']
]
df['hover_tags'] = rows_hover_tags

In [None]:
# Print the 2-D coordinates of posts containing a word that reads 'rock' after
# its first character (e.g. '#rock'); one line is printed per matching word.
for _, post in df.iterrows():
    for token in post.caption.split():
        if token[1:] != 'rock':
            continue
        print(post.x, post.y)

In [None]:
# `px` is already imported in the first cell, so it is not re-imported here.

# All posts; hovering shows the wrapped caption.
print(len(df))
fig_all = px.scatter(df, x='x', y='y', hover_name='hover_name')
fig_all.show()

# Only posts that have at least one '#'/'@' word; hovering shows those words.
df_tmp = df[df['hover_tags'] != '']
print(len(df_tmp))
fig_tags = px.scatter(df_tmp, x='x', y='y', hover_name='hover_tags')
fig_tags.show()

In [None]:
# Scatter plot of every post in the 2-D embedding; hovering shows the caption.
fig_all = px.scatter(df, x='x', y='y', hover_name='hover_name')
fig_all.show()

In [None]:
# `semlib` is already imported in the first cell.
# Run semlib's agglomerative clustering helper for every candidate cluster
# count in n_clusters on the scaled vectors X; the helper's result replaces df.
# NOTE(review): presumably it adds cluster assignments to df - confirm in semlib.
df = semlib.agglomerative_list(
    n_clusters,
    df,
    X,
)

In [None]:
# Persist the frame returned by the agglomerative step next to df.csv.
df.to_csv(f'{TMP_PATH}df_ag.csv', index=False)

In [None]:
# Look up the best 'f1' entry among the 'scores_union.csv' files for the
# n_semantic grid. Later cells read the result as
# (score, n_semantic, n_clusters) via best[0..2].
scores_file = 'scores_union.csv'
best = semlib.find_best_score(n_semantic, TMP_PATH, scores_file, 'f1')
best

In [None]:
# Same lookup as for the union scores, but over the 'scores_ag.csv' files.
scores_file = 'scores_ag.csv'
best = semlib.find_best_score(n_semantic, TMP_PATH, scores_file, 'f1')
best

In [None]:
# 3-D plot of the 'f1' score over the (n_semantic, n_clusters) grid, read from
# the 'scores_ag.csv' files under TMP_PATH.
fig = semlib.score_plot_3d(
    n_semantic,
    n_clusters,
    TMP_PATH,
    'scores_ag.csv',
    scores_name='f1',
)
fig.show()

In [None]:
# 3-D plot of the 'f1' score over the (n_semantic, n_clusters) grid, read from
# the 'scores_union.csv' files under TMP_PATH.
fig = semlib.score_plot_3d(
    n_semantic,
    n_clusters,
    TMP_PATH,
    'scores_union.csv',
    scores_name='f1',
)
fig.show()

In [None]:
# Pick the best agglomerative configuration by 'f1' and report it.
best = semlib.find_best_score(n_semantic, TMP_PATH, 'scores_ag.csv', 'f1')
print(f'score: {best[0]}, optimal_n_semantic: {best[1]}, optimal_n_clusters: {best[2]}')

# Artefacts of a model live in a sub-directory named after its n_semantic.
model = f'{best[1]}/'
model_dir = TMP_PATH + model
df = tools.read_events(f'{model_dir}df_ag.csv')
df_scores = pd.read_csv(f'{model_dir}scores_ag.csv')

# Score curves for the selected model.
score_fig = tools.plot_score(df_scores, y=['f1', 'recall', 'precision'])
score_fig.show()

# Cluster plot for the optimal number of clusters (passed as a string).
clusters_fig = tools.plot_clusters(df, str(best[2]))
clusters_fig.show()

# Centroids of the optimal clustering and their plot; the cell's output is the
# number of rows in the frame returned alongside the figure.
df_centroids = tools.create_centroids(df, int(best[2]), use_norm=False, hashtags_size=20)
fig, df_filtered = tools.plot_centroids(df_centroids, size_max=80, size_text_tags=1)
fig.show()
len(df_filtered)

In [None]:
# Pick the best 'union' configuration by 'f1' and report it.
best = semlib.find_best_score(n_semantic, TMP_PATH, 'scores_union.csv', 'f1')
print(f'score: {best[0]}, optimal_n_semantic: {best[1]}, optimal_n_clusters: {best[2]}')

# Artefacts of a model live in a sub-directory named after its n_semantic.
model = f'{best[1]}/'
model_dir = TMP_PATH + model
df = tools.read_events(f'{model_dir}df.csv')
df_scores = pd.read_csv(f'{model_dir}scores_union.csv')

# Score curves for the selected model.
score_fig = tools.plot_score(df_scores, y=['f1', 'recall', 'precision'])
score_fig.show()

# Cluster plot for the optimal number of clusters (passed as a string).
clusters_fig = tools.plot_clusters(df, str(best[2]))
clusters_fig.show()

# Centroids of the optimal clustering and their plot; the cell's output is the
# number of rows in the frame returned alongside the figure.
df_centroids = tools.create_centroids(df, int(best[2]), use_norm=False, hashtags_size=20)
fig, df_filtered = tools.plot_centroids(df_centroids, size_max=80, size_text_tags=1)
fig.show()
len(df_filtered)