In [24]:
import cudf
import numpy as np
import pandas as pd
import plotly.express as px
import tools

# init paths to data and models
DATA_PATH = '/data/'

SEM_MODEL_NAME = 'sm_wv4_ru_norm'
OPT_SEM = 100
SEM_MODEL_PATH = DATA_PATH + f'models/{SEM_MODEL_NAME}/'
IDS_PATH = DATA_PATH + 'psql/spb_posts_28_02_2020.csv'
POST_PATH = DATA_PATH + 'captions/lem/spb_posts_2020.csv'
DATE = '28_02_2020'
TMP_PATH = f'{DATA_PATH}tmp/one_day/'
!mkdir {TMP_PATH}
FULL_PATH = f'{TMP_PATH}{DATE}/'
!mkdir {FULL_PATH}
n_clusters = list(range(5, 251))

mkdir: cannot create directory ‘/data/tmp/one_day/’: File exists
mkdir: cannot create directory ‘/data/tmp/one_day/28_02_2020/’: File exists


In [None]:
PGPASSWORD=secretpwd psql -h 10.9.14.132 -U secretuser -d spb -c "\copy (select shortcode as code from posts where timestamp between 1582848000 and 1582934400) to 'spb_posts_28_02_2020.csv' csv header;"

In [13]:
df = pd.read_csv(IDS_PATH)
ids = set(df['code'].tolist())
print(f'posts in day: {len(ids)}')

df = pd.read_csv(POST_PATH)
print(f'posts in year: {len(df)}')

df = df[df['code'].isin(ids)]
print(f'non empty posts: {len(df)}')

!mkdir {TMP_PATH}
df.to_csv(r'' + TMP_PATH + 'df.csv', index=False)

posts in day: 21806
posts in year: 2353209
non empty posts: 18925


In [22]:
# persist the current posts frame into the per-day directory
df.to_csv(f'{FULL_PATH}df_lemmed.csv', index=False)

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE


# NOTE(review): SEM_MODEL_PATH ends with '/', so it looks like a directory rather
# than a CSV file, and `df_words` is not used anywhere below — TODO confirm which
# file of the model is supposed to be read here.
df_words = pd.read_csv(SEM_MODEL_PATH)

# NOTE(review): `w2l` is not defined in any visible cell. The loop below uses it
# as a mapping word -> integer index in [0, OPT_SEM); presumably it should be
# built from `df_words` — TODO define it explicitly before this point.


# calculate semantic vectors for events:
# every caption becomes a vector of length OPT_SEM whose i-th entry is the share
# of the caption's known words (those present in w2l) that map to index i
X = []
is_not_empty = []

for caption in df['caption']:
    # a missing caption is read back from CSV as NaN (a float); treat it as empty
    # instead of crashing on .split()
    tokens = caption.split() if isinstance(caption, str) else []
    words = [w for w in tokens if w in w2l]

    if not words:
        is_not_empty.append(False)
        continue

    vec = np.zeros(OPT_SEM)
    for word in words:
        vec[w2l[word]] += 1
    X.append(vec / len(words))
    is_not_empty.append(True)

X = np.array(X)
# .copy() detaches the filtered rows from the original frame, so adding the
# 'x' / 'y' columns below does not write into a slice (SettingWithCopyWarning)
df = df[is_not_empty].copy()
print(f'non empty after filtering words: {len(df)}')

# scaling of semantic vectors
scaler = StandardScaler()
X = scaler.fit_transform(X)


# calculate 2d embedding for events
tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
X_2d = tsne.fit_transform(X)
df['x'] = X_2d[:, 0]
df['y'] = X_2d[:, 1]

df.to_csv(TMP_PATH + 'df.csv', index=False)
# the scaled vectors go to the per-day directory, which is where the notebook
# later reloads them from (np.load(FULL_PATH + 'X.npy'))
np.save(FULL_PATH + 'X.npy', X)

mkdir: cannot create directory ‘/data/tmp/one_day/28_02_2020/’: File exists


NameError: name 'df' is not defined

In [27]:
# reload the scaled semantic vectors of the selected day.
# np.load's second positional parameter is `mmap_mode`, so the array must not be
# passed there; dropping it also lets the cell run on a fresh kernel where `X`
# does not exist yet.
X = np.load(FULL_PATH + 'X.npy')

In [33]:
# sanity check on one row of X: the hand-written Euclidean norm (printed)
# next to numpy's own implementation (shown as the cell result)
x = X[1]
manual_norm = np.sqrt(np.square(x).sum())
print(manual_norm)
np.linalg.norm(x)

3.9990891250458485


3.999089125045849

In [None]:
# Build the hover text for every post: the caption re-flowed with '<br>' line
# breaks so that a line is broken before the word that would push it past
# max_row_len characters.
max_row_len = 80
rows_hover_name = []

for caption in df['caption']:
    pieces = []
    used = 0  # characters already placed on the current hover line
    for token in caption.split():
        if used + len(token) > max_row_len:
            pieces.append('<br>')
            used = 0
        pieces.append(token + ' ')
        used += len(token) + 1
    rows_hover_name.append(''.join(pieces))

df['hover_name'] = rows_hover_name

In [None]:
# For every post collect the words that start with '#' or '@', one per hover
# line (each followed by '<br>'); posts without such words get an empty string.
rows_hover_tags = [
    ''.join(token + '<br>' for token in caption.split() if token[0] in ('#', '@'))
    for caption in df['caption']
]
df['hover_tags'] = rows_hover_tags

In [None]:
import plotly.express as px

# all posts in the 2d embedding, hover shows the wrapped caption
print(len(df))
fig_all = px.scatter(df, x='x', y='y', hover_name='hover_name')
fig_all.show()

# only the posts that have at least one '#'/'@' word, hover shows those words
df_tmp = df[df['hover_tags'] != '']
print(len(df_tmp))
fig_tagged = px.scatter(df_tmp, x='x', y='y', hover_name='hover_tags')
fig_tagged.show()

In [None]:
# Run the project helper tools.agglomerative_list on the posts frame and the
# semantic vectors X, and rebind `df` to its result.
# At this point n_clusters is the list of candidate cluster counts from the
# config cell (range(5, 251)).
# NOTE(review): later cells read df[str(k)] for a chosen k, so the helper
# presumably adds one label column per cluster count — confirm in tools.
# NOTE(review): the cell overwrites its own input (`df`) and a later cell rebinds
# n_clusters to an int, so re-running this cell out of order will not behave the same.
df = tools.agglomerative_list(n_clusters, df, X)

In [7]:
from collections import Counter
from operator import itemgetter

# Summarise every cluster of the chosen clustering (column str(n_clusters) of df):
# mean 2d position, the top_words most frequent caption words and the cluster size.
top_words = 20
n_clusters = 75  # from here on n_clusters is a single cluster count, not the list
df_centroids = pd.DataFrame([], columns=['x', 'y', 'label', 'hover_name', 'name', 'size'])

for label in range(n_clusters):
    members = df[df[str(n_clusters)] == label]
    word_counts = Counter(' '.join(members['caption']).split())

    # list of (word, count) pairs, most frequent first
    name = word_counts.most_common(top_words)
    hover_name = '<br>'.join(word for word, _ in name)

    df_centroids.loc[len(df_centroids)] = [
        members['x'].mean(),
        members['y'].mean(),
        float(label),
        hover_name,
        name,
        np.int64(len(members)),
    ]

df_centroids['size'] = df_centroids['size'].astype(np.int64)

NameError: name 'df' is not defined

In [8]:
# posts coloured by their label in the chosen clustering
fig_posts = px.scatter(df, x='x', y='y', color=str(n_clusters), hover_name='hover_name')
fig_posts.show()

# one bubble per cluster; the on-plot text is the first hover line only
# (everything before the first '<br>')
df_centroids['text'] = df_centroids['hover_name'].map(lambda s: s.split('<br>', 1)[0])
fig_centroids = px.scatter(
    df_centroids,
    x="x",
    y="y",
    color='label',
    text='text',
    size='size',
    hover_name='hover_name',
    size_max=80,
)
fig_centroids.show()

NameError: name 'df' is not defined

In [9]:
from collections import Counter
from operator import itemgetter

IDF_PATH = f'{DATA_PATH}captions/idf_ru.txt'
# Every non-blank line of the IDF file is parsed as "<word>,<idf>". Splitting on
# the LAST comma keeps a comma inside the word from shifting the numeric field.
idf = {}
with open(IDF_PATH, 'r') as f:
    for line in f:
        if not line.strip():
            continue
        word, value = line.rstrip('\n').rsplit(',', 1)
        idf[word] = float(value)

top = 20

# Start from an empty table: appending to the frame left by the frequency-based
# cell would mix both summaries, and fails outright once that frame has gained
# the extra 'text' column (6 values cannot be assigned to a 7-column row).
df_centroids = pd.DataFrame([], columns=['x', 'y', 'label', 'hover_name', 'name', 'size'])

for i in range(n_clusters):
    cluster = df[df[str(n_clusters)] == i]
    x = cluster['x'].mean()
    y = cluster['y'].mean()

    # the cluster's "document" is all of its captions joined together
    document = ' '.join(cluster['caption'])

    # counting words from document, which have vector in Word Embedding Model
    # NOTE(review): `wv` is not defined in any visible cell; it is used here as an
    # object with a `.vocab` container — TODO load the embedding model before this cell.
    tf = Counter(w for w in document.split() if (w in wv.vocab) and (w in idf))
    # calculating tf-idf
    tf_idf = [(w, tf[w] * idf[w]) for w in tf]
    # sorting words of the document by tf_idf and slicing top of them
    top_words = sorted(tf_idf, key=itemgetter(1), reverse=True)[:top]
    hover_name = '<br>'.join([f'{w}: {s}' for w, s in top_words])

    name = [w for w, s in top_words]
    df_centroids.loc[len(df_centroids)] = [x, y, float(i), hover_name, name, np.int64(len(cluster))]

df_centroids['size'] = df_centroids['size'].astype(np.int64)

NameError: name 'df' is not defined