In [1]:
import cudf
import numpy as np
import pandas as pd
import plotly.express as px
import tools

# Locations of the input data, the trained models and the scratch directory.
DATA_PATH = '/data/'

MODEL_NAME = 'wv3_ru'
WORD_MODEL_NAME = 'word_top_full_wv3_ru'

# Directory paths end with a slash so that file names can be appended directly.
MODEL_PATH = DATA_PATH + 'models/' + MODEL_NAME + '/'
EVENTS_PATH = f'{DATA_PATH}events/lem/events.csv'
SOURCE_PATH = f'{DATA_PATH}captions/lem/'
TMP_PATH = DATA_PATH + 'tmp/' + WORD_MODEL_NAME + '/'


'success'

# Search grid: every cluster count from 5 to 150, and top-word cut-offs 5, 10, ..., 50.
n_clusters = [*range(5, 151)]
n_top = [*range(5, 51, 5)]

In [2]:
# City and year keys used to build the caption file names.
cities = ['moscow', 'spb']
years = ['2017', '2018', '2019', '2020']


def csv_path(path, city, year):
    """Return ``<path><city>_posts_<year>.csv``; all three arguments must be strings."""
    return ''.join([path, city, '_posts_', year, '.csv'])


# Only rows whose `lang` value is in this set are kept.
valid_langs = {'__label__ru'}

In [3]:
import math
import os
from collections import defaultdict

# While scanning, idf[word] holds the number of posts containing `word`
# (document frequency); it is converted to log(N / df) after the scan.
idf = defaultdict(int)
posts_len = 0

for city in cities:
    for year in years:
        df = pd.read_csv(csv_path(SOURCE_PATH, city, year))
        # Keep posts in an accepted language only. Missing captions are dropped
        # because NaN has no .split(); they are not counted as documents either.
        captions = df.loc[df.lang.isin(valid_langs), 'caption'].dropna()
        # Iterating the column directly avoids the per-row Series that
        # DataFrame.iterrows() builds.
        for caption in captions:
            # set(): each word counts at most once per post.
            for word in set(caption.split()):
                idf[word] += 1
        posts_len += len(captions)
        print(f'for {city}_{year} completed')

# Only existing keys are reassigned, so the dict does not change size while iterating.
for word in idf:
    idf[word] = math.log(posts_len / idf[word])

# Unlike `!mkdir`, this also creates missing parents and is silent on re-runs.
os.makedirs(TMP_PATH, exist_ok=True)
# One "<word>,<idf>" record per line. UTF-8 is set explicitly because the
# tokens are non-ASCII and the platform default encoding is not guaranteed.
with open(TMP_PATH + 'global_idf.txt', 'w', encoding='utf-8') as f:
    for w in idf:
        f.write(f'{w},{idf[w]}\n')

for moscow_2017 completed
for moscow_2018 completed
for moscow_2019 completed
for moscow_2020 completed
for spb_2017 completed
for spb_2018 completed
for spb_2019 completed
for spb_2020 completed


In [3]:
# Reload the word -> IDF table from the "<word>,<idf>" lines of global_idf.txt.
with open(TMP_PATH + 'global_idf.txt', 'r', encoding='utf-8') as f:
    # Split on the LAST comma only: a token may itself contain commas
    # (e.g. "1,5"), and a plain split(',') would then pick the wrong field
    # as the score. float() tolerates the trailing newline.
    idf = {
        word: float(score)
        for word, score in (line.rsplit(',', 1) for line in f)
    }


mkdir: cannot create directory ‘/data/tmp/word_top_full_wv3_ru/’: File exists


In [13]:
# Split the table into parallel tuples: w holds the words, s their IDF scores.
idf_pairs = list(idf.items())
w, s = zip(*idf_pairs)
# Smallest (idf, word) pair; ties on the score fall back to comparing the words.
lowest_idf = min(zip(s, w))
lowest_idf

(0.6485702919965086, 'в')

In [14]:
# Largest (idf, word) pair; ties on the score fall back to comparing the words.
highest_idf = max(zip(s, w))
highest_idf

(17.155032161371924, '𡘏')

In [15]:
# Smallest (idf, word) pair over the parallel tuples s (scores) and w (words).
smallest_pair = min(pair for pair in zip(s, w))
smallest_pair

(0.6485702919965086, 'в')

In [17]:
# Number of distinct tokens in the IDF table.
n_idf_tokens = len(idf)
n_idf_tokens

16463698

In [19]:
import gensim

# Only the keyed vectors are kept; the full Word2Vec model object is never
# bound to a name, so it can be released right after loading.
wv = gensim.models.Word2Vec.load(MODEL_PATH + 'mdl').wv

In [21]:
import os
from collections import Counter


def norm(m):
    """Scale every row of the 2-D array ``m`` to unit Euclidean length.

    A row whose norm is zero yields NaNs (division by zero).
    """
    return m / np.sqrt((m ** 2).sum(axis=1, keepdims=True))


def rank_words(document):
    """Return the ``(tf-idf, word)`` pairs of ``document``, highest score first.

    Only words that have both an embedding in ``wv`` and an entry in ``idf``
    are counted. ``w in wv`` is used instead of ``w in wv.vocab`` because the
    ``vocab`` attribute was removed in gensim 4, while membership on the keyed
    vectors works in both major versions. The term frequency is the raw count:
    dividing by the document length is unnecessary because the document
    vectors are normalised afterwards. The list is empty when no word of the
    document is known.
    """
    tf = Counter(w for w in document.split() if w in wv and w in idf)
    return sorted(((count * idf[w], w) for w, count in tf.items()), reverse=True)


events = tools.read_events(EVENTS_PATH)

# The ranking does not depend on `top`, so it is computed once instead of
# once per cut-off.
ranked = [rank_words(document) for document in events['description']]

# There is a small chance that a document contains no known word at all;
# such documents get no vector and are removed from the frame.
is_not_null = [len(pairs) > 0 for pairs in ranked]
events = events[is_not_null]
ranked = [pairs for pairs in ranked if pairs]

for top in n_top:
    # Document vector = sum of tf_idf(word) * vec(word) over the `top`
    # best-scoring words, then L2-normalised row by row.
    X = norm(np.array([
        sum(score * wv[word] for score, word in pairs[:top])
        for pairs in ranked
    ]))
    # NOTE(review): calculate_2d presumably attaches a 2-D projection of X to
    # the frame - confirm in tools. It receives the filtered events on every
    # iteration rather than the previous iteration's result.
    df = tools.calculate_2d(events, X)

    # TMP_PATH already ends with '/', so no extra separator is inserted
    # (this matches how the later cells build the same path).
    path = f'{TMP_PATH}{top}/'
    # Unlike `!mkdir`, this creates missing parents and is silent on re-runs.
    os.makedirs(path, exist_ok=True)
    np.save(path + 'X.npy', X)
    df.to_csv(path + 'df.csv', index=False)

In [17]:
# Labelled events the clusterings are scored against.
df_cross = pd.read_csv('cross_valid_union.csv')

for top in n_top:
    path = f'{TMP_PATH}{top}/'
    X = np.load(path + 'X.npy')

    # NOTE(review): k_means_list presumably runs k-means once per value in
    # n_clusters and returns the events with the labels attached - confirm in tools.
    df = tools.k_means_list(n_clusters, tools.read_events(path + 'df.csv'), X)
    df_scores = tools.calc_scores_list(n_clusters, df, df_cross)

    # Persist the scores and the labelled events next to the vectors.
    df_scores.to_csv(path + 'scores_km.csv', index=False)
    df.to_csv(path + 'df_km.csv', index=False)

In [22]:
# Labelled events the clusterings are scored against.
df_cross = pd.read_csv('cross_valid_union.csv')

for top in n_top:
    path = f'{TMP_PATH}{top}/'
    X = np.load(path + 'X.npy')

    # NOTE(review): agglomerative_list presumably runs agglomerative clustering
    # once per value in n_clusters and returns the labelled events - confirm in tools.
    df = tools.agglomerative_list(n_clusters, tools.read_events(path + 'df.csv'), X)
    df_scores = tools.calc_scores_list(n_clusters, df, df_cross)

    # Persist the scores and the labelled events next to the vectors.
    df_scores.to_csv(path + 'scores_ag.csv', index=False)
    df.to_csv(path + 'df_ag.csv', index=False)

In [25]:
# Look up the best f1 entry across all `top` directories of the agglomerative
# runs; the printed dict carries 'name', 'value', 'n_clusters' and 'n_semantic'.
best = tools.find_best_semantic_score(n_top, TMP_PATH, 'scores_ag.csv', 'f1')
print(best)

best_k = best['n_clusters']
path = f'{TMP_PATH}{best["n_semantic"]}/'
scores = pd.read_csv(path + 'scores_ag.csv')
df = tools.read_events(path + 'df_ag.csv')

# Figures are shown in this order: score curve, events, centroids.
score_fig = tools.plot_score(scores, 'f1')
score_fig.show()
events_fig = tools.plot_events(df, best_k)
events_fig.show()
centroids = tools.calc_centroids(df, best_k)
fig, _ = tools.plot_centroids(centroids, size_max=80, size_text_tags=1)
fig.show()

# Last expression of the cell: f1 over the whole (n_top, n_clusters) grid.
tools.plot_scores_3d(n_top, n_clusters, TMP_PATH, 'scores_ag.csv', 'f1')

{'name': 'f1', 'value': 0.8669448307834955, 'n_clusters': 13, 'n_semantic': 20}
