In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import logging
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Locations of the input data and of the trained model artefacts
DATA_PATH = '/data/'
MODEL_NAME = 'd2v2_ru'

MODEL_PATH = f'{DATA_PATH}models/{MODEL_NAME}/'
EVENTS_PATH = f'{DATA_PATH}events/cleaned/events.csv'
SOURCE_PATH = f'{DATA_PATH}captions/lem/'

# One captions CSV is read per (city, year) combination
cities = ['moscow', 'spb']
years = [str(year) for year in range(2016, 2021)]

# Scratch directory for intermediate results
TMP_PATH = DATA_PATH + 'tmp/d2v/'

# Values of the `lang` column that are kept when filtering posts
valid_langs = {'__label__ru'}
calc_scrore_names = ['Calinski–Harabasz']
    
def csv_path(path, city, year):
    """Build the path of the posts CSV for one city and one year.

    The result is ``<path><city>_posts_<year>.csv``. ``path`` is used as a
    plain string prefix: no separator is inserted between it and the file name.
    """
    pieces = (path, city, '_posts_', year, '.csv')
    return ''.join(pieces)

# Show INFO-level log records (timestamp, level, message) in the notebook output
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

loading posts and preparing them for training

In [None]:
# Build the training corpus: one TaggedDocument per caption, tagged with a
# running integer.
documents = []

for city in cities:
    for year in years:
        df = pd.read_csv(csv_path(SOURCE_PATH, city, year))
        # keep only the posts whose language label is in valid_langs
        df = df[df.lang.isin(valid_langs)]
        # Continue the numbering from the documents collected so far so that
        # tags do not restart at 0 for every file. (This offset was previously
        # read from a name, `size`, that is never assigned in the notebook.)
        offset = len(documents)
        documents.extend(
            TaggedDocument(doc.split(), [tag])
            for tag, doc in enumerate(df['caption'], start=offset)
        )
        del df
    print(f'for {city} loaded posts')

print(f'\nloaded all posts {len(documents)}\n')

training the model

In [None]:
import os

# Train the Doc2Vec model on the tagged captions
model = Doc2Vec(documents, vector_size=100, window=15, min_count=5, negative=15, dm=1, dbow_words=1, epochs=10, workers=35)
# Create the model directory together with any missing parents; unlike a bare
# `mkdir` this neither fails when the directory already exists nor leaves
# model.save() without a target when a parent directory is missing.
os.makedirs(MODEL_PATH, exist_ok=True)
model.save(MODEL_PATH + 'mdl')

If the model has already been trained, load the saved model instead.

In [2]:
# Restore the previously saved Doc2Vec model from MODEL_PATH
model = Doc2Vec.load(f'{MODEL_PATH}mdl')

2020-07-07 23:41:54,766 : INFO : loading Doc2Vec object from /data/models/d2v2_ru/mdl
2020-07-07 23:42:17,684 : INFO : loading vocabulary recursively from /data/models/d2v2_ru/mdl.vocabulary.* with mmap=None
2020-07-07 23:42:17,687 : INFO : loading trainables recursively from /data/models/d2v2_ru/mdl.trainables.* with mmap=None
2020-07-07 23:42:17,690 : INFO : loading syn1neg from /data/models/d2v2_ru/mdl.trainables.syn1neg.npy with mmap=None
2020-07-07 23:42:21,548 : INFO : loading vectors_docs_lockf from /data/models/d2v2_ru/mdl.trainables.vectors_docs_lockf.npy with mmap=None
2020-07-07 23:42:22,105 : INFO : loading wv recursively from /data/models/d2v2_ru/mdl.wv.* with mmap=None
2020-07-07 23:42:22,107 : INFO : loading vectors from /data/models/d2v2_ru/mdl.wv.vectors.npy with mmap=None
2020-07-07 23:42:25,883 : INFO : loading docvecs recursively from /data/models/d2v2_ru/mdl.docvecs.* with mmap=None
2020-07-07 23:42:25,886 : INFO : loading vectors_docs from /data/models/d2v2_ru/mdl

This cell prepares the event vectors:
1. inferring vectors for the events with the doc2vec model (vectors for euclidean distance)
2. building normalized vectors for cosine distance
3. building 2-D embeddings with t-SNE for euclidean distance and cosine distance
4. saving the data to the tmp dir

In [3]:
import os

from sklearn.manifold import TSNE

df = pd.read_csv(EVENTS_PATH)

# 1. Infer one doc2vec vector per event description (used with euclidean distance)
X = np.array([model.infer_vector(description.split()) for description in df['description']])

# 2. Scale every row to unit length (used with cosine distance): for unit
#    vectors the squared euclidean distance equals 2 * (1 - cosine similarity),
#    so euclidean-based tools rank pairs the same way cosine distance would.
X_norm = X / np.linalg.norm(X, axis=1, keepdims=True)

# 3. 2-D t-SNE embedding of each vector set, stored as plot coordinates in df
for vectors, x_col, y_col in ((X, 'x', 'y'), (X_norm, 'x_norm', 'y_norm')):
    tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
    embedding = tsne.fit_transform(vectors)
    df[x_col], df[y_col] = embedding[:, 0], embedding[:, 1]

# 4. Persist the results so that later cells can be re-run without repeating
#    the inference. makedirs(exist_ok=True) replaces `!mkdir`, which printed
#    "File exists" on every re-run and could not create missing parents.
os.makedirs(TMP_PATH, exist_ok=True)
np.save(TMP_PATH + 'X.npy', X)
np.save(TMP_PATH + 'X_norm.npy', X_norm)
df.to_csv(TMP_PATH + 'df.csv', index=False)
print('completed')

mkdir: cannot create directory ‘/data/tmp/d2v/’: File exists
completed


Restoring data from the previous step

In [3]:
# Reload the event vectors and the event table cached under TMP_PATH
X, X_norm = (np.load(f'{TMP_PATH}{stem}.npy') for stem in ('X', 'X_norm'))
df = pd.read_csv(f'{TMP_PATH}df.csv')

In [55]:
#from sklearn.metrics import davies_bouldin_score
#from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from cuml.cluster import KMeans

# Column name of the score computed from the clustering itself
calc_scores_names = ['Calinski_Harabasz']
# Column names of the scores computed against the labelled event pairs,
# in the order in which data_score returns them
data_scores_names = [
    'precision', 'recall', 'f1', 'rand',
    'tp', 'tn', 'fp', 'fn',
]


def make_norm_names(names: list):
    """Return a new list in which every name from ``names`` has ``'_norm'`` appended."""
    return [name + '_norm' for name in names]


# '_norm' columns hold the same scores computed on the normalized vectors
calc_scores_names_norm = make_norm_names(calc_scores_names)
data_scores_names_norm = make_norm_names(data_scores_names)

# Labelled event pairs (columns id_a, id_b, label) used to evaluate a clustering
df_cross = pd.read_csv('cross_valid.csv')
# Values of the `label` column in df_cross
lname = dict(positive=2, negative=1)

def make_metrics_flag(metric: str):
    """Translate a metric name into ``(use_euclidean, use_cosine)`` flags.

    ``'euclidean'`` gives ``(True, False)`` and ``'cosine'`` gives
    ``(False, True)``; any other value (for example ``'all'``) enables both.
    """
    use_euclidean = metric != 'cosine'
    use_cosine = metric != 'euclidean'
    return use_euclidean, use_cosine


def calc_scrore(n, use_normilized):
    """Cluster the event vectors into ``n`` groups with k-means and score the result.

    The notebook-level ``X_norm`` is clustered when ``use_normilized`` is
    truthy, otherwise ``X``.

    Returns a tuple ``(labels, scores)``: the fitted model's ``labels_`` and a
    one-element list holding the Calinski-Harabasz score of that labelling.
    Davies-Bouldin and silhouette scores were tried here earlier and are
    currently disabled.
    """
    if use_normilized:
        vectors = X_norm
    else:
        vectors = X
    fitted = KMeans(n_clusters=n, n_init=50).fit(vectors)
    score = calinski_harabasz_score(vectors, fitted.labels_)
    return fitted.labels_, [score]


def n_scores(ns: list, metric='euclidean'):
    """Cluster the events for every cluster count in ``ns`` and tabulate the scores.

    ``metric`` selects the vector sets to use (see make_metrics_flag).
    Side effect: the labels found for each ``n`` are written to the
    notebook-level ``df`` as column ``str(n)`` (euclidean) and/or
    ``str(n) + '_norm'`` (cosine).

    Returns a DataFrame with one row per ``n``: ``n_clusters``, the scores from
    calc_scrore, and the columns added by data_n_scores.
    """
    use_euclidean, use_cosine = make_metrics_flag(metric)

    # (use normalized vectors?, suffix of the label column, score column names)
    variants = []
    if use_euclidean:
        variants.append((False, '', calc_scores_names))
    if use_cosine:
        variants.append((True, '_norm', calc_scores_names_norm))

    header = ['n_clusters']
    for _, _, score_names in variants:
        header.extend(score_names)

    rows = []
    for n in ns:
        row = [n]
        for normalized, suffix, _ in variants:
            labels, values = calc_scrore(n, normalized)
            row.extend(values)
            df[str(n) + suffix] = labels
        rows.append(row)

    table = pd.DataFrame(rows, columns=header)
    return data_n_scores(ns, table, metric)

def data_score(events):
    """Score a clustering against the labelled event pairs in ``df_cross``.

    Parameters
    ----------
    events : pandas.DataFrame
        Exactly two columns: the event id and the cluster label of that event.

    Returns
    -------
    list
        ``[precision, recall, f1, rand, tp, tn, fp, fn]``. A pair counts as
        predicted-positive when both events share a cluster and as
        actually-positive when its label equals ``lname['positive']``; pairs
        whose label is neither ``lname['positive']`` nor ``lname['negative']``
        are ignored. A ratio whose denominator is zero is reported as ``0.0``
        instead of raising ZeroDivisionError (e.g. when no labelled pair ends
        up in the same cluster).

    Raises
    ------
    KeyError
        If a pair in ``df_cross`` references an id that is missing from ``events``.
    """
    cluster_of = dict(events.values.tolist())
    positive, negative = lname['positive'], lname['negative']

    tp = tn = fp = fn = 0
    # Plain column iteration instead of iterrows(): same values, no per-row Series
    for id_a, id_b, label in zip(df_cross['id_a'], df_cross['id_b'], df_cross['label']):
        same_cluster = cluster_of[id_a] == cluster_of[id_b]
        if label == positive:
            if same_cluster:
                tp += 1
            else:
                fn += 1
        elif label == negative:
            if same_cluster:
                fp += 1
            else:
                tn += 1

    def _ratio(numerator, denominator):
        # Degenerate clusterings can produce empty denominators
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * precision * recall, precision + recall)
    rand = _ratio(tp + tn, tp + tn + fp + fn)
    return [precision, recall, f1, rand, tp, tn, fp, fn]
    
def data_n_scores(ns: list, df_scores, metric='euclidean'):
    """Add the pair-based scores for every cluster count in ``ns`` to ``df_scores``.

    For each ``n`` the label column ``str(n)`` (euclidean) and/or
    ``str(n) + '_norm'`` (cosine) of the notebook-level ``df`` is passed to
    data_score together with the ``id`` column. ``df_scores`` is modified in
    place and also returned.
    """
    use_euclidean, use_cosine = make_metrics_flag(metric)

    per_n = []
    for n in ns:
        values = []
        if use_euclidean:
            values.extend(data_score(df[['id', str(n)]]))
        if use_cosine:
            values.extend(data_score(df[['id', str(n) + '_norm']]))
        per_n.append(values)

    names = []
    if use_euclidean:
        names.extend(data_scores_names)
    if use_cosine:
        names.extend(data_scores_names_norm)

    # Turn the per-n rows into per-score columns
    by_column = list(zip(*per_n))
    for position, column in enumerate(names):
        df_scores[column] = by_column[position]
    return df_scores
        

In [56]:
# Cluster counts to evaluate: 5 through 100
n_samples = [*range(5, 101)]

# Score every cluster count on both the raw and the normalized vectors
df_scores = n_scores(n_samples, metric='all')
# Persist the score table and the event table, which now carries the label columns
df_scores.to_csv(f'{TMP_PATH}scores.csv', index=False)
df.to_csv(f'{TMP_PATH}df.csv', index=False)


          id  5
0      40526  0
1      40527  0
2      40528  0
3      40529  0
4      40530  0
...      ... ..
10209  15761  0
10210  15762  3
10211  15763  3
10212  15764  3
10213  15765  0

[10214 rows x 2 columns]
          id  5_norm
0      40526       0
1      40527       2
2      40528       2
3      40529       2
4      40530       4
...      ...     ...
10209  15761       0
10210  15762       1
10211  15763       0
10212  15764       3
10213  15765       4

[10214 rows x 2 columns]
          id  6
0      40526  2
1      40527  2
2      40528  2
3      40529  2
4      40530  2
...      ... ..
10209  15761  2
10210  15762  1
10211  15763  1
10212  15764  1
10213  15765  2

[10214 rows x 2 columns]
          id  6_norm
0      40526       4
1      40527       2
2      40528       2
3      40529       3
4      40530       3
...      ...     ...
10209  15761       4
10210  15762       1
10211  15763       4
10212  15764       0
10213  15765       3

[10214 rows x 2 columns]
        

          id  21
0      40526   6
1      40527   6
2      40528   6
3      40529   6
4      40530   6
...      ...  ..
10209  15761   0
10210  15762   9
10211  15763   0
10212  15764   9
10213  15765  13

[10214 rows x 2 columns]
          id  21_norm
0      40526        0
1      40527        7
2      40528        0
3      40529       16
4      40530        1
...      ...      ...
10209  15761       18
10210  15762       14
10211  15763       18
10212  15764       15
10213  15765        3

[10214 rows x 2 columns]
          id  22
0      40526   0
1      40527   0
2      40528   0
3      40529   0
4      40530   0
...      ...  ..
10209  15761   0
10210  15762   9
10211  15763   9
10212  15764  16
10213  15765   4

[10214 rows x 2 columns]
          id  22_norm
0      40526        0
1      40527       11
2      40528       11
3      40529        8
4      40530        9
...      ...      ...
10209  15761        0
10210  15762       21
10211  15763        0
10212  15764       13
10213  1

          id  37
0      40526  12
1      40527  31
2      40528  31
3      40529  31
4      40530  12
...      ...  ..
10209  15761  12
10210  15762  26
10211  15763  15
10212  15764   0
10213  15765  34

[10214 rows x 2 columns]
          id  37_norm
0      40526        9
1      40527       29
2      40528       29
3      40529        8
4      40530       28
...      ...      ...
10209  15761       17
10210  15762       15
10211  15763       17
10212  15764        0
10213  15765       14

[10214 rows x 2 columns]
          id  38
0      40526  19
1      40527  19
2      40528  19
3      40529  19
4      40530  19
...      ...  ..
10209  15761  19
10210  15762   6
10211  15763  34
10212  15764  13
10213  15765  22

[10214 rows x 2 columns]
          id  38_norm
0      40526       30
1      40527        7
2      40528        7
3      40529        3
4      40530        2
...      ...      ...
10209  15761       13
10210  15762       23
10211  15763       13
10212  15764       27
10213  1

          id  53
0      40526  35
1      40527  28
2      40528  28
3      40529  28
4      40530  35
...      ...  ..
10209  15761  35
10210  15762  49
10211  15763  16
10212  15764   5
10213  15765   9

[10214 rows x 2 columns]
          id  53_norm
0      40526       23
1      40527       20
2      40528       16
3      40529       27
4      40530       50
...      ...      ...
10209  15761        7
10210  15762       46
10211  15763        7
10212  15764       52
10213  15765       13

[10214 rows x 2 columns]
          id  54
0      40526  44
1      40527  45
2      40528  45
3      40529  45
4      40530  45
...      ...  ..
10209  15761  44
10210  15762  34
10211  15763  47
10212  15764   2
10213  15765   0

[10214 rows x 2 columns]
          id  54_norm
0      40526        3
1      40527        7
2      40528       40
3      40529       17
4      40530       29
...      ...      ...
10209  15761        3
10210  15762       42
10211  15763        1
10212  15764       34
10213  1

          id  69
0      40526  50
1      40527  51
2      40528  51
3      40529  51
4      40530  50
...      ...  ..
10209  15761  50
10210  15762  61
10211  15763  64
10212  15764  49
10213  15765  37

[10214 rows x 2 columns]
          id  69_norm
0      40526       50
1      40527       67
2      40528       29
3      40529       46
4      40530        8
...      ...      ...
10209  15761       57
10210  15762       68
10211  15763       60
10212  15764       17
10213  15765       16

[10214 rows x 2 columns]
          id  70
0      40526  60
1      40527  66
2      40528  66
3      40529  66
4      40530  60
...      ...  ..
10209  15761  60
10210  15762  49
10211  15763  20
10212  15764  23
10213  15765   6

[10214 rows x 2 columns]
          id  70_norm
0      40526       10
1      40527       13
2      40528       33
3      40529       59
4      40530        9
...      ...      ...
10209  15761       25
10210  15762       58
10211  15763       29
10212  15764       55
10213  1

          id  85_norm
0      40526       10
1      40527       30
2      40528       48
3      40529       39
4      40530        5
...      ...      ...
10209  15761       10
10210  15762       37
10211  15763       71
10212  15764       26
10213  15765       69

[10214 rows x 2 columns]
          id  86
0      40526  17
1      40527  21
2      40528  17
3      40529  61
4      40530  17
...      ...  ..
10209  15761  17
10210  15762  52
10211  15763  12
10212  15764  79
10213  15765  68

[10214 rows x 2 columns]
          id  86_norm
0      40526       82
1      40527       68
2      40528       70
3      40529       51
4      40530       84
...      ...      ...
10209  15761       38
10210  15762        5
10211  15763       62
10212  15764       60
10213  15765       73

[10214 rows x 2 columns]
          id  87
0      40526  38
1      40527  47
2      40528  38
3      40529  47
4      40530  38
...      ...  ..
10209  15761  38
10210  15762  60
10211  15763  14
10212  15764  37
102

In [4]:
# Reload the score table saved under TMP_PATH
df_scores = pd.read_csv(f'{TMP_PATH}scores.csv')

In [5]:
# Rows with the highest F1 for each distance variant.
# idxmax() returns an index *label*, so the row is fetched with .loc; the
# positional .iloc only agrees with it while the index is the default 0..n-1.
best_score = df_scores.loc[df_scores['f1'].idxmax()]
best = (best_score['f1'], best_score['n_clusters'])
best_score_norm = df_scores.loc[df_scores['f1_norm'].idxmax()]
best_norm = (best_score_norm['f1_norm'], best_score_norm['n_clusters'])
print(f'euclidean: {best}\ncosine:    {best_norm}')

euclidean: (0.2372881355932203, 99.0)
cosine:    (0.3886255924170616, 8.0)


In [None]:
# Precision, recall and F1 of the row with the highest F1 on the normalized vectors.
# idxmax() returns an index label, hence .loc rather than the positional .iloc.
best_score_norm = df_scores.loc[df_scores['f1_norm'].idxmax()]
best = (best_score_norm['precision_norm'], best_score_norm['recall_norm'], best_score_norm['f1_norm'])
best

In [25]:
# Rows with the highest F1 for each distance variant.
# idxmax() returns an index *label*, so the row is fetched with .loc; the
# positional .iloc only agrees with it while the index is the default 0..n-1.
best_score = df_scores.loc[df_scores['f1'].idxmax()]
best = (best_score['f1'], best_score['n_clusters'])
best_score_norm = df_scores.loc[df_scores['f1_norm'].idxmax()]
best_norm = (best_score_norm['f1_norm'], best_score_norm['n_clusters'])
print(f'euclidean: {best}\ncosine:    {best_norm}')

euclidean: (0.22968197879858657, 42.0)
cosine:    (0.40782122905027934, 11.0)


In [35]:
def plot_df(df, use_norm=False, label='label'):
    """Show a scatter plot of the 2-D event embedding stored in ``df``.

    Points are placed at columns ``x``/``y`` or, when ``use_norm`` is truthy,
    ``x_norm``/``y_norm``; they are colored by column ``label`` and show the
    ``title`` column on hover.
    """
    x_col, y_col = ('x_norm', 'y_norm') if use_norm else ('x', 'y')
    px.scatter(df, x=x_col, y=y_col, color=label, hover_name='title').show()
    


In [36]:
# Euclidean embedding colored by the 42-cluster labels (column '42')
plot_df(df, use_norm=False, label='42')

In [37]:
# Normalized embedding colored by column '11'
plot_df(df, use_norm=True, label='11')

In [39]:
# Row with the highest Calinski-Harabasz score on the normalized vectors.
# idxmax() returns an index label, hence .loc rather than the positional .iloc.
best_score_norm = df_scores.loc[df_scores['Calinski_Harabasz_norm'].idxmax()]
best_norm = (best_score_norm['Calinski_Harabasz_norm'], best_score_norm['n_clusters'])
best_norm

(536.7915938542188, 5.0)

In [47]:
# F1 on the normalized vectors as a function of the number of clusters
fig = px.line(data_frame=df_scores, x='n_clusters', y='f1_norm')
fig.show()

In [49]:
# Normalized embedding colored by column '23'
plot_df(df, use_norm=True, label='23')

In [54]:
from ipywidgets import IntProgress
from IPython.display import display
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import AgglomerativeClustering

# Exploratory cell: agglomerative clustering of the event vectors for each
# cluster count in `samples`, followed by a plot of the last labelling.
# NOTE(review): `delta` and `scores` are only referenced by the commented-out
# lines below.
delta = 1
samples = [30]#range(23)

#f = IntProgress(min=samples[0], max=samples[-1]) # instantiate the bar
#display(f)
scores = []
for n_clusters in samples:
    # Ward-linkage clustering fitted on the un-normalized vectors X
    ac = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward").fit(X)
    # Overwritten on every iteration: df['label'] keeps the labels of the last n_clusters
    df['label'] = ac.labels_
    # NOTE(review): the score is computed on X_norm although the clustering was
    # fitted on X, and `ch` is not used afterwards -- confirm the intent
    ch = calinski_harabasz_score(X_norm, ac.labels_)
    #scores.append([ch] + evaluate_score(df[['id', 'label']]))
    #f.value += delta
    
        
#ch, p, r, f1, rand, tp, tn, fp, fn = list(zip(*scores))
# Labels are shown on the normalized embedding (use_norm=True)
plot_df(df, True, 'label')

In [None]:
def read_csv(model):
    """Load the scores table saved for ``model`` and tag every row with the model.

    Reads ``/data/tmp/<model>/scores.csv`` and adds two columns, ``model`` and
    ``model_tmp``, each holding one repeated value.

    NOTE(review): ``n_sem`` and ``tmp`` are not assigned anywhere in the
    visible notebook, so calling this function would raise NameError unless
    they are defined elsewhere -- confirm where they are meant to come from.
    """
    path = f'/data/tmp/{model}/scores.csv'
    df_tmp = pd.read_csv(path)
    # `n_sem` is read from the enclosing (notebook) scope
    df_tmp['model'] = [f'{model}-{n_sem}'] * len(df_tmp) 
    # `tmp` is read from the enclosing scope and indexed by model name -- presumably a dict; verify
    df_tmp['model_tmp'] = [tmp[model]] * len(df_tmp) 
    return df_tmp

#models = ['sm_wv2', 'sm_wv2_ru', 'sm_wv2_ru_norm_enorm', 'sm_wv2_ru_enorm']
#models = ['d2v_ru']
#models = ['sm_wv2_ru']




# Plot every score against the number of clusters.
# The names come from calc_scores_names ('Calinski_Harabasz'), which is the
# column actually present in df_scores; calc_scrore_names holds
# 'Calinski–Harabasz', which is not a column and makes px.line fail.
for name in calc_scores_names + ['f1']:
    fig = px.line(df_scores, x="n_clusters", y=name)
    fig.show()

In [None]:
from pyclustertend import hopkins
# Second argument passed to hopkins() -- presumably the number of sampled points; TODO confirm
sample_sizes = 1000
# Hopkins statistic of the un-normalized event vectors.
# NOTE(review): check the pyclustertend documentation for how to interpret the value
hopkins_ans = hopkins(X, sample_sizes)
print(hopkins_ans)