this notebook train doc2vec model on all posts captions

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import logging
import tools

import d2vlib
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# init paths to data and models
DATA_PATH = '/data/'
MODEL_NAME = 'd2v2'

MODEL_PATH = DATA_PATH + f'models/{MODEL_NAME}/'
EVENTS_PATH = DATA_PATH + 'events/lem/events.csv'
SOURCE_PATH = DATA_PATH + 'captions/lem/'
cities = ['moscow', 'spb']
years = ['2016', '2017', '2018', '2019', '2020']

TMP_PATH = f'{DATA_PATH}tmp/{MODEL_NAME}/'

valid_langs = set(['__label__ru'])
calc_scrore_names = ['Calinski–Harabasz']
    
def csv_path(path, city, year):
    return path + city + '_posts_' + year + '.csv'

n_samples = list(range(5, 101)) # the number of clusters to choose the optimal number
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

loading posts and preparing them for training

In [2]:
documents = []

for city in cities:
    for year in years:
        df = pd.read_csv(csv_path(SOURCE_PATH, city, year))
        # filtering useless languages
        df = df[df.lang.isin(valid_langs)]
        size = len(documents)
        documents += [TaggedDocument(doc.split(), [i + size]) for i, doc in enumerate(df['caption'])]
        del df
    print(f'for {city} loaded posts')
    
print(f'\nloaded all posts {len(documents)}\n')

for moscow loaded posts
for spb loaded posts

loaded all posts 28276058



training doc2vec model

In [None]:
model = Doc2Vec(documents, vector_size=100, window=10, min_count=5, negative=15, dm=1, dbow_words=1, epochs=40, workers=35)
!mkdir {MODEL_PATH}
model.save(MODEL_PATH + 'mdl')

This cell prepare events vectors:
1. infering vectors for events by the doc2vec model (vectors for euclidean distance)
2. building normilized vectors for cosine distance
3. building 2d embedding by t-sne for euclidean distance and cosine distance
4. saving data to tmp dir

In [None]:
!mkdir {TMP_PATH}
df, X, X_norm = d2vlib.prepare_model(EVENTS_PATH, MODEL_PATH, TMP_PATH)

clustering of events, for each n from n_samples labels is saved in dataframe <br>
evaluating f1-score for each clustering

In [None]:
df = tools.read_events(TMP_PATH + 'df.csv')
df_cross = pd.read_csv('cross_valid_union.csv')

df, df_scores = d2vlib.k_means_list(df, n_samples, TMP_PATH, metric='all')
df_scores = d2vlib.calc_scores_list(df, n_samples, df_cross, scores=df_scores)

df.to_csv(r'' + TMP_PATH + 'df_km.csv', index=False)
df_scores.to_csv(r'' + TMP_PATH + 'scores_km.csv', index=False)

finding the best f1-score and plot f1-score for diffferent number of clusters

In [7]:
from tools import find_best, plot_score, plot_clusters

df = tools.read_events(TMP_PATH + 'df.csv')
df_scores = pd.read_csv(TMP_PATH + 'scores.csv')

print('f1 euclidian:', find_best(df_scores, 'f1'))
print('f1 cosine: ', find_best(df_scores, 'f1_norm'))

plot_score(df_scores, y=['f1', 'f1_norm']).show()


f1 euclidian: (0.3236074270557029, 48.0)
f1 cosine:  (0.3964365256124721, 19.0)


In [7]:
plot_clusters(df, '33_norm')

In [15]:
df_centroids = tools.create_centroids(df, 33, use_norm=True, hashtags_size=20)
fig, _ = tools.plot_centroids(df_centroids, 50)
fig.show()

In [16]:
df_centroids['text'] = df_centroids['hover_name'].apply(lambda s: '<br>'.join(s.split('<br>')[:2]))
fig = px.scatter(df_centroids, x="x", y="y", color='label', text='text', size='len', hover_name='hover_name', size_max=50)
fig.show()

In [10]:
plot_clusters(df, '28')

In [18]:
df_centroids = tools.create_centroids(df, 28, hashtags_size=20)
fig, _ = tools.plot_centroids(df_centroids, 100)
fig.show()

In [19]:
df_centroids['text'] = df_centroids['hover_name'].apply(lambda s: '<br>'.join(s.split('<br>')[:2]))
fig = px.scatter(df_centroids, x="x", y="y", color='label', text='text', size='len', hover_name='hover_name', size_max=100)
fig.show()

In [21]:
print('ch euclidian:', find_best(df_scores, 'calinski_harabasz'))
print('ch cosine: ', find_best(df_scores, 'calinski_harabasz_norm'))

plot_score(df_scores, y=['calinski_harabasz', 'calinski_harabasz_norm']).show()

ch euclidian: (496.5271326790549, 5.0)
ch cosine:  (307.0901457286604, 5.0)


In [10]:
best_score = df_scores.iloc[df_scores['f1_norm'].idxmax()]

In [11]:
best_score

n_clusters                  17.000000
precision                    0.156140
recall                       0.420605
f1                           0.227738
rand                         0.600424
tp                         445.000000
tn                        4090.000000
fp                        2405.000000
fn                         613.000000
precision_norm               0.562857
recall_norm                  0.372401
f1_norm                      0.448237
rand_norm                    0.871574
tp_norm                    394.000000
tn_norm                   6189.000000
fp_norm                    306.000000
fn_norm                    664.000000
calinski_harabasz          885.715054
calinski_harabasz_norm     232.913479
Name: 12, dtype: float64