In [1]:
import logging
import os

import numpy as np
import pandas as pd
import plotly.express as px
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# init paths to data and models
DATA_PATH = '/data/'

MODEL_NAME = 'd2v_events_tags'

MODEL_PATH = DATA_PATH + f'models/{MODEL_NAME}/'
EVENTS_PATH = DATA_PATH + 'events/cleaned/events.csv'


TMP_PATH = f'{DATA_PATH}tmp/{MODEL_NAME}/'

#calc_scrore_names = ['Davies Bouldin', 'Silhouette', 'Calinski–Harabasz']
calc_scrore_names = ['Calinski–Harabasz']
def plot_df(df):
    fig = px.scatter(df, x="x", y="y", color='label', hover_name='title')
    fig.show()
    
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
df = pd.read_csv(EVENTS_PATH)
df['tags'] = df['tags'].apply(lambda s: (s[1:-1]).replace("'", "").split(","))
documents = []

for index, row in df.iterrows():
    documents.append(TaggedDocument(row.description.split(), [index] + row.tags[:5]))
    
print(f'loaded all events: {len(documents)')

In [None]:
model = Doc2Vec(documents, vector_size=100, window=30, min_count=4, negative=30, dm=1, dbow_words=1, epochs=500, workers=35)
!mkdir {MODEL_PATH}
model.save(MODEL_PATH + 'mdl')

In [2]:
model = Doc2Vec.load(MODEL_PATH + 'mdl')

2020-06-25 12:17:03,554 : INFO : loading Doc2Vec object from /data/models/d2v_events_tags/mdl
2020-06-25 12:17:03,989 : INFO : loading vocabulary recursively from /data/models/d2v_events_tags/mdl.vocabulary.* with mmap=None
2020-06-25 12:17:03,990 : INFO : loading trainables recursively from /data/models/d2v_events_tags/mdl.trainables.* with mmap=None
2020-06-25 12:17:03,991 : INFO : loading syn1neg from /data/models/d2v_events_tags/mdl.trainables.syn1neg.npy with mmap=None
2020-06-25 12:17:04,010 : INFO : loading wv recursively from /data/models/d2v_events_tags/mdl.wv.* with mmap=None
2020-06-25 12:17:04,011 : INFO : loading vectors from /data/models/d2v_events_tags/mdl.wv.vectors.npy with mmap=None
2020-06-25 12:17:04,030 : INFO : loading docvecs recursively from /data/models/d2v_events_tags/mdl.docvecs.* with mmap=None
2020-06-25 12:17:04,031 : INFO : loaded /data/models/d2v_events_tags/mdl


This cell prepares the event vectors:
1. inferring vectors for the events with the doc2vec model (vectors for euclidean distance)
2. building normalized vectors for cosine distance
3. building 2d embeddings with t-SNE for euclidean distance and cosine distance
4. saving the data to the tmp dir

In [3]:
from sklearn.manifold import TSNE

df = pd.read_csv(EVENTS_PATH)
X = []
for index, row in df.iterrows():
    words = row.description.split()
    vec = model.infer_vector(words)
    X.append(vec)
# vectors for events with euclidean distance    
X = np.array(X)

# vectors for events with cosine distance    
X_norm = X / (((X ** 2).sum(axis=1)) ** (1/2)).reshape(X.shape[0], 1)

# calculate 2d embeding for events with euclidean distance 
tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
df['x'], df['y'] = list(zip(*tsne.fit_transform(X)))

# calculate 2d embeding for events with cosine distance 
tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
df['x_norm'], df['y_norm'] = list(zip(*tsne.fit_transform(X_norm)))

# saving results of this cell
!mkdir {TMP_PATH}
np.save(TMP_PATH + 'X.npy', X)
np.save(TMP_PATH + 'X_norm.npy', X_norm)
df.to_csv(r'' + TMP_PATH + 'df.csv', index=False)
print(f'completed')

mkdir: cannot create directory ‘/data/tmp/d2v_events_tags/’: File exists
completed


In [None]:
X = np.load(TMP_PATH + 'X.npy')
X_norm = np.load(TMP_PATH + 'X_norm.npy')
df = pd.read_csv(TMP_PATH + 'df.csv')

In [11]:
#from sklearn.metrics import davies_bouldin_score
#from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from cuml.cluster import KMeans

calc_scores_names = ['Calinski_Harabasz']
data_scores_names = ['precision', 'recall', 'f1', 'rand', 'tp', 'tn', 'fp', 'fn']


def make_norm_names(names: list):
    return list(map(lambda name: name + '_norm', names))


calc_scores_names_norm = make_norm_names(calc_scores_names)
data_scores_names_norm = make_norm_names(data_scores_names)

df_cross = pd.read_csv('cross_valid.csv')
lname = {'positive': 2, 'negative': 1}

# return (use_euclidean: bool, use_cosine: bool)
def make_metrics_flag(metric: str):
    return (True if metric != 'cosine' else False), (True if metric != 'euclidean' else False) 


# calulate scores for n clusters of events
# return list of calc_scrore_names scores
def calc_scrore(n, use_normilized):
    X_calc = X_norm if use_normilized else X
    kmeans = KMeans(n_clusters=n, n_init=50).fit(X_calc)
    #dbs = davies_bouldin_score(X_calc, kmeans.labels_)
    #sils = silhouette_score(dists, kmeans.labels_)
    chs = calinski_harabasz_score(X_calc, kmeans.labels_)
    return kmeans.labels_, [chs]#[dbs, sils, chs]


# calculate scores for list of different numbers of clusters - ns
# return pandas DataFrame, where row contain n_clustres and different scores for this n_clustres
def n_scores(ns: list, metric='euclidean'):
    use_euclidean, use_cosine = make_metrics_flag(metric)
    
    columns = ['n_clusters']         
    columns += calc_scores_names if use_euclidean else []
    columns += calc_scores_names_norm if use_cosine else []
    
    ans_scores = []
    
    for n in ns:
        scores = []
        if use_euclidean:
            l, scores_tmp = calc_scrore(n, False)
            scores += scores_tmp
            df[str(n)] = l
        
        if use_cosine:
            l, scores_tmp = calc_scrore(n, True)
            scores += scores_tmp
            df[str(n) + '_norm'] = l
        ans_scores.append([n] + scores) 

    df_scores = pd.DataFrame(ans_scores, columns=columns) 
    df_scores =  data_n_scores(ns, df_scores, metric)
    return df_scores

def data_score(events):
    events = events.values.tolist()
    tp, tn, fp, fn = 0, 0, 0, 0
    d = {}
    for event, l in events:
        d[event] = l
    
    for _, row in df_cross.iterrows():
        a = row['id_a']
        b = row['id_b']
        l = row['label']
        
        tp += 1 if d[a] == d[b] and l == lname['positive'] else 0
        tn += 1 if d[a] != d[b] and l == lname['negative'] else 0
        fp += 1 if d[a] == d[b] and l == lname['negative'] else 0
        fn += 1 if d[a] != d[b] and l == lname['positive'] else 0
    
    #print(f'tp: {tp}\ntn: {tn}\nfp: {fp}\nfn: {fn}\n')
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)    
    rand = (tp + tn) / (tp + tn + fp + fn)
    #print(f'precision: {precision}\nrecall: {recall}\nf1: {f1}\n')
    return [precision, recall, f1, rand, tp, tn, fp, fn]
    
def data_n_scores(ns: list, df_scores, metric='euclidean'):
    use_euclidean, use_cosine = make_metrics_flag(metric)
    scores = []
    for n in ns:
        score = []
        score += data_score(df[['id', str(n)]]) if use_euclidean else []
        score += data_score(df[['id', str(n) + '_norm']]) if use_cosine else []
        scores.append(score)
    
    columns = []
    columns += data_scores_names if use_euclidean else []
    columns += data_scores_names_norm if use_cosine else []
    
    transposed = list(zip(*scores))
    for i in range(len(columns)):
        df_scores[columns[i]] = transposed[i]
    return df_scores
        

In [12]:
n_samples = list(range(5, 101))

df_scores = n_scores(n_samples, metric='all')
df_scores.to_csv(r'' + TMP_PATH + 'scores.csv', index=False)
df.to_csv(r'' + TMP_PATH + 'df.csv', index=False)



          id  5
0      40526  2
1      40527  3
2      40528  4
3      40529  4
4      40530  2
...      ... ..
10209  15761  4
10210  15762  4
10211  15763  4
10212  15764  4
10213  15765  4

[10214 rows x 2 columns]
          id  5_norm
0      40526       0
1      40527       0
2      40528       0
3      40529       0
4      40530       0
...      ...     ...
10209  15761       1
10210  15762       1
10211  15763       1
10212  15764       1
10213  15765       1

[10214 rows x 2 columns]
          id  6
0      40526  1
1      40527  5
2      40528  1
3      40529  1
4      40530  1
...      ... ..
10209  15761  0
10210  15762  0
10211  15763  0
10212  15764  0
10213  15765  0

[10214 rows x 2 columns]
          id  6_norm
0      40526       5
1      40527       5
2      40528       5
3      40529       5
4      40530       5
...      ...     ...
10209  15761       2
10210  15762       2
10211  15763       2
10212  15764       2
10213  15765       2

[10214 rows x 2 columns]
        

          id  21
0      40526  15
1      40527  19
2      40528   4
3      40529   4
4      40530   4
...      ...  ..
10209  15761   1
10210  15762  12
10211  15763  12
10212  15764  12
10213  15765  12

[10214 rows x 2 columns]
          id  21_norm
0      40526        7
1      40527        4
2      40528        4
3      40529        4
4      40530        4
...      ...      ...
10209  15761        3
10210  15762       13
10211  15763        9
10212  15764       13
10213  15765       13

[10214 rows x 2 columns]
          id  22
0      40526  13
1      40527   8
2      40528   4
3      40529  19
4      40530  13
...      ...  ..
10209  15761  19
10210  15762   9
10211  15763  20
10212  15764   9
10213  15765   9

[10214 rows x 2 columns]
          id  22_norm
0      40526       10
1      40527       16
2      40528       16
3      40529       16
4      40530       16
...      ...      ...
10209  15761       18
10210  15762       21
10211  15763       18
10212  15764       21
10213  1

          id  37
0      40526   7
1      40527  15
2      40528   6
3      40529  25
4      40530  20
...      ...  ..
10209  15761  17
10210  15762  18
10211  15763  16
10212  15764  18
10213  15765  18

[10214 rows x 2 columns]
          id  37_norm
0      40526       28
1      40527       20
2      40528       20
3      40529       20
4      40530       20
...      ...      ...
10209  15761       11
10210  15762       11
10211  15763       18
10212  15764       16
10213  15765        6

[10214 rows x 2 columns]
          id  38
0      40526  32
1      40527  13
2      40528   5
3      40529   5
4      40530   5
...      ...  ..
10209  15761  13
10210  15762  15
10211  15763  21
10212  15764  21
10213  15765  14

[10214 rows x 2 columns]
          id  38_norm
0      40526       17
1      40527       36
2      40528       36
3      40529       36
4      40530       36
...      ...      ...
10209  15761       28
10210  15762       31
10211  15763       35
10212  15764       27
10213  1

          id  53_norm
0      40526       38
1      40527       48
2      40528       17
3      40529       17
4      40530       17
...      ...      ...
10209  15761       51
10210  15762        6
10211  15763       27
10212  15764       44
10213  15765        1

[10214 rows x 2 columns]
          id  54
0      40526   7
1      40527  42
2      40528  41
3      40529  23
4      40530  41
...      ...  ..
10209  15761  20
10210  15762  12
10211  15763  31
10212  15764  27
10213  15765  48

[10214 rows x 2 columns]
          id  54_norm
0      40526       12
1      40527       34
2      40528       34
3      40529       34
4      40530       34
...      ...      ...
10209  15761       25
10210  15762        8
10211  15763       32
10212  15764       24
10213  15765       19

[10214 rows x 2 columns]
          id  55
0      40526  38
1      40527  50
2      40528  32
3      40529  51
4      40530  10
...      ...  ..
10209  15761  50
10210  15762   5
10211  15763  30
10212  15764  30
102

          id  69_norm
0      40526        4
1      40527       26
2      40528       59
3      40529       59
4      40530       59
...      ...      ...
10209  15761       30
10210  15762       67
10211  15763       55
10212  15764       37
10213  15765       42

[10214 rows x 2 columns]
          id  70
0      40526  52
1      40527  42
2      40528   7
3      40529   5
4      40530   7
...      ...  ..
10209  15761  11
10210  15762  43
10211  15763   6
10212  15764  16
10213  15765  63

[10214 rows x 2 columns]
          id  70_norm
0      40526       42
1      40527       66
2      40528        3
3      40529        3
4      40530        3
...      ...      ...
10209  15761       38
10210  15762       30
10211  15763       32
10212  15764        1
10213  15765       18

[10214 rows x 2 columns]
          id  71
0      40526  17
1      40527  22
2      40528  64
3      40529  64
4      40530  64
...      ...  ..
10209  15761   5
10210  15762   4
10211  15763  37
10212  15764  54
102

          id  86
0      40526  33
1      40527   4
2      40528  11
3      40529  11
4      40530  11
...      ...  ..
10209  15761   4
10210  15762   5
10211  15763  55
10212  15764  44
10213  15765  44

[10214 rows x 2 columns]
          id  86_norm
0      40526       72
1      40527       15
2      40528       47
3      40529       47
4      40530       47
...      ...      ...
10209  15761       29
10210  15762       13
10211  15763       78
10212  15764       83
10213  15765       42

[10214 rows x 2 columns]
          id  87
0      40526  53
1      40527  65
2      40528  83
3      40529  83
4      40530  83
...      ...  ..
10209  15761   8
10210  15762  21
10211  15763   7
10212  15764  33
10213  15765   7

[10214 rows x 2 columns]
          id  87_norm
0      40526       11
1      40527       61
2      40528       26
3      40529       26
4      40530       26
...      ...      ...
10209  15761       85
10210  15762       60
10211  15763       64
10212  15764       74
10213  1

In [13]:
best_score = df_scores.iloc[df_scores['f1'].idxmax()]
best = (best_score['f1'], best_score['n_clusters'])
best_score_norm = df_scores.iloc[df_scores['f1_norm'].idxmax()]
best_norm = (best_score_norm['f1_norm'], best_score_norm['n_clusters'])
print(f'euclidean: {best}\ncosine:    {best_norm}')

euclidean: (0.5210918114143922, 10.0)
cosine:    (0.5825688073394495, 9.0)


In [None]:
from cuml.cluster import KMeans

kmeans = KMeans(n_clusters=10, n_init=10)
kmeans.fit(X)
df['label'] = kmeans.labels_
centroids = kmeans.cluster_centers_



plot_df(df)

In [None]:
scores = []

for n_clusters in df.columns:
    if not '_norm' in n_clusters:
        continue    
    scores.append(evaluate_score(df[['id', n_clusters]]))
    
p, r, f1, rand, tp, tn, fp, fn = list(zip(*scores))

df_scores['precision_norm'] = p
df_scores['recall_norm'] = r
df_scores['f1_norm'] = f1
df_scores['rand_norm'] = rand
df_scores['tp_norm'] = tp
df_scores['tn_norm'] = tn
df_scores['fp_norm'] = fp
df_scores['fn_norm'] = fn
#df_scores.to_csv(r'' + TMP_PATH + 'scores.csv', index=False)

best_score = df_scores.iloc[df_scores['f1_norm'].idxmax()]
best_norm = (best_score['f1_norm'], best_score['n_clusters'])


In [None]:
# Display the (f1_norm, n_clusters) pair of the best cosine clustering.
best_norm

In [None]:
# Pair-based scores of the 11-cluster cosine clustering. data_score is the
# scoring function defined in this notebook; `evaluate_score` is not defined.
data_score(df[['id', '11_norm']])

In [None]:
# Pair-based scores of the 11-cluster euclidean clustering. data_score is the
# scoring function defined in this notebook; `evaluate_score` is not defined.
data_score(df[['id', '11']])

In [None]:
# Display the names of the score columns collected so far.
df_scores.columns

In [None]:
import plotly.graph_objects as go

# F1 against the number of clusters, one trace per distance variant.
fig = go.Figure()
for column, trace_name in (('f1', 'euclid'), ('f1_norm', 'cosine')):
    fig.add_trace(go.Scatter(x=df_scores['n_clusters'], y=df_scores[column], name=trace_name))
fig.show()

In [None]:
def read_csv(model):
    """Read /data/tmp/<model>/scores.csv and add 'model' and 'model_tmp' columns.

    NOTE(review): `n_sem` and `tmp` are not defined in the visible part of the
    notebook, so calling this function raises NameError unless they are defined
    elsewhere - confirm or remove. The function is not called in the visible
    cells, and the path is hard-coded instead of being built from DATA_PATH.
    """
    path = f'/data/tmp/{model}/scores.csv'
    df_tmp = pd.read_csv(path)
    # Same value repeated for every row of the loaded frame.
    df_tmp['model'] = [f'{model}-{n_sem}'] * len(df_tmp) 
    df_tmp['model_tmp'] = [tmp[model]] * len(df_tmp) 
    return df_tmp

#models = ['sm_wv2', 'sm_wv2_ru', 'sm_wv2_ru_norm_enorm', 'sm_wv2_ru_enorm']
#models = ['d2v_ru']
#models = ['sm_wv2_ru']




# One line chart per score column. The names come from calc_scores_names
# ('Calinski_Harabasz'), which is what n_scores uses for the df_scores columns;
# calc_scrore_names holds 'Calinski–Harabasz', which is not a df_scores column.
for name in calc_scores_names + ['f1']:
    fig = px.line(df_scores, x="n_clusters", y=name)
    fig.show()

In [None]:
# One line chart per score column. The names come from calc_scores_names
# ('Calinski_Harabasz'), which is what n_scores uses for the df_scores columns;
# calc_scrore_names holds 'Calinski–Harabasz', which is not a df_scores column.
for name in calc_scores_names + ['f1']:
    fig = px.line(df_scores, x="n_clusters", y=name)
    fig.show()

In [None]:
from pyclustertend import hopkins
# Call pyclustertend's hopkins on the euclidean vectors X with 1000 as its
# second argument and print the returned value.
# NOTE(review): presumably the second argument is the number of sampled points
# and the result is the Hopkins statistic - confirm against pyclustertend.
sample_sizes = 1000
hopkins_ans = hopkins(X, sample_sizes)
print(hopkins_ans)