In [1]:
import cudf
import numpy as np
import pandas as pd
import plotly.express as px

# Locations of the input data, the trained models and the cached intermediates
DATA_PATH = '/data/'
MODEL_PATH = f'{DATA_PATH}models/ft1_ru/'
SEM_MODEL_PATH = f'{DATA_PATH}models/sm_ft1_ru/'
EVENTS_PATH = f'{DATA_PATH}events/events_lem.csv'
TMP_PATH = f'{DATA_PATH}tmp/sm2_128/'

# Length of an event's semantic vector, and the file name of the word/label list
N_SEM_CLUSTERS = 128
LABELS = 'labels.txt'

def plot_df(df):
    """Show an interactive scatter plot of a pandas data frame.

    The frame has to provide the columns 'x' and 'y' (point coordinates),
    'label' (point colour) and 'title' (text shown on hover).
    """
    px.scatter(df, x="x", y="y", color='label', hover_name='title').show()

'success'

'success'

Initialize the event vectors and the 2D embedding of the events for visualization

In [7]:
# imports used by this cell
import os

from scipy.spatial import distance_matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# read words and their semantic labels: one "word,label" pair per line
labels = []
with open(SEM_MODEL_PATH + LABELS) as f:
    for line in f:
        line = line.rstrip('\r\n')
        if not line:
            continue
        # split on the LAST comma and let int() deal with stray whitespace, so a
        # missing final newline or a comma inside the word cannot corrupt the label
        word, label = line.rsplit(',', 1)
        labels.append([word, int(label)])
w2l = dict(labels)  # word to label


# read events and keep only the words known to the semantic model
df = pd.read_csv(EVENTS_PATH)
df['description'] = df['description'].fillna('').apply(
    lambda s: ' '.join(w for w in s.split() if w in w2l))
# drop events without any known word; resetting the index keeps the row positions
# aligned with the rows of X / dists (and gives a fresh frame to add columns to)
df = df[df['description'] != ''].reset_index(drop=True)

# calculate semantic vectors for events: the share of the event's words that
# falls into each semantic cluster
X = np.zeros((len(df), N_SEM_CLUSTERS))
for i, description in enumerate(df['description']):
    # descriptions were filtered above, so every word has a label
    for word in description.split():
        X[i, w2l[word]] += 1
# every remaining description has at least one word, so no row sums to zero
X /= X.sum(axis=1, keepdims=True)
assert not np.isnan(X).any(), 'semantic vectors must not contain NaN'

# normalization of semantic vectors
scaler = StandardScaler()
X = scaler.fit_transform(X)

# calculate distances between events
dists = distance_matrix(X, X)

# calculate 2d embedding for events
tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
X_2d = tsne.fit_transform(X)
df['x'] = X_2d[:, 0]
df['y'] = X_2d[:, 1]

# saving results of this cell
os.makedirs(TMP_PATH, exist_ok=True)
np.save(TMP_PATH + 'X_norm.npy', X)
df.to_csv(TMP_PATH + 'df.csv', index=False)
np.save(TMP_PATH + 'dists.npy', dists)

print('completed')

completed


Load the data from tmp if the event-vectors cell above has already been executed

In [2]:
# Restore what the event-vector cell saved: the event table, the normalised
# event vectors and the pairwise distances between them
df = pd.read_csv(f'{TMP_PATH}df.csv')
X = np.load(f'{TMP_PATH}X_norm.npy')
dists = np.load(f'{TMP_PATH}dists.npy')

Calculate the Hopkins statistic for the model.
The closer the coefficient is to 0, the stronger the clustering tendency.
sample_sizes=1000 is enough for 8700 events.

In [28]:
from pyclustertend import hopkins
# number of rows passed to hopkins() as the sampling size
sample_sizes = 1_000
hopkins_ans = hopkins(X, sample_sizes)
print(hopkins_ans)

0.019076117031601593


KMeans clustering - good results for many clusters, more than 100

In [24]:
from cuml.cluster import KMeans

# Fit k-means with 75 clusters on the event vectors, keep the centroids for the
# distance functions and colour the events by their cluster
kmeans = KMeans(n_clusters=75, n_init=40).fit(X)
centroids = kmeans.cluster_centers_
df['label'] = kmeans.labels_

plot_df(df)

In [29]:
# persist the event table together with the cluster labels
df.to_csv(f'{TMP_PATH}df.csv', index=False)

Organize the map: label -> events

In [5]:
from collections import defaultdict


# cluster label -> titles of the events in that cluster
classes = defaultdict(list)
# cluster label -> index values (df.index) of the events in that cluster
classes_id = defaultdict(list)
for event_idx, cluster, title in zip(df.index, df['label'], df['title']):
    classes[cluster].append(title)
    classes_id[cluster].append(event_idx)
        

Filter out small clusters and print the others

In [6]:
# minimum number of events a cluster needs in order to be kept
threshold = 5

# clusters that reach the threshold, in the order in which they appear in `classes`
clean_classes = {label: titles for label, titles in classes.items() if len(titles) >= threshold}

# events inside the kept clusters vs. events in the discarded small clusters
counted = sum(len(titles) for titles in clean_classes.values())
uncounted = sum(len(titles) for titles in classes.values()) - counted

print(f'length of cleaned clusters: {len(clean_classes)}')
print(f'unaccounted events: {uncounted} counted events: {counted}')

for label, titles in clean_classes.items():
    print(label, len(titles),'\n', titles, '\n\n\n')


length of cleaned clusters: 53
unaccounted events: 33 counted events: 8707
7 1332 
 ['#новыйгод', '#happynewyear', '#дворцоваяплощадь', '#happynewyear', '#эрмитаж', '#новаяголландия', '#новыйгод2019', '#новыйгод2019', '#новыйгод2019', '#newyear', '#newholland', '#hermitage', '#дворцоваяплощадь', '#новаяголландия', '#новаяголландия', '#newyear', '#новыйгод', '#эрарта', '#пилот', '#cathedral', '#исаакиевскийсобор', '#эрмитаж', '#эрмитаж', '#щелкунчикплющенко', '#lights', '#новыйгод', '#петропавловскаякрепость', '#эрмитаж', '#спаснакрови', '#петропавловскаякрепость', '#исаакиевскийсобор', '#петропавловскаякрепость', '#новаяголландия', '#дворцоваяплощадь', '#новыйгод', '#эрмитаж', '#эрмитаж', '@1lovefest', '#newholland', '#петропавловскаякрепость', '#новыйгод2019', '#эрмитаж', '#эрмитаж', '#эрарта', '#новаяголландия', '#эрмитаж', '#исаакиевскийсобор', '#планетарий1', '#дворцоваяплощадь', '#исаакиевскийсобор', '#исаакиевскийсобор', '#эрмитаж', '#спаснакрови', '#эрмитаж', '#новыйгод', '#рожд

 ['#эрмитаж', '#hermitage', '@solomun', '#исаакиевскийсобор', '#35mm'] 



40 7 
 ['#4х4', '#ladogatrophy', '@mira_ma_mira', '#aha', '#skoda', '#контраст', '#mercedesbenz'] 



57 10 
 ['#candela', '#iectc2019', '#iectc', '#iectc2019', '#театральнаяолимпиада', '#iectc2019', '#iectc2019', '@dr', '#patriciaspb', '#aasurgery'] 



20 5 
 ['#магазинспб', '#кросснации2019', '#зенит', '#петергоф', '#2020'] 



6 17 
 ['@karrina', '@karrina', '#казахи', '#сумкилюкс', '#дагивпитере', '#татарыспб', '@karrina', '#черкесы', '#чемоданывладивосток', '#даги', '#кабардинцы', '#черкесы', '#даргинцы', '#дагестан', '@assa_kavkazci_pitera', '#кабардинцы', '#грузины'] 





plot filtered clusters

In [None]:
# Label shared by every event whose cluster was filtered out as too small
SMALL_CLUSTER_LABEL = 750

# Take the labels from df (the same column `classes` was built from), so the cell
# also works when df was loaded from the cache and no `kmeans` object exists
df['clean_label'] = df['label'].where(df['label'].isin(list(clean_classes)), SMALL_CLUSTER_LABEL)

# The seeded 2d embedding is normally already stored in df['x'] / df['y'];
# recompute it only when those columns are missing
if not {'x', 'y'}.issubset(df.columns):
    from sklearn.manifold import TSNE

    tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
    X_2d = tsne.fit_transform(X)
    df['x'] = X_2d[:, 0]
    df['y'] = X_2d[:, 1]

# plot_df always colours by the 'label' column, so pass the filtered labels
# under that name (df itself keeps its original 'label' column)
plot_df(df.assign(label=df['clean_label']).dropna())

In [None]:
from scipy.spatial import distance_matrix
# pairwise Minkowski distances (p=2, the default) between all event vectors
dists = distance_matrix(X, X, p=2)

In [13]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def class_dist_max(a, b):
    """Largest distance between an event of cluster `a` and an event of cluster `b`.

    `a` and `b` are cluster labels (keys of `classes_id`). The event indices
    stored for them select the rows and columns of the global pairwise
    distance matrix `dists`.

    Returns 0 when either cluster has no events.
    """
    a_ind = classes_id[a]
    b_ind = classes_id[b]
    if len(a_ind) == 0 or len(b_ind) == 0:
        return 0
    # index with the members' own event indices, not with 0..len-1
    return dists[np.ix_(a_ind, b_ind)].max()


def class_dist_min(a, b):
    """Smallest distance between an event of cluster `a` and an event of cluster `b`.

    `a` and `b` are cluster labels (keys of `classes_id`). The event indices
    stored for them select the rows and columns of the global pairwise
    distance matrix `dists`.

    Returns the sentinel 1000000000 when either cluster has no events.
    """
    a_ind = classes_id[a]
    b_ind = classes_id[b]
    if len(a_ind) == 0 or len(b_ind) == 0:
        return 1000000000
    # index with the members' own event indices and take the minimum
    # (taking max() against the sentinel would always return the sentinel)
    return dists[np.ix_(a_ind, b_ind)].min()


def class_dist_mean(a, b):
    """Mean distance over all pairs (event of cluster `a`, event of cluster `b`).

    `a` and `b` are cluster labels (keys of `classes_id`). The event indices
    stored for them select the rows and columns of the global pairwise
    distance matrix `dists`.

    Returns NaN when either cluster has no events, because there is no pair
    to average over.
    """
    a_ind = classes_id[a]
    b_ind = classes_id[b]
    if len(a_ind) == 0 or len(b_ind) == 0:
        return float('nan')
    # index with the members' own event indices, not with 0..len-1
    return dists[np.ix_(a_ind, b_ind)].mean()


def class_dist_centroid(a, b):
    """Norm of the difference between the centroids of clusters `a` and `b`."""
    offset = centroids[a] - centroids[b]
    return np.linalg.norm(offset)

def class_dist_centroid_norm(a, b):
    """Centroid distance of clusters `a` and `b`, scaled by 1 / (size of a * size of b)."""
    centroid_gap = np.linalg.norm(centroids[a] - centroids[b])
    size_product = len(classes_id[a]) * len(classes_id[b])
    return centroid_gap * (1 / size_product)



# [keyword, cluster label] pairs; the label is what gets passed to the distance functions
tests = [
    ['newyear', 408], ['newyear', 110], ['newyear', 475],
    ['зенит', 196],
    ['ска', 195],
    ['забег', 156],
    ['music', 109], ['music', 58], ['music', 43],
    ['hermitage', 477], ['hermitage', 188], ['hermitage', 7],
]
# row/column captions of the distance tables: "<keyword> <label>"
names = [f'{keyword} {cluster}' for keyword, cluster in tests]

# [display name, function] for every cluster-distance definition to compare
methods = [
    ["class_dist_max", class_dist_max],
    ["class_dist_min", class_dist_min],
    ["class_dist_mean", class_dist_mean],
    ["class_dist_centroid", class_dist_centroid],
    ["class_dist_centroid_norm", class_dist_centroid_norm]
]


length = len(tests)

# one table per distance definition, titled with the method name
for method_name, method in methods:
    # symmetric matrix of pairwise cluster distances; the diagonal keeps the -1
    # marker (the old `i == j` check was unreachable inside `range(i)`)
    distanses = np.full([length, length], -1.0)
    for i in range(length):
        for j in range(i):
            distanses[i, j] = method(tests[i][1], tests[j][1])
            distanses[j, i] = distanses[i, j]

    fig = go.Figure(data=[go.Table(
        header=dict(values=[''] + names, fill_color='paleturquoise', align='left'),
        cells=dict(values=[names] + distanses.tolist(), fill_color='lavender', align='left'))])
    fig.update_layout(title_text=method_name)
    fig.show()
    

In [26]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Small hand-made example of the distance table layout
names = list('abcd')
distanses = np.array([
    [0, 1, 1, 2],
    [1, 0, 1, 3],
    [1, 1, 0, 4],
    [2, 3, 4, 0],
])

header = dict(values=[''] + names, fill_color='paleturquoise', align='left')
cells = dict(values=[names] + distanses.tolist(), fill_color='lavender', align='left')
fig = go.Figure(data=[go.Table(header=header, cells=cells)])
fig.show()

MeanShift clustering - bad results

In [None]:
from sklearn.cluster import MeanShift

# Mean-shift clustering of the event vectors with bin seeding enabled
ms = MeanShift(bin_seeding=True, n_jobs=35)
ms.fit(X)
df['label'] = ms.labels_

plot_df(df)

from sklearn.cluster import AgglomerativeClustering

ac = AgglomerativeClustering(distance_threshold=100, n_clusters=None).fit(X)
df['label'] = ac.labels_

plot_df(df)  # bad result

Agglomerative Clustering - bad results

In [13]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering into 212 clusters
# (earlier attempt: distance_threshold=100, n_clusters=None)
ac = AgglomerativeClustering(n_clusters=212, linkage="ward")
ac.fit(X)
df['label'] = ac.labels_

plot_df(df)

DBSCAN clustering: find value for eps

In [None]:
from cuml.neighbors import NearestNeighbors
# Sorted nearest-neighbour distances of all events, plotted to pick the DBSCAN eps
nn = NearestNeighbors(n_neighbors=2)
nn.fit(X)

# get the 2 nearest neighbours of every event; query the same array the model was
# fitted on (no `X_cudf` is defined in this notebook)
# NOTE(review): assumes cuML returns NumPy arrays for NumPy input - confirm
distances, indices = nn.kneighbors(X)
# column 0 is presumably the event itself (distance 0), so column 1 holds the
# distance to its nearest other event; the result gets its own name so that the
# global pairwise matrix `dists` is not overwritten
knn_dists = np.sort(np.asarray(distances)[:, 1])

df_px = pd.DataFrame({'x': np.arange(len(knn_dists)), 'y': knn_dists})
fig = px.line(df_px, x="x", y="y")
fig.show()

DBSCAN clustering: fit and plot - bad results

In [None]:
from cuml.cluster import DBSCAN
# neighbourhood radius passed to DBSCAN as eps
EPS = 35


# Fit on X, the array every other clustering cell uses; `X_cudf` is not defined
# anywhere in this notebook
dbscan = DBSCAN(eps=EPS, min_samples=3)
dbscan.fit(X)

df['label'] = dbscan.labels_

plot_df(df)

In [27]:
from sklearn.mixture import GaussianMixture

# Fixed seed so that the fitted mixture is reproducible between runs
# (same seed value as the t-SNE embedding)
clf = GaussianMixture(n_components=41, random_state=0)
clf.fit(X)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=41, n_init=1,
                precisions_init=None, random_state=None, reg_covar=1e-06,
                tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
                weights_init=None)