In [4]:
import cudf
import numpy as np
import pandas as pd
import plotly.express as px

# init paths to data and models
DATA_PATH = '/data/'

SEM_MODEL_NAME = 'sm_ft3_ru_norm'
SEM_MODEL_PATH = DATA_PATH + f'models/{SEM_MODEL_NAME}/'
EVENTS_PATH = DATA_PATH + 'events/cleaned/events.csv'
TMP_PATH = f'{DATA_PATH}tmp/{SEM_MODEL_NAME}_new/'



# plot pandas data frame with columns 'x', 'y', 'label', and 'title' 
def plot_df(df, color='label'):
    fig = px.scatter(df, x="x", y="y", color=color, hover_name='title')
    fig.show()

'success'


# names of metrics in calc_scrore
calc_scrore_names = ['Calinski–Harabasz']

#samples = list(range(30, 101, 10)) + [75, 125, 128] 
samples = list(range(40, 130, 10)) + [150, 250, 500, 750, 1000]

In [13]:
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance_matrix
from sklearn.manifold import TSNE
!mkdir {TMP_PATH}

for n in samples:
    suffix = f'{n}/'
    path = TMP_PATH + suffix
    !mkdir {path}

    with open(f'{SEM_MODEL_PATH}{suffix}labels.txt') as f:
        labels = list(map(lambda l: [l[0], int(l[1][:-1])], filter(lambda l: len(l) == 2, [line.split(',') for line in f])))
    w2l = {} # word to label
    for word, label in labels:
        w2l[word] = label 


    # read events
    df = pd.read_csv(EVENTS_PATH)
    df['description'] = df['description'].apply(lambda s: ' '.join(filter(lambda w: w in w2l, s.split())))
    df = df[df['description'] != '']

    # calculate semantic vectors for events
    X = []
    for index, row in df.iterrows():
        vec = np.array([0] * n)
        words = list(filter(lambda w: w in w2l, row.description.split()))
        for word in words:
            vec[w2l[word]] += 1
        vec = vec / len(words)
        X.append(vec)
    X = np.array(X)

    # scaling of semantic vectors
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    

    # calculate 2d embeding for events
    tsne = TSNE(n_components=2, random_state=0, n_jobs=35, early_exaggeration=10, learning_rate=200)
    X_2d = tsne.fit_transform(X) 
    x_vals, y_vals = list(zip(*X_2d))
    df['x'] = x_vals
    df['y'] = y_vals

    # saving results of this cell
    np.save(path + 'X.npy', X)
    df.to_csv(r'' + path + 'df.csv', index=False)
    print(f'completed for {n}')

print('completed')

mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/’: File exists
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/40/’: File exists
completed for 40
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/50/’: File exists
completed for 50
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/60/’: File exists
completed for 60
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/70/’: File exists
completed for 70
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/80/’: File exists
completed for 80
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/90/’: File exists
completed for 90
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/100/’: File exists
completed for 100
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/110/’: File exists
completed for 110
mkdir: cannot create directory ‘/data/tmp/sm_ft3_ru_new/120/’: File exists
completed for 120
completed for 150
completed for 250
completed for 500
completed for 750
completed for 1000
completed


In [2]:
#from sklearn.metrics import davies_bouldin_score
#from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from cuml.cluster import KMeans

calc_scores_names = ['Calinski_Harabasz']
data_scores_names = ['precision', 'recall', 'f1', 'rand', 'tp', 'tn', 'fp', 'fn']
df_cross = pd.read_csv('cross_valid.csv')
lname = {'positive': 2, 'negative': 1}


# calulate scores for n clusters of events
# return list of calc_scrore_names scores
def calc_scrore(n):
    X_calc = X
    kmeans = KMeans(n_clusters=n, n_init=50).fit(X_calc)
    #dbs = davies_bouldin_score(X_calc, kmeans.labels_)
    #sils = silhouette_score(dists, kmeans.labels_)
    chs = calinski_harabasz_score(X_calc, kmeans.labels_)
    return kmeans.labels_, [chs]#[dbs, sils, chs]


# calculate scores for list of different numbers of clusters - ns
# return pandas DataFrame, where row contain n_clustres and different scores for this n_clustres
def n_scores(ns: list):
    columns = ['n_clusters'] + calc_scores_names 
    ans_scores = []
    
    for n in ns:
        l, scores = calc_scrore(n)
        df[str(n)] = l
        ans_scores.append([n] + scores) 

    df_scores = pd.DataFrame(ans_scores, columns=columns) 
    df_scores =  data_n_scores(ns, df_scores)
    return df_scores

def data_score(events):
    events = events.values.tolist()
    tp, tn, fp, fn = 0, 0, 0, 0
    d = {}
    for event, l in events:
        d[event] = l
    
    for _, row in df_cross.iterrows():
        a = row['id_a']
        b = row['id_b']
        l = row['label']
        
        tp += 1 if d[a] == d[b] and l == lname['positive'] else 0
        tn += 1 if d[a] != d[b] and l == lname['negative'] else 0
        fp += 1 if d[a] == d[b] and l == lname['negative'] else 0
        fn += 1 if d[a] != d[b] and l == lname['positive'] else 0
    
    #print(f'tp: {tp}\ntn: {tn}\nfp: {fp}\nfn: {fn}\n')
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)    
    rand = (tp + tn) / (tp + tn + fp + fn)
    #print(f'precision: {precision}\nrecall: {recall}\nf1: {f1}\n')
    return [precision, recall, f1, rand, tp, tn, fp, fn]
    
def data_n_scores(ns: list, df_scores):
    scores = []
    for n in ns:
        score = data_score(df[['id', str(n)]])
        scores.append(score)
    
    columns = data_scores_names
    transposed = list(zip(*scores))
    
    for i in range(len(columns)):
        df_scores[columns[i]] = transposed[i]
    return df_scores
        

In [3]:
n_samples = list(range(5, 101))

for n in samples:
    suffix = f'{n}/'
    path = TMP_PATH + suffix
    df = pd.read_csv(path + 'df.csv')
    X = np.load(path + 'X.npy')
    
    df_scores = n_scores(n_samples)
    df_scores.to_csv(r'' + path + 'scores.csv', index=False)
    df.to_csv(r'' + path + 'df.csv', index=False)
    print(f'completed for {n}')
    


completed for 40
completed for 50
completed for 60
completed for 70
completed for 80
completed for 90
completed for 100
completed for 110
completed for 120
completed for 150
completed for 250
completed for 500
completed for 750
completed for 1000


In [6]:
best = (0, 0, 0)
for n in samples:
    suffix = f'{n}/'
    path = TMP_PATH + suffix
    df_scores = pd.read_csv(path + 'scores.csv')
    
    
    best_score = df_scores.iloc[df_scores['f1'].idxmax()]
    best = (best_score['f1'], best_score['precision'], best_score['recall']) if best_score['f1'] > best[0] else best
    
best

(0.7285067873303166, 0.6388888888888888, 0.8473684210526315)

In [11]:
TMP_PATH

'/data/tmp/sm_ft3_ru_norm_new/'

In [None]:
best

In [10]:
df = pd.read_csv(f'{TMP_PATH}120/df.csv')
plot_df(df, '47')

In [13]:
t = [[1, 2], [3, 4], [5, 6]]
a, b = list(zip(*t))

print(a, b)

(1, 3, 5) (2, 4, 6)


In [None]:
def read_csv(model, n_sem):
    path = f'/data/tmp/{model}/{n_sem}/scores.csv'
    df_tmp = pd.read_csv(path)
    df_tmp['model'] = [f'{model}-{n_sem}'] * len(df_tmp) 
    return df_tmp

#models = ['sm_wv2', 'sm_wv2_ru', 'sm_wv2_ru_norm_enorm', 'sm_wv2_ru_enorm']
models = ['sm_ft3_ru_new']
#models = ['sm_wv2_ru']

n_sems = [40, 80, 100]

df_all = []
for n_sem in n_sems:
    df_all += [read_csv(m, n_sem) for m in models]
df_all = pd.concat(df_all)

for name in calc_scrore_names:
    fig = px.line(df_all, x="n_clusters", y=name, color='model')
    fig.show()

df_all

In [17]:
df_all['Calinski–Harabasz'][df_all['Calinski–Harabasz'] < 32]

76     31.880000
103    31.834922
107    31.959344
Name: Calinski–Harabasz, dtype: float64

In [5]:
from pyclustertend import hopkins
sample_sizes = 1000

for n in samples:
    suffix = f'{n}/'
    path = TMP_PATH + suffix
    df = pd.read_csv(path + 'df.csv')
    X = np.load(path + 'X_norm.npy')
    hopkins_ans = hopkins(X, sample_sizes)
    

hopkins_ans = hopkins(X, sample_sizes)
print(hopkins_ans)

0.10408719909229433


In [6]:
1 / np.exp(np.exp(1))

0.06598803584531254

In [53]:
tX = np.array([[1, 1], [1, 1.1], [1.1, 1], [2.1, 4], [2.2, 4], [2.3, 3.9]])
labels = [0,0,0,1,1,1]
print(SF(tX, labels))

#print(tX[0] * tX[3])

0.15171309392227822
0.7892067043697882
0.8491898489931238


In [38]:
from collections import defaultdict
from cuml.cluster import KMeans

def norm(a):
    return np.sqrt((a**2).sum())

def dist(a, b):
    return norm(a - b)

def bcd(C, C_avg, X):
    k_bcd = len(C)
    n_bcd = len(X)
    X_avg = X.sum(axis=0) / n_bcd 
    sum_bcd = 0
    for k in C:
        sum_bcd += len(C[k]) * dist(C_avg[k], X_avg)
    
    return sum_bcd / (k_bcd * n_bcd)
            
def wcd(C, C_avg, X):
    sum_wcd = 0
    for k in C:
        sum_wcd_k = 0
        for i in C[k]:
            sum_wcd_k += dist(X[i], C_avg[k])
        sum_wcd += (1 / len(C[k])) * sum_wcd_k
        #print(k, len(C[k]), sum_wcd_k)
    return sum_wcd

def SF(X, labels):
    C = defaultdict(list)
    for i in range(len(X)): 
        C[labels[i]].append(i)
    
    C_avg = {}
    for k in C:
        avg = np.zeros(X[0].shape)
        for i in C[k]:
            avg += X[i]
        C_avg[k] = avg / len(C[k])
    
    wcd_tmp = wcd(C, C_avg, X)
    bcd_tmp = bcd(C, C_avg, X)
    tmp = bcd_tmp - wcd_tmp
    print(wcd_tmp)
    print(bcd_tmp)
    return 1 - 1 / np.exp(np.exp(tmp))

In [46]:
samples = [50]
for n in samples:
    suffix = f'{n}/'
    path = TMP_PATH + suffix
    df = pd.read_csv(path + 'df.csv')
    X = np.load(path + 'X_norm.npy')
    dists = np.load(path + 'dists.npy')
    kmeans = KMeans(n_clusters=80, n_init=5).fit(X)
    
    print(SF(X, kmeans.labels_))
    

416.3654899780319
0.0507799718489548
0.0


In [54]:
button = widgets.Button(description='My Button')

NameError: name 'widgets' is not defined

In [75]:
from time import sleep
from IPython.display import display, Markdown, clear_output
import ipywidgets as widgets, Layout

def on_button_clicked(_):
    clear_output()
    button = widgets.Button(description='My Button3')
    display(button)

button1 = widgets.Button(description='My Button1')
button2 = widgets.Button(description='My Button2')
display(button)
button.on_click(on_button_clicked)

Button(description='My Button', style=ButtonStyle())

In [1]:
import time
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output

button1 = widgets.Button(description='My Button1')
button2 = widgets.Button(description='My Button2')
out1 = widgets.Output()
out2 = widgets.Output()
ttt = ['d']
def on_button_clicked1(_):
    # "linking function with output"
    with out1:
        # what happens when we press the button
        clear_output()
        ttt += ['2']
        print(ttt)
        print('Somethjdlkghjdlgjdlkgjdlhjdlhkd;hks;lks;hk;lkg;lknjdflkhjgsojghjldgxdlkgjzkdgjlkcbjlc;bjclfkbjing happens!')
    with out2:
        print('\n\nsd')
        
def on_button_clicked2(_):
    # "linking function with output"
    with out2:
        # what happens when we press the button
        clear_output()
        print('Something happdjgkldjlgdgksdhvgijeghnk dghvkdghjkdjgjnhlkcfhbjlmfhbjfklhkmjm,kvnhmbjkcmhjnbkv,jhmbvfc,jhjhldjhldhens!')
# linking button and function together using a button's method
button1.on_click(on_button_clicked1)
button2.on_click(on_button_clicked2)
# displaying button and its output together

output = widgets.HBox([
    widgets.VBox([out1, button1], layout=widgets.Layout(width='50%')), 
    widgets.VBox([out2, button2], layout=widgets.Layout(width='50%'))
])

display(output)



HBox(children=(VBox(children=(Output(), Button(description='My Button1', style=ButtonStyle())), layout=Layout(…

In [99]:
t

0