In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

# init paths to data and models
DATA_PATH = '/data/'

# plot pandas data frame with columns 'x', 'y', 'label', and 'title' 
def plot_df(df):
    fig = px.scatter(df, x="x", y="y", color='label', hover_name='title')
    fig.show()

'success'


# names of metrics in calc_scrore
calc_scrore_names = ['Calinski–Harabasz', 'f1', 'precission', 'recall']

samples = list(range(40, 130, 10)) + [128]

In [2]:
calc_scrore_names = ['Davies Bouldin', 'Silhouette', 'Calinski–Harabasz']

In [3]:
import plotly.graph_objects as go

def read_csv(model, n_sem):
    path = f'/data/tmp/{model}/{n_sem}/scores.csv'
    df_tmp = pd.read_csv(path)
    df_tmp['model'] = [f'{model}-{n_sem}'] * len(df_tmp) 
    return df_tmp


def plot_sem_comparing(models, n_sems, scores):
    df_all = []
    for n_sem in n_sems:
        df_all += [read_csv(m, n_sem) for m in models]
    df_all = pd.concat(df_all)
    for name in scores:
        fig = px.line(df_all, x="n_clusters", y=name, color='model')
        fig.show()
    
def plot_pair_index(model, n_sem):
    plot_df = read_csv(model, n_sem)
    x = plot_df['n_clusters']
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=plot_df['precision'], name='precission'))
    fig.add_trace(go.Scatter(x=x, y=plot_df['recall'], name='recall'))
    fig.add_trace(go.Scatter(x=x, y=plot_df['f1'], name='f1'))
    
    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=x, y=plot_df['tp'], name='true positive'))
    fig2.add_trace(go.Scatter(x=x, y=plot_df['fn'], name='false negative'))
    
    fig3 = go.Figure()
    fig3.add_trace(go.Scatter(x=x, y=plot_df['tn'], name='true negative'))
    fig3.add_trace(go.Scatter(x=x, y=plot_df['fp'], name='false positive'))
    
    fig.show()
    fig2.show()
    fig3.show()



In [11]:
models = ['sm_wv3_ru']
n_sems = [50, 80, 100]
plot_sem_comparing(models, n_sems, calc_scrore_names)

In [12]:
models = ['sm_ft3_ru']
n_sems = [50, 80, 100]
plot_sem_comparing(models, n_sems, calc_scrore_names)

In [17]:
models = ['sm_wv2_ru', 'sm_wv2_ru_norm']
n_sems = [100]
plot_sem_comparing(models, n_sems, calc_scrore_names[:1])

In [7]:
models = ['sm_wv3_ru']
n_sems = [40, 80, 120]
plot_sem_comparing(models, n_sems, calc_scrore_names[:2])

In [19]:
models = ['sm_ft3_ru_new', 'sm_wv3_ru_new']
n_sems = [70, 100]
plot_sem_comparing(models, n_sems, calc_scrore_names[:2])

In [5]:
models = { 'sm_wv3_ru_new/70/': 75, 
    'sm_ft3_ru_new/100/': 143,
    'sm_wv3_ru_new/100/': 167,
    'sm_ft3_ru_new/70/': 143
}


In [11]:
df_cross = pd.read_csv('cross_valid.csv')
lname = {'positive': 2, 'negative': 1}

def build_set(events):
    events = events.values.tolist()
    tp = set()
    tn = set()
    fp = set()
    fn = set()
    d = {}
    for event, l in events:
        d[event] = l
    
    for _, row in df_cross.iterrows():
        a = row['id_a']
        b = row['id_b']
        l = row['label']
        
        if d[a] == d[b] and l == lname['positive']:
            tp |= set([(a, b)])
        if d[a] != d[b] and l == lname['negative']:
            tn |= set([(a, b)])
        if d[a] == d[b] and l == lname['negative']:
            fp |= set([(a, b)])
        if d[a] != d[b] and l == lname['positive']:
            fn |= set([(a, b)])
    
    return [tp, tn, fp, fn]

tp_global = set()
tn_global = set()
fp_global = set()
fn_global = set()
is_inited = False
for m in models:
    df = pd.read_csv(f'{DATA_PATH}/tmp/{m}df.csv')
    df_scores = pd.read_csv(f'{DATA_PATH}/tmp/{m}scores.csv')
    
    tp, tn, fp, fn = build_set(df[['id', str(models[m])]])
    if not is_inited:
        tp_global = tp
        tn_global = tn
        fp_global = fp
        fn_global = fn
        is_inited = True
    else:
        tp_global &= tp
        tn_global &= tn
        fp_global &= fp
        fn_global &= fn
    

In [13]:
len(fn_global)

46

In [24]:
EVENTS_PATH = DATA_PATH + 'events/cleaned/events.csv'
df = pd.read_csv(EVENTS_PATH)
df['tags'] = df['tags'].apply(lambda s: (s[1:-1]).replace("'", "").split(","))

d = {}
for _, row in df.iterrows():
    d[row['id']] = { 'tags' : row['tags'], 'text': row['description']}

In [25]:
fn_list = list(fn_global)
ind = 0

In [38]:
from IPython.display import clear_output
clear_output(wait=True)

a = fn_list[ind][0]
b = fn_list[ind][1]
print(f"{ind}\n{a}\n{d[a]['tags']}\n\n{b}\n{d[b]['tags']}\n\n{d[a]['text']}\n\n{d[b]['text']}")
ind += 1

11
13697
['порнофильмы', '@punkmovies', '@pfvolodya', '@alexander', '@pfdrummer', '@santsancho', ' молодостьипанкрок', ' володякотляров', ' адреналинстадиум', ' oi', '@pfmerch', ' пф', '@leokray17', ' дядяволодя', ' хой', '@punkrocktrumpet', ' этопройдет', ' панки', ' панк', ' punk', '@slavaseleznev', ' allstartv', '@adrenaline_stadium']

41901
['монеточка', '@monetochkaliska', ' раскраскидлявзрослых', ' а2greenconcert', ' лизамонеточка', ' а2', ' каждыйраз', ' лиза', ' monetochkaliska', ' заря', ' мамаянезигую', ' назаре']

 кто все эти люди nсолдаут в адреналине часа драйва и бешеной энергии это был настоящий кайф punkmovies спасибо за настолько крутой концерт любовь к punkmovies дошла до сломанного железного забора и порванной футболки pfvolodya спасибо punkmovies за это безумие n n порнофильмы прости прощай привет n порнофильмы порнофильмы володякотляров punkmovies santsancho pfvolodya alexander rusakov pfdrummer девчонки вы готовы n нищая страна n пацаны n ты нам на й не нужна n п

In [29]:
plot_pair_index('sm_ft3_ru_new', 70)

In [45]:
df = pd.read_csv(DATA_PATH + 'tmp/sm_ft3_ru_new/80/df.csv')

fig = px.scatter(df, x="x", y="y", color='29', hover_name='title')
fig.show()

fig2 = px.scatter(df, x="x", y="y", color='30', hover_name='title')
fig2.show()

In [5]:
from pyclustertend import hopkins
sample_sizes = 1000

for n in samples:
    suffix = f'{n}/'
    path = TMP_PATH + suffix
    df = pd.read_csv(path + 'df.csv')
    X = np.load(path + 'X_norm.npy')
    hopkins_ans = hopkins(X, sample_sizes)
    

hopkins_ans = hopkins(X, sample_sizes)
print(hopkins_ans)

0.10408719909229433
