In [None]:
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import output_notebook
from bokeh.palettes import Category20
import pandas as pd
from bokeh.transform import factor_cmap
import gc

# Configure bokeh so that subsequent show() calls render inline in notebook cells.
output_notebook()

In [2]:
import os

import pandas as pd

# Tab-separated news metadata for the two splits that are combined below.
TRAIN_DATA_PATH = 'data/mind_large_train/news.tsv'
TEST_DATA_PATH = 'data/mind_large_dev/news.tsv'
assert os.path.exists(TRAIN_DATA_PATH), f'news file not found: {TRAIN_DATA_PATH}'
assert os.path.exists(TEST_DATA_PATH), f'news file not found: {TEST_DATA_PATH}'

# Column names are supplied explicitly, so the first line of each file is read as data.
NEWS_COLUMNS = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
# Only these columns are kept; skipping the others at parse time avoids
# materialising them just to drop them again.
KEPT_NEWS_COLUMNS = ['news_id', 'category', 'subcategory', 'title']

# Stack both splits and remove rows that are identical across all kept columns.
df = pd.concat(
    [
        pd.read_csv(path, sep='\t', names=NEWS_COLUMNS, usecols=KEPT_NEWS_COLUMNS)
        for path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
    ]
).drop_duplicates()
gc.collect()

In [9]:
# Read the summarization-embedding table and attach the news metadata to it.
EMBEDDINGS_PATH = 'data/preprocessed/mind_small/mind_small.sumemb'
emb_df = pd.read_csv(EMBEDDINGS_PATH, sep='\t')

# Left join keeps every embedding row; an id with no match in `df` would
# leave NaN in the metadata columns, which the assert below rejects.
df_with_emb = pd.merge(emb_df, df, how='left', left_on='sid:token', right_on='news_id')
assert not df_with_emb.isna().values.any()

# The un-merged embedding frame is not used past this point.
del emb_df
gc.collect()

14

In [10]:
# Distinct news categories, in order of first appearance in `df`.
unique_genres = df['category'].unique()
num_genres = len(unique_genres)

# Category20 is a dict mapping a palette size to a tuple of colours, and only a
# limited range of sizes is defined. Indexing it directly with the category
# count raises KeyError outside that range, so clamp the size and cycle through
# the chosen palette when there are more categories than colours.
smallest_palette, largest_palette = min(Category20), max(Category20)
base_palette = Category20[min(max(num_genres, smallest_palette), largest_palette)]
colormap = tuple(base_palette[i % len(base_palette)] for i in range(num_genres))

# Explicit category -> colour lookup, aligned with `unique_genres`.
genre_colors = {genre: colormap[i] for i, genre in enumerate(unique_genres)}

In [23]:
import numpy as np
def visualize_embeddings(df: pd.DataFrame, title: str,
                         embedding_col: str,
                         embedding_count: int=500,
                         subcategory: str=None):
    """Project embeddings to 2-D with t-SNE and draw an interactive scatter plot.

    The first ``embedding_count`` rows of ``df`` are used. Points are coloured
    by the ``category`` column using the module-level ``colormap`` and
    ``unique_genres``; the hover tooltip shows ``title`` and ``category``.

    Side effects: sets bokeh's file output to ``<title in snake_case>.html``,
    shows the figure and saves it to that file.

    Args:
        df: Frame holding ``embedding_col`` plus ``category`` and ``title``.
        title: Figure title; also used to derive the output file name.
        embedding_col: Column whose cells are whitespace-separated floats.
        embedding_count: Number of leading rows to embed and plot.
        subcategory: Currently unused by this function; kept so existing
            callers that pass it keep working.

    Raises:
        ValueError: If fewer than two rows are selected.
    """
    df_tmp = df.head(embedding_count)

    # Parse the textual vectors straight into one (rows, dim) float matrix
    # instead of storing per-row arrays back into the frame.
    embedding_matrix = np.asarray(
        [[float(number) for number in row.split()] for row in df_tmp[embedding_col]],
        dtype=float,
    )
    n_samples = len(embedding_matrix)
    if n_samples < 2:
        raise ValueError(f'need at least 2 embeddings to run t-SNE, got {n_samples}')

    # t-SNE requires perplexity < n_samples; 30.0 is the library default, so
    # this only changes anything for very small selections.
    tsne = TSNE(init='pca', learning_rate='auto',
                n_components=2,
                perplexity=min(30.0, n_samples - 1),
                random_state=0xDEAD)
    embeddings_2d = tsne.fit_transform(embedding_matrix)
    print('TSNE done')

    # Keep the high-dimensional embedding column out of the plot data: the
    # glyph and tooltip never read it, yet it would be serialised into the
    # HTML document for every point.
    plot_df = df_tmp.drop(columns=[embedding_col]).assign(
        embedding_2D_0=embeddings_2d[:, 0],
        embedding_2D_1=embeddings_2d[:, 1],
    )

    output_file(filename=f"{'_'.join(title.lower().split())}.html", title=title)
    source = ColumnDataSource(plot_df)

    # Create the figure and add glyphs
    p = figure(title=title, x_axis_label='Embedding Dimension 1', y_axis_label='Embedding Dimension 2')
    p.scatter('embedding_2D_0', 'embedding_2D_1', source=source,
              color=factor_cmap('category', palette=colormap, factors=unique_genres))
    hover = HoverTool(tooltips=[('Title', '@title'), ('Category', '@category')])
    p.add_tools(hover)
    show(p)
    print("Saving")
    save(p)
    gc.collect()

In [None]:
# t-SNE plot of the first 20000 summarization embeddings.
visualize_embeddings(
    df_with_emb,
    title='Summarization Embeddings Visualization',
    embedding_col='summarization_emb:float_seq',
    embedding_count=20000,
)

In [28]:
# Read the title-embedding table and attach the news metadata to it.
EMBEDDINGS_PATH = 'data/preprocessed/mind_small/mind_small.newsemb'
emb_df = pd.read_csv(EMBEDDINGS_PATH, sep='\t')

# Left join keeps every embedding row; an id with no match in `df` would
# leave NaN in the metadata columns, which the assert below rejects.
df_with_emb = pd.merge(emb_df, df, how='left', left_on='nid:token', right_on='news_id')
assert not df_with_emb.isna().values.any()

# The un-merged embedding frame is not used past this point.
del emb_df
gc.collect()
# Show one random merged row as a sanity check.
df_with_emb.sample(1)

Unnamed: 0,nid:token,title_emb:float_seq,news_id,category,subcategory,title
46426,N27924,-0.030792 -0.050139500000000004 -0.0137535 -0....,N27924,sports,basketball_nba,"Karl-Anthony Towns, Joel Embiid trade shots on..."


In [None]:
# t-SNE plot of the first 20000 title embeddings.
visualize_embeddings(
    df_with_emb,
    title='Title Entities (MEAN) Embeddings Visualization',
    embedding_col='title_emb:float_seq',
    embedding_count=20000,
)

In [31]:
# Read the abstract-embedding table and attach the news metadata to it.
EMBEDDINGS_PATH = 'data/preprocessed/mind_small/mind_small.absemb'
emb_df = pd.read_csv(EMBEDDINGS_PATH, sep='\t')

# Left join keeps every embedding row; an id with no match in `df` would
# leave NaN in the metadata columns, which the assert below rejects.
df_with_emb = pd.merge(emb_df, df, how='left', left_on='aid:token', right_on='news_id')
assert not df_with_emb.isna().values.any()

# The un-merged embedding frame is not used past this point.
del emb_df
gc.collect()
# Show one random merged row as a sanity check.
df_with_emb.sample(1)

Unnamed: 0,aid:token,abstract_emb:float_seq,news_id,category,subcategory,title
39629,N78545,-0.004432 0.0089 0.020402 0.063929 -0.038808 0...,N78545,video,news,Tucker: You're going to be seeing a lot of Eli...


In [None]:
# t-SNE plot of the first 20000 abstract embeddings.
visualize_embeddings(
    df_with_emb,
    title='Abstract Entities (MEAN) Embeddings Visualization',
    embedding_col='abstract_emb:float_seq',
    embedding_count=20000,
)

In [34]:
# Read the classification-embedding table and attach the news metadata to it.
# NOTE(review): the extension ends in 'embb', unlike the other embedding
# files loaded in this notebook — confirm this matches the file on disk.
EMBEDDINGS_PATH = 'data/preprocessed/mind_small/mind_small.clfembb'
emb_df = pd.read_csv(EMBEDDINGS_PATH, sep='\t')

# Left join keeps every embedding row; an id with no match in `df` would
# leave NaN in the metadata columns, which the assert below rejects.
df_with_emb = pd.merge(emb_df, df, how='left', left_on='cid:token', right_on='news_id')
assert not df_with_emb.isna().values.any()

# The un-merged embedding frame is not used past this point.
del emb_df
gc.collect()
# Show one random merged row as a sanity check.
df_with_emb.sample(1)

Unnamed: 0,cid:token,clf_emb:float_seq,news_id,category,subcategory,title
26117,N87883,-0.7142041921615601 0.39405933022499084 0.2282...,N87883,sports,basketball_ncaa,River Hill field hockey falls to Urbana in sta...


In [None]:
# t-SNE plot of the first 20000 classification embeddings.
visualize_embeddings(
    df_with_emb,
    title='Classification Embeddings Visualization',
    embedding_col='clf_emb:float_seq',
    embedding_count=20000,
)