In [None]:
import os
# News metadata files, given relative to the notebook's working directory.
# The "test" path points at the dev split directory.
TRAIN_DATA_PATH = 'data/mind_large_train/news.tsv'
TEST_DATA_PATH = 'data/mind_large_dev/news.tsv'

# Stop immediately if either input is absent, before any parsing is attempted.
assert all(map(os.path.exists, (TRAIN_DATA_PATH, TEST_DATA_PATH)))



import pandas as pd

# Column names for news.tsv; passing them via `names=` means the files are
# read as having no header row.
NEWS_COLUMNS = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']


def _build_clf_frame(news_df, fill_title):
    """Reduce a raw news frame to ``category`` and ``text``, indexed by ``news_id``.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Frame containing at least ``news_id``, ``category``, ``title`` and
        ``abstract`` columns.
    fill_title : bool
        When true, missing titles are replaced by ``''`` before the text is
        built. Missing abstracts are always replaced by ``''``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``news_id`` with columns ``category`` and ``text``,
        with duplicate rows removed.
    """
    # Explicit copy plus plain assignment instead of `col.fillna(inplace=True)`:
    # the in-place form on a column of a selected frame is chained assignment,
    # which warns on recent pandas and does nothing under Copy-on-Write.
    clf = news_df[['news_id', 'category', 'title', 'abstract']].copy()
    clf['abstract'] = clf['abstract'].fillna('')
    if fill_title:
        clf['title'] = clf['title'].fillna('')

    # Title and abstract are concatenated directly, with no separator.
    clf['text'] = clf['title'] + clf['abstract']
    clf = clf.drop(columns=['title', 'abstract']).set_index('news_id')

    # drop_duplicates compares column values only (category, text); the
    # news_id index takes no part in the comparison.
    return clf.drop_duplicates()


train_df = pd.read_csv(TRAIN_DATA_PATH, sep='\t', names=NEWS_COLUMNS)
test_df = pd.read_csv(TEST_DATA_PATH, sep='\t', names=NEWS_COLUMNS)

# Train titles are deliberately not filled: a missing title must trip the
# assertion below rather than be papered over.
train_df_clf = _build_clf_frame(train_df, fill_title=False)
assert train_df_clf.text.isna().sum() == 0

test_df_clf = _build_clf_frame(test_df, fill_title=True)

# Combine both splits and drop rows whose (category, text) repeat across them.
df_clf = pd.concat([train_df_clf, test_df_clf]).drop_duplicates()


In [None]:
import json
# with open('embs.json', 'w', encoding='utf-8') as f:
#     json.dump(embs, f, ensure_ascii=False, indent=4)

In [None]:
# Read the cached embeddings back from disk. The loaded mapping is later
# compared against the stringified row labels of df_clf.
with open('embs.json', mode='r', encoding='utf-8') as embs_file:
    check = json.load(embs_file)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode any text missing from the cache.
EMB_MODEL_NAME = 'all-MiniLM-L6-v2'
model_emb = SentenceTransformer(EMB_MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
# model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")

In [None]:
# Move news_id out of the index into a regular column so the rows get a fresh
# 0..n-1 index; the embedding cache is looked up by those row labels.
# Guarded on the index name so that re-running this cell does not reset the
# index a second time and add a stray 'index' column to the frame.
if df_clf.index.name == 'news_id':
    df_clf = df_clf.reset_index()

In [None]:
# Row labels (as strings) that have no entry in the embedding cache.
cached_keys = set(check)
diff = {str(row_label) for row_label in df_clf.index} - cached_keys

# At most three rows may be missing from the cache; anything more fails here.
assert len(diff) < 4

In [None]:
# Encode the text of every row that is absent from the cache and store the
# resulting vector (as a plain list) under that row's string label.
for missing_key in diff:
    missing_text = df_clf.loc[int(missing_key), 'text']
    check[missing_key] = model_emb.encode(missing_text).tolist()

In [None]:
# Every row label must now have an embedding in the cache.
assert {str(row_label) for row_label in df_clf.index}.issubset(check)

In [None]:
# Look each row's embedding up by its own label instead of relying on the
# cache's insertion order: entries added after loading sit at the end of the
# dict, so `list(check.values())` would attach vectors to the wrong rows (and
# would raise on a length mismatch if the cache held extra keys).
df_clf['summarization_emb:float_seq'] = [check[str(row_label)] for row_label in df_clf.index]

In [None]:
# Display the frame to inspect it after the embedding column was attached.
df_clf

In [None]:
from sklearn.manifold import TSNE

In [None]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import output_notebook
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [None]:
# Project the embedding vectors down to two dimensions with t-SNE.
# The random state is fixed so repeated runs use the same seed.
embedding_rows = df_clf['summarization_emb:float_seq'].tolist()
tsne = TSNE(random_state=0xDEAD, n_components=2)
embeddings_2d = tsne.fit_transform(embedding_rows)

In [None]:
# Store the two t-SNE coordinates as separate columns; transposing the
# (rows, 2) result yields one array per coordinate.
df_clf['embedding_2D_0'], df_clf['embedding_2D_1'] = embeddings_2d.T

In [None]:
from bokeh.palettes import Category20
unique_genres = df_clf['category'].unique()
num_genres = len(unique_genres)

# Define the colormap.
# Category20 is a dict of palettes keyed by size and only covers a limited
# range of sizes, so indexing it directly with the category count raises
# KeyError outside that range. Clamp to the nearest available palette and
# cycle/truncate it to exactly one colour per category. When the count is a
# supported size, the result is the same palette as before.
palette_size = min(max(num_genres, min(Category20)), max(Category20))
base_palette = Category20[palette_size]
colormap = tuple(base_palette[i % len(base_palette)] for i in range(num_genres))

# Explicit category -> colour lookup, in order of first appearance.
genre_colors = dict(zip(unique_genres, colormap))

In [None]:
from bokeh.transform import factor_cmap
# Upper bound on the number of rows sent to the plot (taken from the top of
# the frame).
MAX_PLOT_POINTS = 20000

# Hand Bokeh only the columns the glyphs and the hover tool actually use.
# Passing the whole frame would also serialise every row's full embedding
# vector into the notebook output, which the plot never reads.
plot_columns = ['embedding_2D_0', 'embedding_2D_1', 'text', 'category']
source = ColumnDataSource(df_clf.head(MAX_PLOT_POINTS)[plot_columns])

# Create the figure and add glyphs, coloured by category.
p = figure(
    title='Summarization Embeddings Visualization',
    x_axis_label='Embedding Dimension 1',
    y_axis_label='Embedding Dimension 2',
)
p.scatter(
    'embedding_2D_0',
    'embedding_2D_1',
    source=source,
    color=factor_cmap('category', palette=colormap, factors=list(unique_genres)),
)

# Show the article text and its category when hovering over a point.
hover = HoverTool(tooltips=[('Text', '@text'), ('Category', '@category')])
p.add_tools(hover)
show(p)

In [None]:
# Serialise each embedding as a single space-separated string of its values.
df_clf["summarization_emb:float_seq"] = df_clf["summarization_emb:float_seq"].map(
    lambda vector: ' '.join(map(str, vector))
)

In [None]:
# Keep only the id and the serialised embedding, rename the id column to the
# header expected in the output file, and write a tab-separated file.
df_clf = (
    df_clf
    .drop(columns=["category", "text", "embedding_2D_0", "embedding_2D_1"])
    .rename(columns={"news_id": "sid:token"})
)
df_clf.to_csv('mind_large.sumemb', sep='\t', index=False)

In [None]:
# Spot-check three randomly chosen rows of the frame (no seed is set, so the
# rows differ between runs).
df_clf.sample(3)