In [1]:
import torch
import pytorch_lightning as pl
import numpy as np
from gensim.models import Word2Vec
from train import Lightning

In [2]:
# Load the pretrained word2vec embeddings and the trained Lightning checkpoint.
w2v = Word2Vec.load('data/w2v/news_200d_small.bin')
model = Lightning.load_from_checkpoint('lightning_logs/final/version_2/checkpoints/epoch=1.ckpt')
# Put the model in inference mode before its outputs are visualised, so that
# train-only layers (e.g. dropout, if the encoder has any) do not add noise
# to the attention scores. NOTE(review): the encoder internals are not visible here.
model.eval()
doc_encoder = model.nrms.doc_encoder
# Word -> id and id -> word lookup tables built from the word2vec vocabulary.
w2id = {word: entry.index for word, entry in w2v.wv.vocab.items()}
id2w = {index: word for word, index in w2id.items()}


In [3]:
# Same assignment as in the cell above: binds doc_encoder to the model's
# document encoder (model.nrms.doc_encoder).
doc_encoder = model.nrms.doc_encoder

In [4]:
# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date:   2019-03-29 16:10:23
# @Last Modified by:   Jie Yang,     Contact: jieynlp@gmail.com
# @Last Modified time: 2019-04-12 09:56:12


## Convert the text/attention lists to LaTeX code, which in turn renders a text heatmap based on the attention weights.
import numpy as np

# NOTE(review): this is a one-element list holding a single punctuation string,
# not a list of individual characters; none of the functions visible in this
# cell reference it.
latex_special_token = ["!@#$%^&*()"]

def generate(text_list, attention_list, latex_file, color='red', rescale_value = True):
	"""Write a standalone LaTeX file that shades each word by its attention weight.

	Args:
		text_list: Sequence of word strings, one per heatmap cell.
		attention_list: Sequence of numeric weights, same length as ``text_list``.
		latex_file: Path of the ``.tex`` file to create (opened in write mode,
			so an existing file is overwritten).
		color: Colour name used in each word's colorbox mix expression.
		rescale_value: When true, the weights are passed through ``rescale``
			first; otherwise they are written into the colour mix as given.

	Raises:
		AssertionError: If the two lists differ in length.
	"""
	assert(len(text_list) == len(attention_list))
	if rescale_value:
		attention_list = rescale(attention_list)
	text_list = clean_word(text_list)
	# One shaded box per word; the weight becomes the colour-mix percentage.
	boxes = "".join(
		"\\colorbox{%s!%s}{"%(color, weight)+"\\strut " + word+"} "
		for word, weight in zip(text_list, attention_list)
	)
	# The preamble declares UTF8 for the CJK environment, so write the file as
	# UTF-8 explicitly instead of relying on the platform default encoding.
	with open(latex_file,'w', encoding='utf-8') as f:
		f.write(r'''\documentclass[varwidth]{standalone}
\special{papersize=210mm,297mm}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\tcbset{width=0.9\textwidth,boxrule=0pt,colback=red,arc=0pt,auto outer arc,left=0pt,right=0pt,boxsep=5pt}
\begin{document}
\begin{CJK*}{UTF8}{gbsn}'''+'\n')
		string = r'''{\setlength{\fboxsep}{0pt}\colorbox{white!0}{\parbox{0.9\textwidth}{'''+"\n"
		string += boxes
		string += "\n}}}"
		f.write(string+'\n')
		f.write(r'''\end{CJK*}
\end{document}''')

def rescale(input_list):
	"""Linearly map the given values onto the range 0-100.

	The minimum maps to 0 and the maximum to 100.

	Args:
		input_list: Array-like of numbers.

	Returns:
		The rescaled values as a (nested) Python list of floats. An empty
		input gives an empty list. If every value is identical there is no
		spread to normalise by, so all zeros are returned rather than NaN.
	"""
	the_array = np.asarray(input_list)
	if the_array.size == 0:
		# np.max/np.min raise on empty arrays; nothing to rescale anyway.
		return the_array.tolist()
	the_max = np.max(the_array)
	the_min = np.min(the_array)
	span = the_max - the_min
	if span == 0:
		# Avoid 0/0 -> NaN, which would produce an invalid colour percentage.
		return np.zeros(the_array.shape, dtype=float).tolist()
	scaled = (the_array - the_min)/span*100
	return scaled.tolist()


def clean_word(word_list):
	"""Escape LaTeX-reserved characters in every word.

	Each reserved character is replaced in a single pass, so the braces that
	belong to an inserted command are not escaped a second time.

	Args:
		word_list: Iterable of strings.

	Returns:
		A new list with the escaped words, in the same order.
	"""
	escapes = {
		# A doubled backslash is a line break in LaTeX, and a backslash-caret
		# is an accent command, so these need the dedicated text commands.
		"\\": "\\textbackslash{}",
		"^": "\\textasciicircum{}",
		"~": "\\textasciitilde{}",
		# The remaining reserved characters are escaped with a leading backslash.
		"%": "\\%",
		"&": "\\&",
		"#": "\\#",
		"_": "\\_",
		"{": "\\{",
		"}": "\\}",
		"$": "\\$",
	}
	table = str.maketrans(escapes)
	return [word.translate(table) for word in word_list]

In [5]:
# Split the headline on whitespace; words missing from the vocabulary fall back to id 0.
title = 'Best PS5 games top PlayStation 5 titles to look forward to'.split()
idx = torch.tensor([w2id.get(w.lower(), 0) for w in title])
# Take the first two outputs by position so the cell works whether
# doc_encoder returns two values or three.
outputs = doc_encoder(idx.unsqueeze(0))
o, score = outputs[0], outputs[1]
# Drop the batch dimension and convert to a plain list for the LaTeX writer.
score = score.squeeze().detach().tolist()
generate(title, score, "AANR.tex", 'red')

In [6]:
# Split the headline on whitespace; words missing from the vocabulary fall back to id 0.
title = '100 Genius Tips that Will Make Your Holidays So Much Better'.split()
idx = torch.tensor([w2id.get(w.lower(), 0) for w in title])
# Take the first two outputs by position so the cell works whether
# doc_encoder returns two values or three.
outputs = doc_encoder(idx.unsqueeze(0))
o, score = outputs[0], outputs[1]
# Drop the batch dimension and convert to a plain list for the LaTeX writer.
score = score.squeeze().detach().tolist()
generate(title, score, "sample.tex", 'red')

In [7]:
import pandas as pd
# The news file is tab-separated and has no header row, so read it raw and
# attach the column names afterwards.
news_df = pd.read_csv('data/small_dev/news.tsv', sep='\t', header=None)
news_df.columns = [
    'news_id',
    'category',
    'sub_category',
    'title',
    'abstract',
    'url',
    'title_entity',
    'abstract_entity',
]
# Show a small sample of headlines (rows 50-79) to pick examples from.
sample_titles = news_df['title'][50:80]
list(sample_titles)

[&#39;Today in History: November 2&#39;,
 &quot;25 Photos of the Royal Family at Balmoral Castle, Queen Elizabeth&#39;s Favorite Home&quot;,
 &#39;These Cranberry Sauce Recipes Are Perfect for Thanksgiving Dinner&#39;,
 &#39;Phil Fulmer blasts NCAA after key Tennessee freshman denied eligibility&#39;,
 &quot;Prince George&#39;s Royal Life in Photos&quot;,
 &#39;Can you answer these real Jeopardy questions about TV shows?&#39;,
 &quot;Randall&#39;s Rant: Is Tiger already the GOAT without 18 majors?&quot;,
 &#39;The 25 most desirable places to live in the US in 2019&#39;,
 &quot;40+ Stuffed Pasta Recipes You&#39;ll Want To Make Every Night&quot;,
 &quot;&#39;Bachelor&#39;s Amanda Stanton Is Dating &#39;Rich Kids of Beverly Hills&#39; Star Brendan Fitzpatrick&quot;,
 &quot;I&#39;m Sorry, But Please Do Not Put Your Purse On My Couch&quot;,
 &quot;It&#39;s Not All About the Corgis - Here Are the Royal Family&#39;s Other Beloved Pets&quot;,
 &#39;2020 Toyota Supra GT4 details announced, make