In [20]:
import pandas as pd
import numpy as np
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the scraped articles, keep the first row seen for each title, and
# discard the 'Unnamed: 0' column (appears to be a saved row counter).
# The original row labels are kept, so label 0 is still the first article.
trump = (
    pd.read_csv('news.csv')
    .drop_duplicates(subset='title')
    .drop(columns='Unnamed: 0')
)
trump.head()

Unnamed: 0,authors,title,publish_date,description,text,url
0,['Thomson Reuters'],"U.S. states, hospitals plead for help as Trump...",2020-03-27 13:33:00,U.S. doctors and nurses on the front lines of ...,U.S. doctors and nurses on the front lines of ...,https://www.cbc.ca/news/world/us-coronavirus-v...
1,['The Associated Press'],A defiant Maduro threatens 'cowboy' Trump afte...,2020-03-27 09:41:00,Venezuelan President Nicolas Maduro is warning...,Venezuelan President Nicolas Maduro stood defi...,https://www.cbc.ca/news/world/a-defiant-maduro...
2,"['Politics', 'Éric Grenier Is A Senior Writer'...",Trudeau rates higher than Trump but lower than...,2020-03-27 08:00:00,Two-thirds of Canadians approve of the prime m...,Two-thirds of Canadians think Prime Minister J...,https://www.cbc.ca/news/politics/grenier-trude...
3,['Cbc News'],B.C. experimenting with malaria and Ebola medi...,2020-03-29 01:50:00,Provincial health officer says trials are unde...,HIV and malaria medications are being used to ...,https://www.cbc.ca/news/canada/british-columbi...
4,['The Associated Press'],Negotiators close on nearly $2 trillion virus ...,2020-03-23 14:04:00,Top congressional and White House officials em...,Top congressional and White House officials em...,https://www.cbc.ca/news/world/united-states-co...


In [3]:
# Load the Universal Sentence Encoder (v4) module from TF Hub.
# The original also bound the module to `embed`, but that binding was
# overwritten by the function definition below, so it has been removed.
model_USE = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
print("module loaded")


def embed(input):
    """Run the loaded Universal Sentence Encoder module on ``input``.

    ``input`` is forwarded unchanged to ``model_USE``; the calls in this
    notebook pass a list of strings. The parameter name is kept as-is
    (even though it shadows the builtin) so keyword callers keep working.

    Returns whatever ``model_USE`` returns for that input.
    """
    return model_USE(input)

module loaded


In [4]:
# Title of the article labelled 0, which is used as the query document below.
trump.loc[0, 'title']

'U.S. states, hospitals plead for help as Trump approves coronavirus aid bill'

In [5]:
# Embed every article body with the Universal Sentence Encoder.
# The texts are passed as a plain list of strings; wrapping each string in
# a 0-d NumPy array (as the original did) added per-row work without
# changing what the encoder receives.
message_embeddings = embed(trump['text'].tolist())

In [6]:
# Embed the query article (label 0) on its own; the one-element list keeps
# the input in the same list-of-strings form used for the full corpus.
text_use = embed([trump.loc[0, 'text']])

In [27]:
# Score every article against the query article using the inner product of
# their Universal Sentence Encoder embeddings.
# NOTE(review): this equals cosine similarity only if the embeddings are
# unit-length -- confirm against the model documentation.
transpose_all = trump[['title', 'text']].assign(
    similarity=np.dot(np.array(text_use), np.array(message_embeddings).T)[0]
)

# Exclude the query article by its label (0) instead of assuming it sorts
# first: an article with identical text would tie with it, and a positional
# [1:11] slice could then leave the query inside its own result list.
all_doc = (
    transpose_all
    .drop(index=0)
    .sort_values(by='similarity', ascending=False)
    .head(10)
)
all_doc

Unnamed: 0,title,text,similarity
3844,U.S. coronavirus cases now highest in the world,The number of confirmed COVID-19 cases in the ...,0.8381
3799,U.S. nears goal line in coronavirus stimulus n...,Democrats and Republicans said Tuesday they we...,0.811126
8,Trump declares national emergency to contain c...,U.S. President Donald Trump announced Friday t...,0.808866
3897,Anyone leaving New York must self-isolate or r...,"With around half of the country's infections, ...",0.806216
3926,"Coronavirus impact spreads across U.S., with m...","New York, which is experiencing more coronavir...",0.805208
4018,2nd coronavirus case in U.S. with no known sou...,U.S. health officials have confirmed the secon...,0.786701
19,Trump sharply restricts travel between Europe ...,"Taking dramatic action, President Donald Trump...",0.765031
4030,"California, New York and Illinois order reside...",New York and Illinois joined California on Fri...,0.743414
3948,U.S. political leaders deal with fallout — and...,As fears about the coronavirus outbreak roiled...,0.72665
30,"Trump downplays coronavirus risk, puts Pence i...",Donald Trump declared Wednesday that the U.S. ...,0.72279


In [28]:
# Print the full (untruncated) titles of the USE matches with their scores.
use_pairs = all_doc[['title', 'similarity']].itertuples(index=False, name=None)
for title, similarity in use_pairs:
    print(title, round(similarity, 2))

U.S. coronavirus cases now highest in the world 0.84
U.S. nears goal line in coronavirus stimulus negotiations, Senate leaders say 0.81
Trump declares national emergency to contain coronavirus 0.81
Anyone leaving New York must self-isolate or risk spreading COVID-19, health officials say 0.81
Coronavirus impact spreads across U.S., with much of the population under stay-at-home orders 0.81
2nd coronavirus case in U.S. with no known source confirmed in California 0.79
Trump sharply restricts travel between Europe and U.S. for 30 days amid coronavirus pandemic 0.77
California, New York and Illinois order residents to stay home to halt coronavirus 0.74
U.S. political leaders deal with fallout — and self-quarantine — in face of coronavirus outbreak 0.73
Trump downplays coronavirus risk, puts Pence in charge of U.S. response 0.72


In [9]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-transformers checkpoint used for the
# BERT-based comparison.
BERT_MODEL_NAME = 'distilbert-base-nli-stsb-mean-tokens'
model = SentenceTransformer(BERT_MODEL_NAME)

In [10]:
# Encode every article body with the sentence-transformers model.
# A plain list is passed rather than the pandas Series: after
# drop_duplicates the Series index can have gaps, and integer indexing on a
# Series is label-based, so any positional lookup inside encode() could
# fetch the wrong row or raise KeyError.
sentence_embeddings = model.encode(trump['text'].tolist())

In [11]:
# Encode the query article (label 0) with the same sentence-transformers model.
text_bert = model.encode(trump.loc[0, 'text'])

In [29]:
# Stack the per-article embeddings into one (n_articles, dim) matrix so the
# similarity can be computed in a single call instead of once per row.
bert_matrix = np.asarray(sentence_embeddings)

transpose_bert = trump[['title', 'text']].copy()
# Store one embedding vector per row. list(...) keeps this working whether
# encode() returned a list of vectors or a single 2-D array (a 2-D array
# cannot be assigned to one column directly).
transpose_bert['embedding'] = list(bert_matrix)
# Cosine similarity between the query article and every article.
transpose_bert['similarity'] = cosine_similarity(
    np.asarray(text_bert).reshape(1, -1), bert_matrix
)[0]

# Exclude the query article by its label (0) instead of assuming it sorts
# first; an article with identical text would tie with it and a positional
# [1:11] slice could then leave the query inside its own result list.
all_doc_bert = (
    transpose_bert
    .drop(index=0)
    .sort_values(by='similarity', ascending=False)
    .head(10)
)
all_doc_bert[['title', 'text', 'similarity']]

Unnamed: 0,title,text,similarity
8,Trump declares national emergency to contain c...,U.S. President Donald Trump announced Friday t...,0.69275
3844,U.S. coronavirus cases now highest in the world,The number of confirmed COVID-19 cases in the ...,0.662754
4008,Delayed U.S. response to coronavirus could hav...,The delayed response by the U.S. government to...,0.657534
3950,Federal cabinet considers leveraging military ...,As both Italy and the United States harness th...,0.655795
30,"Trump downplays coronavirus risk, puts Pence i...",Donald Trump declared Wednesday that the U.S. ...,0.637816
4004,"'This is a crisis': Biden, Sanders offer diffe...",Former U.S. vice-president Joe Biden and Vermo...,0.631674
10,Trump's fans were shrugging off COVID-19. Now ...,U.S. conservative media have shifted to a far ...,0.616327
4896,"Oil giants set work-at-home rules for offices,...",Major energy companies in the United States im...,0.615046
2497,What a Trump win means for U.S. health care,With elevated heart rates and rising blood pre...,0.604506
4280,How the U.S. Medicare generation squashed the ...,A generation of older American voters that alr...,0.604366


In [30]:
# Print the full (untruncated) titles of the BERT matches with their scores.
bert_pairs = all_doc_bert[['title', 'similarity']].itertuples(index=False, name=None)
for title, similarity in bert_pairs:
    print(title, round(similarity, 2))

Trump declares national emergency to contain coronavirus 0.69
U.S. coronavirus cases now highest in the world 0.66
Delayed U.S. response to coronavirus could have serious consequences, health experts say 0.66
Federal cabinet considers leveraging military in mobilizing health manufacturing sector against COVID-19 0.66
Trump downplays coronavirus risk, puts Pence in charge of U.S. response 0.64
'This is a crisis': Biden, Sanders offer differing COVID-19 plans at Democratic debate 0.63
Trump's fans were shrugging off COVID-19. Now it's a war, and he's their leader 0.62
Oil giants set work-at-home rules for offices, health checks for critical staff 0.62
What a Trump win means for U.S. health care 0.6
How the U.S. Medicare generation squashed the Democrats' Medicare for All candidate 0.6


In [31]:
# Fit TF-IDF on all article bodies. The result is kept as the sparse matrix
# returned by fit_transform: converting it to a dense array allocates
# n_articles x vocabulary_size floats, which is not needed for the product
# computed below.
vectorizer = TfidfVectorizer()
sentences_Tfidf = vectorizer.fit_transform(trump['text'].tolist())

transpose_Tfidf = trump[['title', 'text']].copy()
# Dot product of the first row (the query article, label 0) with every row.
# TfidfVectorizer is used with its default settings here; with the default
# L2 row normalisation this dot product is the cosine similarity.
transpose_Tfidf['similarity'] = (
    sentences_Tfidf[0].dot(sentences_Tfidf.T).toarray().ravel()
)

# Exclude the query article by its label (0) instead of assuming it sorts
# first; an article with identical text would tie with it and a positional
# [1:11] slice could then leave the query inside its own result list.
all_doc_Tfidf = (
    transpose_Tfidf
    .drop(index=0)
    .sort_values(by='similarity', ascending=False)
    .head(10)
)
all_doc_Tfidf[['title', 'text', 'similarity']]

Unnamed: 0,title,text,similarity
3844,U.S. coronavirus cases now highest in the world,The number of confirmed COVID-19 cases in the ...,0.620289
3799,U.S. nears goal line in coronavirus stimulus n...,Democrats and Republicans said Tuesday they we...,0.585307
4360,Coronavirus: Here's what's happening in Canada...,The latest: Ottawa finalizing plan to boost p...,0.582877
3939,Coronavirus: What's happening in Canada and ar...,The latest: Health minister says all travelle...,0.58096
4010,Coronavirus: What's happening in Canada and ar...,The latest: Number of COVID-19 cases in U.S. ...,0.577371
7,Coronavirus: What's happening in Canada and ar...,"The latest: Canada now has 5,655 confirmed an...",0.568576
3897,Anyone leaving New York must self-isolate or r...,"With around half of the country's infections, ...",0.56646
3968,Coronavirus: Here's what's happening in Canada...,The latest: PM addresses Canadians again afte...,0.564619
4227,Coronavirus: Here's what's happening on March 22,The latest: Yukon reports first 2 confirmed c...,0.563312
3997,Coronavirus: What's happening in Canada and ar...,The latest: House of Commons suspends proceed...,0.546398


In [32]:
# Print the full (untruncated) titles of the TF-IDF matches with their scores.
tfidf_pairs = all_doc_Tfidf[['title', 'similarity']].itertuples(index=False, name=None)
for title, similarity in tfidf_pairs:
    print(title, round(similarity, 2))

U.S. coronavirus cases now highest in the world 0.62
U.S. nears goal line in coronavirus stimulus negotiations, Senate leaders say 0.59
Coronavirus: Here's what's happening in Canada and around the world on March 20 0.58
Coronavirus: What's happening in Canada and around the world March 25 0.58
Coronavirus: What's happening in Canada and around the world on March 26 0.58
Coronavirus: What's happening in Canada and around the world on March 28 0.57
Anyone leaving New York must self-isolate or risk spreading COVID-19, health officials say 0.57
Coronavirus: Here's what's happening in Canada and the world March 19 0.56
Coronavirus: Here's what's happening on March 22 0.56
Coronavirus: What's happening in Canada and around the world on March 24 0.55


# Resources:

[Trump News Kaggle](https://www.kaggle.com/ryanxjhan/latest-5000-trump-news-coverage)

[BERT Sentence Transformers](https://github.com/UKPLab/sentence-transformers)

[Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder/4)