In [156]:
import pandas as pd
import numpy as np
import sys

from transformers import BertTokenizer, BertModel
import torch

import pandas as pd
# Limit DataFrame reprs in this notebook to at most 10 rows.
pd.set_option('display.max_rows', 10)

In [2]:
# Read the two CSV inputs from the results/ folder.
# A third input is left disabled:
#   user_resp = pd.read_csv('results/user_response.csv')
news, news_tweets = (
    pd.read_csv(csv_path)
    for csv_path in ('results/news.csv', 'results/news_tweets.csv')
)

In [45]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# NLTK's English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Additional lower-cased tokens that preprocess() removes alongside the stop words.
exclude_words = [
    'donald', 'trump',
    'joe', 'biden',
    'president', 'ad',
]

def preprocess(txt):
    """Remove stop words and excluded tokens from every string in ``txt``.

    Each element is split with ``word_tokenize``. A token is discarded when its
    lower-cased form appears in ``stop_words`` or ``exclude_words``; the tokens
    that remain keep their original casing and are joined with single spaces.

    Returns a list holding one cleaned string per input element, in input order.
    """

    def _keep(token):
        lowered = token.lower()
        return not (lowered in stop_words or lowered in exclude_words)

    return [" ".join(filter(_keep, word_tokenize(line))) for line in txt]

# Store the cleaned version of every title as a new column on `news`.
news['processed_title'] = preprocess(news.title.tolist())

In [46]:
# Display the column labels of `news`.
news.columns

Index(['Unnamed: 0', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'id',
       'author', 'date', 'title', 'publication', 'trump_neg', 'trump_neu',
       'trump_pos', 'biden_neg', 'biden_neu', 'biden_pos', 'trump_flag',
       'biden_flag', 'vad_neg', 'vad_neu', 'vad_pos', 'vad_comp', 'text',
       'processed_text', 'topic_ids', 'topics', 'subtopic', 'trump_pos_new',
       'trump_neg_new', 'trump_neu_new', 'biden_pos_new', 'biden_neg_new',
       'biden_neu_new', 'processed_title'],
      dtype='object')

In [24]:
# Shape of the subset whose title contains 'Trump' but not 'Biden'.
mentions_trump = news.title.str.contains('Trump')
mentions_biden = news.title.str.contains('Biden')
news[mentions_trump & ~mentions_biden].shape

(23795, 32)

In [49]:
from sentence_transformers import SentenceTransformer

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Sentence-embedding model used to embed the cleaned titles.
model_sent = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model_sent.to(device)

# encode() takes a string or a list of strings. Passing the Series itself only
# works while its index happens to be 0..n-1, so hand over a plain list; row i
# of `embedding` then corresponds to the i-th row of `news` by position.
embedding = model_sent.encode(news.processed_title.tolist())
embedding.shape

(35886, 384)

In [1]:
# news.processed_text.iloc[0]

In [76]:
def get_similarity(sent_emb, list1_index, list2_index):
    """Pairwise cosine similarity between two groups of embedding rows.

    Parameters
    ----------
    sent_emb : 2-D array-like
        Embedding matrix with one row per sentence.
    list1_index, list2_index : sequence of int
        Positional row indices into ``sent_emb`` (list, array or pandas Index).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(list1_index), len(list2_index))``; entry
        ``[i, j]`` is the cosine similarity between row ``list1_index[i]`` and
        row ``list2_index[j]``. The result is 2-D even when a group is empty.
        A zero-length vector yields NaN, which fails any ``> threshold`` test.
    """
    emb = np.asarray(sent_emb)

    # Explicit integer dtype so that empty index sequences are still valid indexers.
    rows1 = emb[np.asarray(list1_index, dtype=np.intp)].astype(float)
    rows2 = emb[np.asarray(list2_index, dtype=np.intp)].astype(float)

    norms1 = np.linalg.norm(rows1, axis=1, keepdims=True)
    norms2 = np.linalg.norm(rows2, axis=1, keepdims=True)

    # One matrix product replaces the per-pair Python loop; 0/0 for zero
    # vectors is allowed to produce NaN without emitting warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        sim = (rows1 @ rows2.T) / (norms1 * norms2.T)

    # Guard against rounding pushing values marginally outside [-1, 1].
    return np.clip(sim, -1.0, 1.0)

In [152]:
def get_L_R_news(news):
    """Split ``news`` into two frames according to its ``publication`` column.

    Returns ``(R_news, L_news)`` -- right first, despite the function name:
    ``R_news`` holds rows published by 'Breitbart News' or 'Fox News',
    ``L_news`` holds rows published by 'CNN' or 'The Washington Post'.
    Rows from any other publication appear in neither frame.
    """
    outlet = news['publication']

    is_right = (outlet == 'Breitbart News') | (outlet == 'Fox News')
    is_left = (outlet == 'CNN') | (outlet == 'The Washington Post')

    return news.loc[is_right], news.loc[is_left]

# Minimum cosine similarity for a right/left title pair to be recorded.
threshold = 0.5

# Parallel accumulators: entry k of each list describes the k-th matched pair.
right_news, left_news, similarity = [], [], []


def _append_similar_titles(r_news, l_news, keep, drop):
    """Record right/left title pairs whose embeddings are more similar than ``threshold``.

    Only titles that contain ``keep`` and do not contain ``drop`` are compared.
    Matches are appended to the module-level ``right_news``, ``left_news`` and
    ``similarity`` lists, and the shape of the similarity matrix is printed.
    """
    r_sel = r_news[r_news.title.str.contains(keep) & ~r_news.title.str.contains(drop)]
    l_sel = l_news[l_news.title.str.contains(keep) & ~l_news.title.str.contains(drop)]

    # The frame index labels are used as row positions in `embedding`, so this
    # relies on `news` still carrying its default 0..n-1 index.
    MatSim = get_similarity(embedding, r_sel.index, l_sel.index)

    # Look titles up from arrays instead of building a row Series per pair.
    r_titles = r_sel.title.to_numpy()
    l_titles = l_sel.title.to_numpy()
    for r_pos, l_pos in np.transpose((MatSim > threshold).nonzero()):
        right_news.append(r_titles[r_pos])
        left_news.append(l_titles[l_pos])
        similarity.append(MatSim[r_pos, l_pos])

    print(MatSim.shape)


for topic, grp in news.groupby('topics'):

    print(topic, grp.shape)

    r_news, l_news = get_L_R_news(grp)

    # Within each topic: Trump-only titles first, then Biden-only titles.
    _append_similar_titles(r_news, l_news, keep='Trump', drop='Biden')
    _append_similar_titles(r_news, l_news, keep='Biden', drop='Trump')

abortion (255, 33)
(79, 31)
(83, 11)
capitol (1577, 33)
(390, 293)
(149, 96)
climate (655, 33)
(76, 154)
(138, 103)
court (1277, 33)
(413, 358)
(127, 34)
covid (5265, 33)
(1265, 1618)
(297, 159)
democrats (2039, 33)
(457, 199)
(592, 259)
economy (1305, 33)
(192, 272)
(100, 71)
election (4986, 33)
(1090, 1222)
(597, 285)
first_lady (365, 33)
(120, 89)
(20, 15)
floyd (1386, 33)
(495, 269)
(154, 35)
healthcare (215, 33)
(33, 67)
(19, 9)
hunter_biden (737, 33)
(28, 14)
(509, 32)
immigration (817, 33)
(261, 164)
(150, 32)
internatinal (2392, 33)
(732, 544)
(303, 109)
other (7457, 33)
(1691, 2086)
(1093, 484)
pres_debate (710, 33)
(103, 98)
(62, 21)
proud boys (212, 33)
(39, 69)
(8, 1)
republican (870, 33)
(201, 168)
(101, 42)
security (909, 33)
(249, 301)
(47, 27)
social_media (2457, 33)
(1098, 306)
(267, 45)


In [154]:
# Gather the matched titles and their similarity scores into a single frame.
d = pd.DataFrame({'right': right_news, 'left': left_news, 'sim': similarity})

In [175]:
# Write all pairs to disk, highest similarity first.
d_ranked = d.sort_values(by='sim', ascending=False)
d_ranked.to_csv('datasets/L-vs-R_news.csv')

In [174]:
# Lift the column display limit, then list pairs with similarity above 0.7,
# highest first.
pd.set_option('display.max_columns', None)
d.loc[d.sim > 0.7].sort_values(by='sim', ascending=False)

Unnamed: 0,right,left,sim
6789,Trump Administration Asks Supreme Court to Inv...,Trump administration asks Supreme Court to inv...,1.000000
104677,Snapchat Permanently Bans President Donald Trump,Snapchat permanently bans President Trump,1.000000
64691,Bernie Sanders Endorses Joe Biden for President,Bernie Sanders endorses Joe Biden for president,1.000000
64655,Bernie Sanders Endorses Joe Biden for President,Bernie Sanders endorses Joe Biden for president,1.000000
64657,Bernie Sanders Endorses Joe Biden for President,Bernie Sanders endorses Joe Biden for president,1.000000
...,...,...,...
60879,Donald Trump: Rudy Giuliani Tested Positive fo...,Donald Trump Jr. tests positive for coronavirus,0.700136
91663,"Trump commutes Roger Stone's sentence, days be...",How Trump made the decision to commute Stone's...,0.700121
55438,Live updates: 2020 Election: Trump plans retur...,Trump's devious -- and brilliant -- coronaviru...,0.700087
33379,Pelosi dubs Trump 'Mr. Make Matters Worse' ove...,Trump says the coronavirus pandemic in the U.S...,0.700080


In [176]:
# np.transpose((MatSim>0.3).nonzero())

In [198]:
def get_dataset_with_labels(df, threshold, random_state=None):
    """Build a two-class dataset from title pairs by swapping half of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``right``, ``left`` and ``sim`` columns. It is not modified.
    threshold : float
        Only rows with ``sim > threshold`` are used.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the split can be reproduced.
        The default ``None`` draws a fresh random split on every call.

    Returns
    -------
    (df1, df2, df_)
        ``df1``: a random half of the kept rows, unchanged, with ``label`` 0.
        ``df2``: the remaining rows with the ``right`` and ``left`` column
        names exchanged, with ``label`` 1.
        ``df_``: ``df1`` stacked on top of ``df2``.
    """
    kept = df[df.sim > threshold]

    df1 = kept.sample(frac=0.5, random_state=random_state)
    df2 = kept.drop(df1.index)

    # assign() returns new frames, so no column is written into a slice of `df`.
    df1 = df1.assign(label=0)
    # A single rename maps both names at once, which swaps the two columns.
    df2 = df2.rename(columns={'right': 'left', 'left': 'right'}).assign(label=1)

    df_ = pd.concat((df1, df2), axis=0)

    return df1, df2, df_

In [203]:
# Build the labeled dataset from pairs whose similarity exceeds 0.70.
df1, df2, df_ = get_dataset_with_labels(d, threshold=0.70)

In [209]:
# Shuffle the labeled rows (a full-size sample) and write them to disk.
shuffled = df_.sample(frac=1)
shuffled.to_csv('datasets/L-vs-R-news_labeled.csv')