## Word Embedding Analysis

In [41]:
import pandas as pd, json, csv, uuid
from util import get_embedding
from tqdm import tqdm
import boto3

from datetime import datetime
from util import get_embedding

## Get Wordnet pair and their embedding

In [33]:
# https://stackoverflow.com/questions/24192979/how-to-generate-a-list-of-antonyms-for-adjectives-in-wordnet-using-python
from nltk.corpus import wordnet as wn

antonyms = []
for i in wn.all_synsets():
    if i.pos() in ['a']: 
        for j in i.lemmas(): # Iterating through lemmas for each synset.
            if j.antonyms(): # If adj has antonym.
                # Prints the adj-antonym pair.
                antonyms.append((j.name(), j.antonyms()[0].name()))

# sort antonyms
antonyms_set = set()

for x in antonyms:
    if x[0] < x[1]:
        antonyms_set.add((x[0], x[1]))
    else:
        antonyms_set.add((x[1], x[0]))

In [38]:
df_antonyms = pd.DataFrame(list(antonyms_set), columns = ["adj_1", "adj_2"])
df_antonyms

Unnamed: 0,adj_1,adj_2
0,comparable,incomparable
1,proved,unproved
2,fatty,nonfat
3,disposable,nondisposable
4,shrinkable,unshrinkable
...,...,...
1823,appealing,unappealing
1824,precocious,retarded
1825,breathing,breathless
1826,analogue,digital


In [45]:
tqdm.pandas()
df_antonyms["adj_1_embedding"] = df_antonyms.progress_apply(lambda x : get_embedding(x["adj_1"]), axis = 1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1828/1828 [09:35<00:00,  3.18it/s]


In [47]:
df_antonyms["adj_2_embedding"] = df_antonyms.progress_apply(lambda x : get_embedding(x["adj_2"]), axis = 1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1828/1828 [09:55<00:00,  3.07it/s]


In [50]:
df_antonyms.to_parquet("./data/df_antonyms.parquet")

## Get Headline embedding

File for creating word embedding

In [3]:
df = pd.read_parquet("./data/headlines/data_domestic_news_Jan_Feb_1.parquet")
df

Unnamed: 0,url,headline,datePublished_site,probability,date_collected,siteName,NE,embedding
0,https://abcnews.go.com/International/surprise-...,Queen of Denmark says she will abdicate her th...,,0.737869,2024010100,ABCNews(Online),[Denmark],"[0.017008154, -0.024301104, -0.009747937, -0.0..."
1,https://abcnews.go.com/US/man-dies-shark-encou...,Man dies after shark encounter at Maui's Paia Bay,,0.899625,2024010100,ABCNews(Online),"[Maui, Paia Bay]","[-0.016129471, -0.02739735, 0.009082557, 0.020..."
2,https://abcnews.go.com/Sports/wireStory/report...,Former cycling world champ charged in wife's d...,,0.934976,2024010100,ABCNews(Online),[],"[0.023653883, -0.008396673, -0.010792003, 0.02..."
3,https://abcnews.go.com/US/wireStory/judge-allo...,Judge rejects NAACP request to bar new court,,0.931576,2024010100,ABCNews(Online),[NAACP],"[0.023608714, -0.0006530128, -0.0013944306, 0...."
4,https://abcnews.go.com/International/russia-la...,Russia launches drone assault after Ukraine at...,,0.912954,2024010100,ABCNews(Online),"[Russia, Ukraine]","[-0.03651164, -0.008954718, -0.0038743906, -0...."
...,...,...,...,...,...,...,...,...
887980,https://www.npr.org/2024/01/23/1226291911/osca...,"'Oppenheimer' dominates the Oscar nominations,...",2024-01-23T15:55:14.065277,0.943489,2024012310,NPR(OnlineNews),"[Opheimer, Oscar, Gerwig]","[0.010090228, 0.046056487, -0.012394347, -0.00..."
887991,https://www.npr.org/2021/08/08/1024674033/ther...,The emotional rollercoaster of being a new mom,,0.754495,2024012310,NPR(OnlineNews),[],"[-0.0322049, -0.032232568, 0.003192131, -0.033..."
888009,https://nypost.com/2024/01/22/news/3-kansas-ci...,3 Kansas City Chiefs fans found days after fre...,,0.949277,2024012310,NewYorkPost(News),[Kansas City Chiefs],"[0.01873689, -0.014486628, -0.0067774444, -0.0..."
888010,https://nypost.com/2024/01/23/news/locals-nerv...,Locals 'worried' as supercar-loving Minn. bill...,,0.930251,2024012310,NewYorkPost(News),[],"[0.014685243, -0.0006382814, -0.015646959, -0...."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0 to 888011
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   url                 93829 non-null   object 
 1   headline            99999 non-null   object 
 2   datePublished_site  30139 non-null   object 
 3   probability         100000 non-null  float64
 4   date_collected      100000 non-null  object 
 5   siteName            100000 non-null  object 
 6   NE                  100000 non-null  object 
 7   embedding           100000 non-null  object 
dtypes: float64(1), object(7)
memory usage: 6.9+ MB


In [29]:
embedding_hedlines = df["headline"].dropna().unique()
embedding_hedlines

array(['Queen of Denmark says she will abdicate her throne',
       "Man dies after shark encounter at Maui's Paia Bay",
       "Former cycling world champ charged in wife's death", ...,
       'No, international court did not find Israel…',
       "The heartbreaking family stories shared at the Senate's Big Tech hearing",
       'New Mexico will not charge police officers who fatally shot man at wrong address'],
      dtype=object)

In [33]:
def send_message_to_sqs(queue_url, messages):
    sqs = boto3.client('sqs')

    messages_batch = []
    for x in tqdm(messages):
        if len(messages_batch) >= 10:
            # send message
            response = sqs.send_message_batch(
                QueueUrl=queue_url,
                Entries=messages_batch)
    
            # empty the batch
            messages_batch = []
    
        message = {"Id" : str(uuid.uuid4()), "MessageBody" : x}
        messages_batch.append(message)

    # send the remaining 
    sqs.send_message_batch(QueueUrl=queue_url, Entries=messages_batch)

In [32]:
queue_url = 'headline_embedding'

send_message_to_sqs(queue_url, embedding_hedlines)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 140017/140017 [04:05<00:00, 569.33it/s]

[{'Id': 'd444703e-c13a-4cfb-b1c9-477c4d2c1ce4', 'MessageBody': 'Check the Powerball winning numbers from…'}, {'Id': 'ad78dfe7-346a-49db-8e28-b9b6f5bcd9a9', 'MessageBody': 'Hulu is about to ban password sharing: What you…'}, {'Id': '7ae5626f-7626-4381-a69e-67b63798c0ad', 'MessageBody': 'In Supreme Court mifepristone case, junk science…'}, {'Id': 'a947b844-6996-4b80-abf7-72582a2ee358', 'MessageBody': 'Packers hire college head coach as new defensive…'}, {'Id': '88ef2585-7ac6-42b1-bf79-5a9c23915e40', 'MessageBody': 'No, international court did not find Israel…'}, {'Id': 'e34704b4-ed7d-4cf9-ae21-5415478e7879', 'MessageBody': "The heartbreaking family stories shared at the Senate's Big Tech hearing"}, {'Id': '4c0c6f1c-83f4-4dd6-b3b5-731a15c0554e', 'MessageBody': 'New Mexico will not charge police officers who fatally shot man at wrong address'}]





In [9]:
# Specify the format of the date string
date_format = "%Y%m%d%H"
max_datetime = datetime(year=2024, month=2, day=1, hour=00, minute=00, second=0)

datetime.strptime(date_string, date_format)

2024-02-01 00:00:00


## Merge to df

In [4]:
df = pd.read_parquet("./data/headlines/data_domestic_news_Jan_Feb.parquet")
df

Unnamed: 0,url,headline,datePublished_site,probability,date_collected,siteName,NE
0,https://abcnews.go.com/International/surprise-...,Queen of Denmark says she will abdicate her th...,,0.737869,2024010100,ABCNews(Online),[Denmark]
1,https://abcnews.go.com/US/man-dies-shark-encou...,Man dies after shark encounter at Maui's Paia Bay,,0.899625,2024010100,ABCNews(Online),"[Maui, Paia Bay]"
2,https://abcnews.go.com/Sports/wireStory/report...,Former cycling world champ charged in wife's d...,,0.934976,2024010100,ABCNews(Online),[]
3,https://abcnews.go.com/US/wireStory/judge-allo...,Judge rejects NAACP request to bar new court,,0.931576,2024010100,ABCNews(Online),[NAACP]
4,https://abcnews.go.com/International/russia-la...,Russia launches drone assault after Ukraine at...,,0.912954,2024010100,ABCNews(Online),"[Russia, Ukraine]"
...,...,...,...,...,...,...,...
1177990,https://www.usatoday.com/story/news/world/2024...,New study says fish discovered 3 years ago can...,,0.763477,2024022922,USATODAY,[]
1177991,https://www.usatoday.com/story/entertainment/t...,"Black astronaut Ed Dwight, reflects on 'The Sp...",,0.722521,2024022922,USATODAY,"[Black, Ed Dwight, The, Space Race]"
1177992,https://www.usatoday.com/story/news/nation/202...,"'Life-threatening' conditions, up to 10 feet o...",,0.685995,2024022922,USATODAY,[]
1178008,https://www.usatoday.com/videos/news/politics/...,"Biden, Trump visit US-Mexico border on same day",,0.448134,2024022922,USATODAY,"[Biden, Trump, US, Mexico]"


In [8]:
dynamodb = boto3.resource('dynamodb')
table_name = 'news_headlines_embeddings' 
table = dynamodb.Table(table_name)

def get_embedding(headline, table = table):
    try:
        response = table.get_item(
        Key={'headline': headline})

        # Check if the item was found and print it
        item = response['Item']
        return eval(item["embedding"])
    except:
        # key not found or None key
        # print(headline)
        return [0.0] * 3072

In [13]:
# df_1 = df.iloc[:100000]
# df_2 = df.iloc[100000:200000]
df_3 = df.iloc[200000:]

In [14]:
df_3

Unnamed: 0,url,headline,datePublished_site,probability,date_collected,siteName,NE
551326,https://www.breitbart.com/politics/2024/02/14/...,Karine Jean-Pierre: Republicans Siding with 'T...,,0.871194,2024021416,BreitbartNews,"[Karin, Jean, Pierre, Republicans, Trump, Puti..."
551330,https://www.buzzfeednews.com/article/leylamoha...,After Ryan Reynolds Hilariously Trolled Her Su...,,0.978313,2024021416,BuzzFeedNews,"[Ryan Reynolds, Super Bowl, Blake Lively, Cha]"
551331,https://www.buzzfeednews.com/article/leylamoha...,A Bunch Of Former Nickelodeon Cast And Crew Me...,,0.987414,2024021416,BuzzFeedNews,[Nickelodeon]
551332,https://www.buzzfeednews.com/article/stephanie...,Beyoncé’s Management Apparently Called And Tha...,,0.980903,2024021416,BuzzFeedNews,"[Beyoncé, Country Song]"
551368,https://www.cbsnews.com/news/kansas-city-chief...,1 killed in shooting after Kansas City Chiefs ...,2024-02-14T21:40:14.219898,0.970697,2024021416,CBSNews(Online),"[Kansas City Chiefs, Super Bowl]"
...,...,...,...,...,...,...,...
1177990,https://www.usatoday.com/story/news/world/2024...,New study says fish discovered 3 years ago can...,,0.763477,2024022922,USATODAY,[]
1177991,https://www.usatoday.com/story/entertainment/t...,"Black astronaut Ed Dwight, reflects on 'The Sp...",,0.722521,2024022922,USATODAY,"[Black, Ed Dwight, The, Space Race]"
1177992,https://www.usatoday.com/story/news/nation/202...,"'Life-threatening' conditions, up to 10 feet o...",,0.685995,2024022922,USATODAY,[]
1178008,https://www.usatoday.com/videos/news/politics/...,"Biden, Trump visit US-Mexico border on same day",,0.448134,2024022922,USATODAY,"[Biden, Trump, US, Mexico]"


In [15]:
tqdm.pandas()

df_3["embedding"] = df_3.progress_apply(lambda x : get_embedding(x["headline"]), axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67636/67636 [17:58<00:00, 62.73it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3["embedding"] = df_3.progress_apply(lambda x : get_embedding(x["headline"]), axis = 1)


In [18]:
df_3.to_parquet("./data/headlines/data_domestic_news_Jan_Feb_3.parquet")

In [17]:
df_3

Unnamed: 0,url,headline,datePublished_site,probability,date_collected,siteName,NE,embedding
551326,https://www.breitbart.com/politics/2024/02/14/...,Karine Jean-Pierre: Republicans Siding with 'T...,,0.871194,2024021416,BreitbartNews,"[Karin, Jean, Pierre, Republicans, Trump, Puti...","[0.004906118, -0.018238168, -0.03428877, -0.01..."
551330,https://www.buzzfeednews.com/article/leylamoha...,After Ryan Reynolds Hilariously Trolled Her Su...,,0.978313,2024021416,BuzzFeedNews,"[Ryan Reynolds, Super Bowl, Blake Lively, Cha]","[0.0017802261, -0.0017181633, -0.01915458, -0...."
551331,https://www.buzzfeednews.com/article/leylamoha...,A Bunch Of Former Nickelodeon Cast And Crew Me...,,0.987414,2024021416,BuzzFeedNews,[Nickelodeon],"[-0.018629, -0.03645424, -0.019808706, -0.0122..."
551332,https://www.buzzfeednews.com/article/stephanie...,Beyoncé’s Management Apparently Called And Tha...,,0.980903,2024021416,BuzzFeedNews,"[Beyoncé, Country Song]","[0.012244982, -0.036140528, -0.007203586, -0.0..."
551368,https://www.cbsnews.com/news/kansas-city-chief...,1 killed in shooting after Kansas City Chiefs ...,2024-02-14T21:40:14.219898,0.970697,2024021416,CBSNews(Online),"[Kansas City Chiefs, Super Bowl]","[0.0069411634, -0.019468537, 0.007470071, -0.0..."
...,...,...,...,...,...,...,...,...
1177990,https://www.usatoday.com/story/news/world/2024...,New study says fish discovered 3 years ago can...,,0.763477,2024022922,USATODAY,[],"[-0.019353423, -0.0029539436, -0.0129284, -0.0..."
1177991,https://www.usatoday.com/story/entertainment/t...,"Black astronaut Ed Dwight, reflects on 'The Sp...",,0.722521,2024022922,USATODAY,"[Black, Ed Dwight, The, Space Race]","[-0.00038863573, -0.021946488, -0.021370115, -..."
1177992,https://www.usatoday.com/story/news/nation/202...,"'Life-threatening' conditions, up to 10 feet o...",,0.685995,2024022922,USATODAY,[],"[-0.01681874, -0.009240714, -0.002482043, 0.02..."
1178008,https://www.usatoday.com/videos/news/politics/...,"Biden, Trump visit US-Mexico border on same day",,0.448134,2024022922,USATODAY,"[Biden, Trump, US, Mexico]","[0.017860102, 0.00048904907, -0.012666605, -0...."
