# Word Embedding Analysis

https://medium.com/@Suraj_Yadav/understanding-intra-cluster-distance-inter-cluster-distance-and-dun-index-a-comprehensive-guide-a8de726f5769

## load data

In [3]:
import pandas as pd
import glob
from tqdm import tqdm

import json
from datetime import datetime

In [4]:
# Path to the directory containing your Parquet files
parquet_files = glob.glob('./data/headlines/embedding/*.parquet')

# Read each Parquet file into a DataFrame and store them in a list
dfs = [pd.read_parquet(file) for file in parquet_files]

# Concatenate all DataFrames in the list into a single DataFrame
df_word_embedding = pd.concat(dfs, ignore_index=True)

In [5]:
# convert datetime
df_word_embedding["date_collected"] = df_word_embedding.apply(lambda x : datetime.strptime(x["date_collected"][:8], "%Y%m%d"), axis = 1)

In [6]:
df_word_embedding.memory_usage(deep=True).sum() / (1024 ** 2)

172.87728595733643

In [None]:
file_path = './data/site_leanings.json'

with open(file_path, 'r') as file:
    site_leaning = json.load(file)

In [None]:
l = [k for k, v in site_leaning.items() if v == "right" ]
len(l)

In [None]:
site_leaning.values

In [None]:
for k, v in site_leaning.items():
    print(v)

## Wordnet Antonyms

## Comparision between Clusters

In [12]:
from util import *

In [13]:
# Path to your JSON file
file_path = './data/site_leanings.json'

# Loading the JSON content from the file
with open(file_path, 'r') as file:
    left_or_right = json.load(file)

In [14]:
tqdm.pandas()
df_word_embedding["leaning"] = df_word_embedding.progress_apply(lambda x : left_or_right[x["siteName"]], axis = 1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 267636/267636 [00:02<00:00, 96801.63it/s]


In [15]:
# Initialize an empty dictionary to store the centroids
centroids = {}

# Iterate over each group
for group_name, group_df in df_word_embedding.groupby('leaning'):
    # Drop the 'group' column to only include embedding dimensions
    group_df = group_df.drop('leaning', axis=1)
    
    # Calculate centroid for the current group
    centroid = cluster_centroid(group_df["embedding"])
    
    # Store the centroid using the group name as the key
    centroids[group_name] = centroid

In [16]:
left_to_right_vec = centroids["right"] - centroids["left"]

In [9]:
df_antonyms = pd.read_parquet("data/df_antonyms.parquet")
df_antonyms

Unnamed: 0,adj_1,adj_2,adj_1_embedding,adj_2_embedding
0,comparable,incomparable,"[-0.019301222637295723, 0.011061019264161587, ...","[0.012978984974324703, -0.0027543564792722464,..."
1,proved,unproved,"[0.016204241663217545, 0.016003673896193504, -...","[0.015760842710733414, 0.0006610580021515489, ..."
2,fatty,nonfat,"[-0.006109761074185371, 0.018375856801867485, ...","[-0.0030756292399019003, 0.006370946299284697,..."
3,disposable,nondisposable,"[-0.06539838016033173, 0.002771256258711219, 0...","[-0.046912819147109985, -0.0005513812648132443..."
4,shrinkable,unshrinkable,"[-0.019847901538014412, -0.008843732997775078,...","[-0.001083246199414134, -0.022068507969379425,..."
...,...,...,...,...
1823,appealing,unappealing,"[-0.00962099339812994, 0.022533150389790535, -...","[-0.006122448015958071, -0.01592993177473545, ..."
1824,precocious,retarded,"[0.009357798844575882, 0.02012326754629612, -0...","[-0.008591502904891968, 0.004101067315787077, ..."
1825,breathing,breathless,"[-0.0037388750351965427, 0.01762058027088642, ...","[0.004579761065542698, 0.022734813392162323, 0..."
1826,analogue,digital,"[-0.026716582477092743, -0.009638152085244656,...","[-0.01176784373819828, 0.012267342768609524, -..."


In [69]:
def calc_scale_leaning_similarity(x, left_to_right_vec):
    scale_vec = x["adj_1_embedding"] - x["adj_2_embedding"]
    return abs(get_cos_similiarity(scale_vec, left_to_right_vec))

In [75]:
# no general pattern
df_antonyms["scale_leaning_similarity_general"] = df_antonyms.apply(lambda x : calc_scale_leaning_similarity(x, left_to_right_vec), axis = 1)
df_antonyms.sort_values(by='scale_leaning_similarity', ascending=False).head(30)

Unnamed: 0,adj_1,adj_2,adj_1_embedding,adj_2_embedding,scale_leaning_similarity
1246,topless,topped,"[0.023438846692442894, -0.02973170392215252, -...","[-0.011330758221447468, 0.0009694399195723236,...",0.154941
74,exploited,unexploited,"[-0.014310876838862896, -0.020294489338994026,...","[-0.00015983838238753378, 0.001522005768492817...",0.153948
505,alcoholic,nonalcoholic,"[0.0101753119379282, -0.0029956118669360876, -...","[0.022629011422395706, -0.005034586414694786, ...",0.145844
603,immodest,modest,"[-0.027822980657219887, 0.002553241793066263, ...","[-0.03664353862404823, 0.032205674797296524, -...",0.129883
283,concentric,eccentric,"[0.015682131052017212, 0.022552788257598877, -...","[0.02611115761101246, -0.00598837248980999, -0...",0.129735
103,nonwoody,woody,"[0.0008258971502073109, 0.004422285128384829, ...","[0.0053856149315834045, 0.012899205088615417, ...",0.128246
515,publishable,unpublishable,"[-0.002544285962358117, 0.005828400142490864, ...","[-0.01697021722793579, -0.03667968511581421, -...",0.127214
812,bloodless,bloody,"[0.04127216711640358, -0.023229777812957764, 0...","[0.028820184990763664, -0.007292380556464195, ...",0.124714
917,disorderly,orderly,"[-0.0015191871207207441, 0.009042780846357346,...","[-0.002776528475806117, 0.00683887954801321, -...",0.123073
393,backhand,forehand,"[-0.011304805986583233, -0.017085092142224312,...","[-0.013314391486346722, -0.0032293039839714766...",0.120625


## Comparision between Biden related Terms

In [21]:
def filter_df_by_NE(df, NE):
    filtered_df = df[df['NE'].apply(lambda x: len(set(x) & set(NE)) > 0)]
    return filtered_df

def get_left_to_right_vec(df):
    centroids = {}
    for group_name, group_df in df.groupby('leaning'):
        # Drop the 'group' column to only include embedding dimensions
        group_df = group_df.drop('leaning', axis=1)
        
        # Calculate centroid for the current group
        centroid = cluster_centroid(group_df["embedding"])
        
        # Store the centroid using the group name as the key
        centroids[group_name] = centroid

    left_to_right_vec = centroids["right"] - centroids["left"]
    return left_to_right_vec

def calc_scale_leaning_similarity(x, left_to_right_vec):
    scale_vec = x["adj_1_embedding"] - x["adj_2_embedding"]
    return get_cos_similiarity(scale_vec, left_to_right_vec)
    
def calc_scale_leaning_similarity_df(df_word_embedding, df_antonyms):
    left_to_right_vec = get_left_to_right_vec(df_word_embedding)
    result = []
    for x in df_antonyms.apply(lambda x : [x["adj_1"],x["adj_2"], calc_scale_leaning_similarity(x, left_to_right_vec)], axis = 1):
        if x[2] > 0:
            result.append([x[1], x[0], x[2]])
        else:
            result.append([x[0], x[1], x[2]])

    return pd.DataFrame(result, columns = ["Left_adj", "Right_adj", "scale_leaning_similarity"])

df_biden = filter_df_by_NE(df_word_embedding, ["Biden", "Joe Biden"])
calc_scale_leaning_similarity_df(df_biden, df_antonyms).sort_values(by='scale_leaning_similarity', ascending=False).head(30)

Unnamed: 0,Left_adj,Right_adj,scale_leaning_similarity
1717,respectful,disrespectful,0.194835
1707,pro-American,anti-American,0.184464
603,modest,immodest,0.183473
125,integrative,disintegrative,0.15818
542,orienting,disorienting,0.157783
1467,warmhearted,coldhearted,0.155875
1716,moderate,immoderate,0.154373
85,polite,impolite,0.153785
164,pleasing,displeasing,0.153371
988,honorable,dishonorable,0.152497


In [22]:
df_trump = filter_df_by_NE(df_word_embedding, ["Trump", "Donald Trump"])
calc_scale_leaning_similarity_df(df_trump, df_antonyms).sort_values(by='scale_leaning_similarity', ascending=False).head(30)

Unnamed: 0,Left_adj,Right_adj,scale_leaning_similarity
835,substantive,adjective,0.149455
971,unowned,owned,0.118999
625,nonbearing,bearing,0.113781
94,hearing,deaf,0.111707
1581,unobvious,obvious,0.110554
1443,unmade,made,0.107712
1289,uncoated,coated,0.104946
1334,undemocratic,democratic,0.104555
380,unenthusiastic,enthusiastic,0.103918
1322,white-collar,blue-collar,0.101765


In [23]:
file_path = './data/dem_rep_NEs.json'

with open(file_path, 'r') as file:
    dem_or_rep = json.load(file)

In [24]:
df_dem = filter_df_by_NE(df_word_embedding,dem_or_rep["Dem"])
calc_scale_leaning_similarity_df(df_dem, df_antonyms).sort_values(by='scale_leaning_similarity', ascending=False).head(30)

Unnamed: 0,Left_adj,Right_adj,scale_leaning_similarity
603,modest,immodest,0.173465
1717,respectful,disrespectful,0.172078
1707,pro-American,anti-American,0.157856
1716,moderate,immoderate,0.154319
1467,warmhearted,coldhearted,0.152904
505,nonalcoholic,alcoholic,0.147991
1362,temperate,intemperate,0.141153
74,unexploited,exploited,0.140025
435,unprovocative,provocative,0.137234
267,tolerable,intolerable,0.135585


In [25]:
df_rep = filter_df_by_NE(df_word_embedding,dem_or_rep["Rep"])
calc_scale_leaning_similarity_df(df_rep, df_antonyms).sort_values(by='scale_leaning_similarity', ascending=False).head(30)

Unnamed: 0,Left_adj,Right_adj,scale_leaning_similarity
835,substantive,adjective,0.155476
625,nonbearing,bearing,0.124878
971,unowned,owned,0.122348
1581,unobvious,obvious,0.118718
1289,uncoated,coated,0.115633
1443,unmade,made,0.115161
380,unenthusiastic,enthusiastic,0.111952
817,uncommitted,committed,0.111749
259,passionless,passionate,0.1108
1334,undemocratic,democratic,0.107882


In [26]:
# antonyms may lack context

In [41]:
# Sample DataFrame creation
np.random.seed(42)  # For reproducible output

# Function to sample n_samples from each group
def sample_n_from_group(df, n_samples):
    sampled_df = df.groupby('leaning').apply(lambda x : x.sample(n=n_samples, replace=True)).reset_index(drop=True)
    return sampled_df

In [42]:
sample_n_from_group(df_Haley, 100)

  sampled_df = df.groupby('leaning').apply(lambda x : x.sample(n=n_samples, replace=True)).reset_index(drop=True)


Unnamed: 0,url,headline,datePublished_site,probability,date_collected,siteName,NE,embedding,leaning
0,https://thehill.com/homenews/campaign/4434686-...,Haley dismisses SC endorsements for Trump: ‘Yo...,,0.730285,2024-01-28,TheHill,"[Haley, SC, Trump]","[0.023159081, -0.045888543, -0.021292929, -0.0...",center
1,https://thehill.com/homenews/campaign/4418233-...,Haley calls out Trump on false claims about Ne...,2024-01-19T14:35:00-05:00,0.983203,2024-01-19,TheHill,"[Haley, Trump, New Hampshire]","[0.01650341, -0.026753427, -0.018798003, -0.02...",center
2,https://thehill.com/homenews/campaign/4390431-...,DeSantis appears to jab Haley by giving CNN an...,2024-01-05T02:57:16.149264,0.666339,2024-01-04,TheHill,"[DeSanti, Haley, CNN, Iowa]","[-0.0051982147, -0.019941136, -0.026498051, -0...",center
3,https://thehill.com/homenews/campaign/4435118-...,Haley defends verdict against Trump in Carroll...,2024-01-29T07:35:00-05:00,0.983575,2024-01-29,TheHill,"[Haley, Trump, Carroll]","[0.040384267, -0.023268428, -0.02491418, -0.00...",center
4,,Donald Trump and Nikki Haley's Chances in Sout...,2024-01-25T00:00:00,0.351000,2024-01-25,Newsweek,"[Donald Trump, Nikki Haley, Chance, South Caro...","[0.003954346, -0.046195626, -0.021976799, 0.00...",center
...,...,...,...,...,...,...,...,...,...
295,https://www.foxnews.com/media/desantis-wouldnt...,DeSantis says he wouldn't accept role as Nikki...,,0.944216,2024-01-06,FoxNews(OnlineNews),"[DeSantis, Nikki Haley]","[0.0087774, -0.0379692, -0.00937315, -0.006558...",right
296,https://www.foxnews.com/video/5614615980001,Nikki Haley swiftly loses home state in South ...,,0.966399,2024-02-24,FoxNews(OnlineNews),"[Nikki Haley, South Carolina]","[0.0041270955, -0.0515003, -0.0031943843, -0.0...",right
297,https://www.breitbart.com/politics/2024/01/02/...,Nikki Haley Tries to Bully Trump into Particip...,,0.944960,2024-01-02,BreitbartNews,"[Nikki Haley, Trump, CNN Debate]","[-0.010216404, -0.07572236, -0.017827192, -0.0...",right
298,https://www.dailymail.co.uk/news/article-12990...,'That is the sound of a two-person race': Nikk...,,0.351000,2024-01-22,DailyMail,"[Nikki Haley, Ron DeSantis, White House, New H...","[0.025603542, -0.062353633, -0.018717265, -0.0...",right


In [43]:
Haley_NEs = ["Haley", "Nikki Haley"]

df_Haley = filter_df_by_NE(df_word_embedding, Haley_NEs)
df_Haley = sample_n_from_group(df_Haley, 100)
calc_scale_leaning_similarity_df(df_Haley, df_antonyms).sort_values(by='scale_leaning_similarity', ascending=False).head(30)

  sampled_df = df.groupby('leaning').apply(lambda x : x.sample(n=n_samples, replace=True)).reset_index(drop=True)


Unnamed: 0,Left_adj,Right_adj,scale_leaning_similarity
452,unfortunate,fortunate,0.127046
841,undeclared,declared,0.126955
687,unfinished,finished,0.125646
1581,unobvious,obvious,0.120649
955,inglorious,glorious,0.116648
1692,unsolved,solved,0.107224
853,subordinate,dominant,0.10699
1797,sorrowful,joyful,0.106091
1535,stingy,generous,0.104472
1113,unfilled,filled,0.103055


In [44]:
Carroll_NEs = ["Carroll", "Jean Carroll"]

df_Carroll = filter_df_by_NE(df_word_embedding, Carroll_NEs)
df_Carroll = sample_n_from_group(df_Carroll, 100)
calc_scale_leaning_similarity_df(df_Carroll, df_antonyms).sort_values(by='scale_leaning_similarity', ascending=False).head(30)

  sampled_df = df.groupby('leaning').apply(lambda x : x.sample(n=n_samples, replace=True)).reset_index(drop=True)


Unnamed: 0,Left_adj,Right_adj,scale_leaning_similarity
885,unargumentative,argumentative,0.143411
1781,diffident,confident,0.141213
432,unexpansive,expansive,0.137762
579,noneffervescent,effervescent,0.121835
539,unexcited,excited,0.116494
891,unendowed,endowed,0.114275
889,provincial,cosmopolitan,0.113915
380,unenthusiastic,enthusiastic,0.113792
239,inconspicuous,conspicuous,0.111089
1490,unrigged,rigged,0.106709


In [36]:
df_Carroll_sample = []

for x in df_Carroll.groupby("leaning")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8e992c34f0>

# Word Embedding Trend

In [147]:
df_dem

Unnamed: 0,url,headline,datePublished_site,probability,date_collected,siteName,NE,embedding,leaning
3,https://nypost.com/2024/01/23/news/hunter-bide...,Hunter Biden's 'sugar brother' side steps ques...,,0.922552,2024-01-23,NewYorkPost(News),"[Hunter Biden, Biden, Joe]","[0.043067176, 0.004102276, -0.020094428, 0.015...",right
6,https://nypost.com/2024/01/23/news/everything-...,Why Biden isn't on the ballot and everything e...,,0.800978,2024-01-23,NewYorkPost(News),"[Biden, NH]","[0.05203508, -0.05474995, -0.018947555, -0.004...",right
68,https://www.newsweek.com/fani-willis-allegatio...,Fani Willis Allegations Playing Right Into Don...,2024-01-23T15:59:33.385169,0.846905,2024-01-23,Newsweek,"[Fani Willis, Donald Trump]","[0.043295402, -0.041012503, -0.017214634, -0.0...",center
89,,"Biden, Harris stress abortion rights on Roe an...",,0.536566,2024-01-23,Reuters,"[Biden, Harris, Roe]","[-0.009574942, -0.03327966, -0.015366436, 0.00...",center
99,https://slate.com/news-and-politics/2024/01/bi...,Democrats Got Themselves a Big Ol’ Mess in New...,,0.547724,2024-01-23,Slate,"[Democrats, Ol, New Hampshire]","[0.022481, -0.01014155, -0.013974309, -0.00257...",left
...,...,...,...,...,...,...,...,...,...
267587,https://spectator.org/gen-z-pop-star-olivia-ro...,Gen Z Pop Star Olivia Rodrigo Shills for Democ...,,0.707844,2024-02-29,TheAmericanSpectator,"[Gen Z, Olivia Rodrigo Shills, Democrats]","[-0.029649565, -0.00136477, -0.024635501, 0.00...",right
267596,https://dailycaller.com/2024/02/29/hunter-bide...,Hunter Biden Only Remembers One Thing About Me...,,0.966366,2024-02-29,TheDailyCaller,[Hunter Biden],"[0.04360622, -0.010299826, -0.015516954, -0.00...",right
267601,https://dailycaller.com/2024/02/29/judge-jeani...,'So Let More People In?': 'The Five' Panel Eru...,,0.969685,2024-02-29,TheDailyCaller,"[Biden, Congress]","[0.0044886856, 0.0037715994, -0.012907557, 0.0...",right
267625,https://thehill.com/homenews/state-watch/44993...,Gavin Newsom says Panera not exempt from Calif...,,0.631573,2024-02-29,TheHill,"[Gavin Newsom, Panera, California]","[-0.07586138, -0.009234161, -0.0019145188, 0.0...",center


In [146]:
df_rep

Unnamed: 0,url,headline,datePublished_site,probability,date_collected,siteName,NE,embedding,leaning
35,https://nypost.com/2024/01/23/news/new-hampshi...,"Get the latest updates, polls as Trump and Hal...",,0.976848,2024-01-23,NewYorkPost(News),"[Trump, Haley]","[-0.008951688, -0.03357427, -0.014612635, -0.0...",right
48,https://www.nytimes.com/interactive/2024/01/23...,Delays to four criminal trials could pay off f...,2024-01-23T15:59:13.600087,0.700171,2024-01-23,NewYorkTimes(News),[Donald Trump],"[0.017039325, 0.0017712428, -0.022057582, 0.05...",left
66,https://www.newsnationnow.com/politics/2024-el...,"Trump vs. Haley: Stance on retirement, social ...",2024-01-23T15:05:14.871853,0.971077,2024-01-23,NewsNation,"[Trump, Haley, Stan]","[-0.015961787, -0.024678214, -0.015645575, 0.0...",center
68,https://www.newsweek.com/fani-willis-allegatio...,Fani Willis Allegations Playing Right Into Don...,2024-01-23T15:59:33.385169,0.846905,2024-01-23,Newsweek,"[Fani Willis, Donald Trump]","[0.043295402, -0.041012503, -0.017214634, -0.0...",center
69,https://www.newsweek.com/new-hampshire-trump-h...,How Nikki Haley Can Pull Off a New Hampshire U...,2024-01-23T15:56:33.389133,0.831250,2024-01-23,Newsweek,"[Nikki Haley, New Hampshireset]","[0.018094726, -0.061073165, -0.017928464, -0.0...",center
...,...,...,...,...,...,...,...,...,...
267621,https://thehill.com/homenews/campaign/4499352-...,Trump on abortion limits: ‘I’m hearing about 1...,2024-02-29T22:21:00-05:00,0.990374,2024-02-29,TheHill,[Trump],"[-0.033676043, 0.007807225, -0.014627249, -0.0...",center
267622,https://thehill.com/homenews/campaign/4499299-...,Trump says local police will be key to mass de...,2024-02-29T21:43:00-05:00,0.981977,2024-02-29,TheHill,[Trump],"[-0.015617153, -0.0158415, -0.0138472915, 0.01...",center
267623,https://thehill.com/homenews/campaign/4499397-...,Trump says Texas governor is on shortlist for VP,2024-02-29T22:48:00-05:00,0.989880,2024-02-29,TheHill,"[Trump, Texas, VP]","[0.016909795, 0.010269523, -0.0077965944, 0.01...",center
267627,https://thehill.com/homenews/campaign/4499199-...,Kari Lake says if Trump is reelected wars in U...,2024-03-01T02:07:15.169570,0.864331,2024-02-29,TheHill,"[Kari Lake, Trump, Ukraine, Gaza]","[0.0014189703, -0.051237423, -0.01288996, 0.01...",center


In [142]:
Trump_NEs = ["Trump", "Donald Trump"]
Biden_NEs = ["Biden", "Joe Biden"]

df_Biden = filter_df_by_NE(df_word_embedding, Biden_NEs)
df_Trump = filter_df_by_NE(df_word_embedding, Trump_NEs)

In [148]:
for group_name, group_df in df_dem.groupby("date_collected"):
    print(f"Group: {group_name}")
    print(calc_scale_leaning_similarity_df(group_df, df_antonyms).sort_values(by='scale_leaning_similarity', ascending=False).head(1))

Group: 2024-01-01 00:00:00
           adj_1     adj_2  scale_leaning_similarity
1716  immoderate  moderate                  0.136568
Group: 2024-01-02 00:00:00
         adj_1     adj_2  scale_leaning_similarity
439  high-rise  low-rise                  0.122075
Group: 2024-01-03 00:00:00
         adj_1        adj_2  scale_leaning_similarity
274  dignified  undignified                  0.146632
Group: 2024-01-04 00:00:00
      adj_1       adj_2  scale_leaning_similarity
27  apteral  peripteral                  0.129026
Group: 2024-01-05 00:00:00
         adj_1      adj_2  scale_leaning_similarity
979  exclusive  inclusive                  0.127896
Group: 2024-01-06 00:00:00
       adj_1   adj_2  scale_leaning_similarity
1674  gabled  hipped                  0.117429
Group: 2024-01-07 00:00:00
         adj_1      adj_2  scale_leaning_similarity
150  acceptive  rejective                  0.154029
Group: 2024-01-08 00:00:00
        adj_1   adj_2  scale_leaning_similarity
603  immodest  mod

In [136]:
df_Biden.groupby("date_collected")["leaning"].value_counts()

date_collected  leaning
2024-01-01      right       46
                left        24
                center      13
2024-01-02      right       43
                left        19
                          ... 
2024-02-28      right      115
                center      52
2024-02-29      right      141
                left       133
                center      55
Name: count, Length: 180, dtype: int64

In [144]:
# co-occurance sentiment?
# should only focus on major political figures or aggregated analysis  for longitudinal analysis

In [145]:
# combining word space with embedding space