# News Filter

## Imports

In [40]:
import pandas as pd
import numpy as np

## Load and Preprocess Data

In [41]:
# Input locations, relative to the notebook's working directory.
path_to_climate_data = "../data/intermediate/headlines_with_embeddings.parquet"
# NOTE(review): the extension below is spelled "paraquet". It is left untouched
# because it has to match the file name on disk -- confirm before renaming.
path_to_news_data = "../data/processed/abcnewsFinal-100-v0.paraquet"

# Climate headlines: the embeddings are read from the LAST column of the table.
climate_data = pd.read_parquet(path=path_to_climate_data)
climate_embeddings = climate_data.iloc[:, -1]

# General news headlines: the embeddings are read from the FIRST column.
news_data = pd.read_parquet(path=path_to_news_data)
news_embeddings = news_data.iloc[:, 0]

## Create Filtering Layer

In [42]:
# Centroid ("midpoint") of the climate-headline embeddings: the element-wise
# mean across rows. Assumes every entry of `climate_embeddings` is a numeric
# vector of the same length -- TODO confirm against the upstream notebook.
midpoint = climate_embeddings.mean(axis=0)

# The centroid's norm does not depend on the headline being scored, so it is
# computed once here rather than once per row inside the lambda.
midpoint_norm = np.linalg.norm(midpoint)

# Cosine similarity between every news embedding and the climate centroid.
news_cosine_similarity = news_embeddings.apply(
    lambda embedding: embedding.dot(midpoint)
    / (np.linalg.norm(embedding) * midpoint_norm)
)

# Attach the scores (aligned on the index) and rank the most climate-like
# headlines first. `assign` returns a new frame, so the frame produced by the
# loading cell is not mutated in place.
news_data = news_data.assign(cosine_similarity=news_cosine_similarity).sort_values(
    by="cosine_similarity", ascending=False
)

# Keep only headlines scoring strictly above the cutoff. Despite its name,
# `percent_threshold` is a raw cosine-similarity value, not a percentage.
# NOTE(review): in the saved outputs of this notebook the scores only span
# roughly 0.76-0.85, so a fixed 0.80 cutoff separates topics weakly -- consider
# validating the threshold against labelled examples.
percent_threshold = 0.80
filtered_news_data = news_data[news_data["cosine_similarity"] > percent_threshold]

# Save the filtered headlines to a parquet file.
path_to_filtered_news_data = "../data/processed/filtered_news_data.parquet"
filtered_news_data.to_parquet(path_to_filtered_news_data)

In [43]:
# Ten highest-scoring headlines (news_data is sorted by descending similarity).
news_data.head(n=10)

Unnamed: 0,embedding,headline,cosine_similarity
15,"[-0.02671368420124054, -0.022069035097956657, ...",blizzard buries united states in bills,0.848569
2,"[-0.011609194800257683, -0.013824152760207653,...",a g calls for infrastructure protection summit,0.843296
78,"[0.001269091502763331, -0.027443889528512955, ...",irrigators vote over river management,0.841051
94,"[-0.013447894714772701, -0.029396116733551025,...",mayor warns landfill protesters,0.841018
86,"[-0.024383384734392166, -0.018253330141305923,...",low demand forces air service cuts,0.839814
34,"[-0.013010312803089619, -0.03315261751413345, ...",dargo fire threat expected to rise,0.839586
20,"[-0.027825141325592995, -0.02329331636428833, ...",businesses should prepare for terrorist attacks,0.839297
27,"[0.005911706481128931, -0.005662757437676191, ...",community urged to help homeless youth,0.835336
33,"[-0.0019284780137240887, -0.006976149044930935...",crean tells alp leadership critics to shut up,0.834085
14,"[-0.0056263599544763565, -0.000265402719378471...",big plan to boost paroo water supplies,0.833047


In [44]:
# Ten lowest-scoring headlines (news_data is sorted by descending similarity).
news_data.tail(n=10)

Unnamed: 0,embedding,headline,cosine_similarity
54,"[0.018580356612801552, -0.010553411208093166, ...",funds to go to cadell upgrade,0.778914
42,"[-0.0076059908606112, 0.002738699084147811, -0...",dog mauls 18 month old toddler in nsw,0.778043
57,"[0.009905742481350899, -0.0002725137455854565,...",gilchrist backs rest policy,0.77656
81,"[-0.009763807989656925, -0.01952761597931385, ...",juvenile sex offenders unlikely to reoffend as,0.776011
21,"[-0.018181266263127327, 0.00452771270647645, -...",calleri avenges final defeat to eliminate massu,0.775672
67,"[-0.012805283069610596, 0.001296333153732121, ...",hanson should go back where she came from nsw mp,0.775463
44,"[-0.0004975227639079094, 0.005590331275016069,...",england change three for wales match,0.768009
7,"[-0.034662555903196335, 0.0029939073137938976,...",aussie qualifier stosur wastes four memphis match,0.765872
11,"[-0.011968997307121754, -0.014409145340323448,...",barca take record as robson celebrates birthda...,0.765646
26,"[-0.04058630391955376, 0.009056813083589077, 0...",commonwealth bank cuts fixed home loan rates,0.759672
