# News Filter

## Imports

In [12]:
import pandas as pd
import numpy as np

## Load and Preprocess Data

In [13]:
# Both parquet files are expected to follow the schema
#   [id, date, headline, url, embedding]
# so the embedding is taken positionally as the last column of each frame.

# --- climate headlines ---
path_to_climate_data = "../data/processed/climateWithEmbeds.parquet"
climate_data = pd.read_parquet(path_to_climate_data)
climate_embeddings = climate_data.iloc[:, -1]

# --- general news headlines ---
path_to_news_data = "../data/processed/abcNewsWithEmbeds.parquet"
news_data = pd.read_parquet(path_to_news_data)
news_embeddings = news_data.iloc[:, -1]

In [38]:
# number of rows (headlines) in the news data
news_data.shape[0]

1243598

## Create Filtering Layer

In [44]:
# Centroid ("midpoint") of the climate embeddings: the mean over all rows of
# `climate_embeddings`.
# NOTE(review): assumes every entry is a 1-D numpy vector of equal length — confirm.
midpoint = climate_embeddings.mean(axis=0)

# The centroid's norm does not depend on the headline being scored, so it is
# computed once here rather than once per row inside the lambda below.
midpoint_norm = np.linalg.norm(midpoint)

# Cosine similarity between each news embedding and the climate centroid.
# A zero-norm embedding would produce NaN here; NaN fails the `>` comparison
# further down, so such rows are excluded from the filtered result.
news_cosine_similarity = news_embeddings.apply(
    lambda x: x.dot(midpoint) / (np.linalg.norm(x) * midpoint_norm)
)

# Attach the score to each headline and order the frame most-similar first.
# Caution: this makes `cosine_similarity` the trailing column, so positional
# access via `.iloc[:, -1]` on `news_data` no longer returns the embedding.
news_data = news_data.assign(cosine_similarity=news_cosine_similarity).sort_values(
    by="cosine_similarity", ascending=False
)

# Keep only headlines whose similarity to the climate centroid exceeds the cutoff.
# Despite its name, `percent_threshold` is compared directly against the cosine
# similarity value; it is an absolute cutoff, not a percentage of rows.
percent_threshold = 0.9

is_climate_like = news_data["cosine_similarity"] > percent_threshold
filtered_news_data = news_data[is_climate_like]

In [47]:
# fraction of all news headlines that passed the similarity filter
filtered_news_data.shape[0] / news_data.shape[0]

0.060099807172414234

In [48]:
# Persist the filtered headlines as parquet.
# The written frame carries every column of `filtered_news_data`, including the
# `cosine_similarity` score added during filtering.
path_to_filtered_news_data = "../data/processed/filteredAbcNewsWithEmbeds.parquet"
filtered_news_data.to_parquet(path_to_filtered_news_data)