# KNN for looking up relevant news articles

## Imports

In [226]:
import numpy as np
import pandas as pd

## Load and Preprocess data

In [227]:
# parquet file that stores the headlines together with their embeddings
path_to_data = "../data/intermediate/headlines_with_embeddings.parquet"

# load the whole table into a DataFrame
data = pd.read_parquet(path=path_to_data)

In [228]:
# The first column of `data` is read as the headline text and the last column
# as the embedding; each column is turned into a numpy array by way of a
# plain Python list.
headlines = np.array(data.iloc[:, 0].tolist())
embeddings = np.array(data.iloc[:, -1].tolist())

## KNN

In [229]:
from sklearn.neighbors import NearestNeighbors

In [230]:
# Build a kd-tree index over every embedding. k is 11 rather than 10 because
# the 10 neighbors wanted are the non-self ones, leaving one slot for the
# query point itself.
nbrs = NearestNeighbors(n_neighbors=11, algorithm="kd_tree")
nbrs.fit(embeddings)

# neighbor distances and row indices for every stored headline
distances, indices = nbrs.kneighbors(embeddings)

## Test functionality

In [231]:
# use the last stored headline, paired with its embedding, as the query
test_headline = (headlines[-1], embeddings[-1])

# query the fitted index with a single-row batch
distances, indices = nbrs.kneighbors([test_headline[1]])

# show the query headline followed by a separator
print(test_headline[0], "\n ------------------ \n")

# list every hit except the top-ranked one (expected to be the query itself)
for i in indices[0]:
    if i != indices[0][0]:
        print(headlines[i])

Opinion | Playing Dumb on Climate Change 
 ------------------ 

Opinion | Climate Change in Trump’s Age of Ignorance
Opinion | To Make Headway on Climate Change, Let’s Change the Subject
Opinion | Climate Change and the Exodus of Species
Opinion | Can Geoegineering Fix Climate Change?
Opinion | China and India Make Big Strides on Climate Change
Losing Earth: The Decade We Almost Stopped Climate Change
In the Fight Against Climate Change, Young Voices Speak Out
Our Response to Climate Change Is Missing Something Big, Scientists Say
Industry Awakens to Threat of Climate Change
The Effects of Climate Change


## Function to return the top 10 non-self headlines for an input embedding

In [232]:
def fetch_KNN(input_headline, input_embedding):
    """Return the stored headlines closest to ``input_embedding``.

    Uses the notebook-level ``nbrs`` (fitted nearest-neighbor index),
    ``headlines`` (array of headline strings) and ``data`` (source DataFrame,
    whose columns at positions 1 and 2 are read as url and date).

    Parameters
    ----------
    input_headline : str
        Headline text of the query. It is echoed back in the result and used
        to filter the query itself out of the neighbor list.
    input_embedding : array-like
        Embedding vector of the query; passed to ``nbrs.kneighbors`` as a
        single-row batch.

    Returns
    -------
    dict
        ``{"headline": input_headline, "neighbors": [...]}`` where each
        neighbor is a dict with ``headline``, ``distance``, ``url`` and
        ``date`` keys, ordered as returned by ``nbrs.kneighbors``.
    """
    distances, indices = nbrs.kneighbors([input_embedding])

    # One of the k slots is reserved for the query itself, so at most k - 1
    # neighbors are reported even when the query is not part of the index.
    max_neighbors = len(indices[0]) - 1

    neighbors = [
        {
            "headline": headlines[i],
            # plain float so the payload stays JSON-serializable
            "distance": float(dist),
            # single positional lookup; avoids integer [] on a row Series
            "url": data.iloc[i, 1],
            "date": data.iloc[i, 2],
        }
        for dist, i in zip(distances[0], indices[0])
        # drop the query itself wherever it sits in the index, rather than
        # assuming it is always the last stored row
        if headlines[i] != input_headline
    ]

    return {
        "headline": input_headline,
        "neighbors": neighbors[:max_neighbors],
    }

In [233]:
# test_headline is a (headline, embedding) pair, so unpack it into the call
fetch_KNN(*test_headline)

{'headline': 'Opinion | Playing Dumb on Climate Change',
 'neighbors': [{'headline': 'Opinion | Climate Change in Trump’s Age of Ignorance',
   'distance': 0.4608647524748748,
   'url': 'https://www.nytimes.com/2016/11/20/opinion/climate-change-in-trumps-age-of-ignorance.html',
   'date': '2016-11-19 20:44:05'},
  {'headline': 'Opinion | To Make Headway on Climate Change, Let’s Change the Subject',
   'distance': 0.4948449900578245,
   'url': 'https://www.nytimes.com/2019/05/30/opinion/climate-elections-democrats.html',
   'date': '2019-05-30 10:00:06'},
  {'headline': 'Opinion | Climate Change and the Exodus of Species',
   'distance': 0.5306008554761565,
   'url': 'https://www.nytimes.com/2011/09/27/opinion/climate-change-and-the-exodus-of-species.html',
   'date': '2011-09-27 01:35:12'},
  {'headline': 'Opinion | Can Geoegineering Fix Climate Change?',
   'distance': 0.5325209060551198,
   'url': 'https://www.nytimes.com/2021/10/01/opinion/climate-change-geoengineering.html',
   'da