# Endpoint to fetch knn articles based on NLP query

## Creating testing functionality

In [83]:
# Build a small labelled fixture and fit the kNN models used by the cells below.
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Keep only the first 20 rows of the embedded climate articles.
# Columns: date, url, headline, embedding
test_data = pd.read_parquet("../data/processed/climateWithEmbeds.parquet").head(20)

# Label a row "a" when its headline contains the exact, case-sensitive phrase
# "Climate Change"; every other headline is labelled "b".
test_data["label"] = [
    "a" if "Climate Change" in headline else "b"
    for headline in test_data["headline"]
]

# One sub-frame per label.
test_data_a = test_data.loc[test_data["label"].eq("a")]
test_data_b = test_data.loc[test_data["label"].eq("b")]

# Embedding lists, each in the same row order as the frame it was taken from,
# so a position returned by a model can be mapped back with .iloc on that frame.
embeddings = test_data["embedding"].tolist()
embeddings_a = test_data_a["embedding"].tolist()
embeddings_b = test_data_b["embedding"].tolist()

# One model per frame. n_neighbors here is only the default used when
# kneighbors() is called without an explicit count.
nbrs_general = NearestNeighbors(algorithm="kd_tree", n_neighbors=20).fit(embeddings)

nbrs_a = NearestNeighbors(algorithm="kd_tree", n_neighbors=3).fit(embeddings_a)

nbrs_b = NearestNeighbors(algorithm="kd_tree", n_neighbors=3).fit(embeddings_b)

In [84]:
# Inspect the row at position 5; its embedding is the query vector used in the cells below.
test_data.iloc[5]

date                                       2020-07-02 03:48:26
url          https://www.nytimes.com/2020/07/01/climate/tru...
headline     Inquiry Prompted by Trump’s Hurricane Dorian C...
embedding    [0.5141559839248657, -0.18467377126216888, -0....
label                                                        b
Name: 5, dtype: object

In [85]:
# Query the unfiltered model with the embedding of row 5 itself.
distances, indices = nbrs_general.kneighbors([embeddings[5]])

# kneighbors returns positions within the data the model was fitted on
# (the embeddings of all of test_data), so map them back to rows with .iloc.
test_data.iloc[indices[0]]

Unnamed: 0,date,url,headline,embedding,label
5,2020-07-02 03:48:26,https://www.nytimes.com/2020/07/01/climate/tru...,Inquiry Prompted by Trump’s Hurricane Dorian C...,"[0.5141559839248657, -0.18467377126216888, -0....",b
3,2021-11-15 19:51:12,https://www.nytimes.com/2021/11/15/climate/gen...,"In the Fight Against Climate Change, Young Voi...","[0.029055556282401085, -0.034522220492362976, ...",a
2,2019-04-12 09:00:08,https://www.nytimes.com/2019/04/12/books/revie...,Two New Books Dramatically Capture the Climate...,"[0.403890460729599, 0.19466587901115417, 0.065...",a
9,2018-01-23 14:36:36,https://www.nytimes.com/2018/01/23/business/ec...,Fighting Climate Change? We’re Not Even Landin...,"[0.4025600254535675, -0.2381201535463333, -0.0...",a
12,2020-06-20 20:00:18,https://www.nytimes.com/2020/06/20/us/poor-peo...,"From Policing to Climate Change, a Sweeping Ca...","[0.2741101086139679, 0.07526188343763351, -0.2...",a
17,2019-12-20 13:08:19,https://www.nytimes.com/2019/12/20/climate/net...,"In ‘Strongest’ Climate Ruling Yet, Dutch Court...","[0.21071168780326843, -0.259256511926651, -0.2...",b
7,2019-05-30 10:00:06,https://www.nytimes.com/2019/05/30/opinion/cli...,"Opinion | To Make Headway on Climate Change, L...","[0.0370464101433754, 0.2117423117160797, -0.09...",a
11,2021-09-07 20:25:20,https://www.nytimes.com/2021/09/07/climate/cli...,Medical Journals Call Climate Change the ‘Grea...,"[0.23736611008644104, 0.32593369483947754, 0.0...",a
10,2021-06-10 13:01:59,https://www.nytimes.com/2021/06/10/climate/bio...,Our Response to Climate Change Is Missing Some...,"[0.5517584085464478, 0.12411408126354218, 0.12...",a
16,2021-10-07 20:53:57,https://www.nytimes.com/2021/10/07/technology/...,Google Stops Serving Ads on Climate Change Mis...,"[0.2569691240787506, -0.06351375579833984, 0.1...",a


In [86]:
# Same query vector, this time against the model fitted only on the label "a" rows.
distances, indices = nbrs_a.kneighbors([embeddings[5]])

# The positions refer to test_data_a (the frame nbrs_a was fitted on), not to
# test_data, so the lookup must use test_data_a.
test_data_a.iloc[indices[0]]

Unnamed: 0,date,url,headline,embedding,label
3,2021-11-15 19:51:12,https://www.nytimes.com/2021/11/15/climate/gen...,"In the Fight Against Climate Change, Young Voi...","[0.029055556282401085, -0.034522220492362976, ...",a
2,2019-04-12 09:00:08,https://www.nytimes.com/2019/04/12/books/revie...,Two New Books Dramatically Capture the Climate...,"[0.403890460729599, 0.19466587901115417, 0.065...",a
9,2018-01-23 14:36:36,https://www.nytimes.com/2018/01/23/business/ec...,Fighting Climate Change? We’re Not Even Landin...,"[0.4025600254535675, -0.2381201535463333, -0.0...",a


In [87]:
# Same query vector, this time against the model fitted only on the label "b" rows.
distances, indices = nbrs_b.kneighbors([embeddings[5]])

# The positions refer to test_data_b (the frame nbrs_b was fitted on), not to
# test_data, so the lookup must use test_data_b.
test_data_b.iloc[indices[0]]

Unnamed: 0,date,url,headline,embedding,label
5,2020-07-02 03:48:26,https://www.nytimes.com/2020/07/01/climate/tru...,Inquiry Prompted by Trump’s Hurricane Dorian C...,"[0.5141559839248657, -0.18467377126216888, -0....",b
17,2019-12-20 13:08:19,https://www.nytimes.com/2019/12/20/climate/net...,"In ‘Strongest’ Climate Ruling Yet, Dutch Court...","[0.21071168780326843, -0.259256511926651, -0.2...",b
19,2021-12-13 22:31:24,https://www.nytimes.com/interactive/2021/12/13...,The Antarctic Is Signaling Big Climate Trouble.,"[0.23168832063674927, 0.456959992647171, -0.04...",b


## Creating Endpoint

In [88]:
def find_knn(knn_model, input_embedding, n_neighbors, filter=None, data=None):
    """Look up the rows nearest to ``input_embedding`` using a fitted kNN model.

    ``knn_model.kneighbors`` returns row *positions* within the data the model
    was fitted on. Those positions are only meaningful if the model was fitted
    on exactly the rows selected here (same rows, same order); otherwise they
    point at the wrong rows or fall outside the frame.

    Parameters:
    knn_model: fitted model exposing ``kneighbors(X, n_neighbors=...)``. It must
        have been fitted on the embeddings of the rows selected by ``filter``.
    input_embedding: the embedding to find the nearest neighbors for. Must be
        preprocessed with the same method as the embeddings the model was fitted on.
    n_neighbors: number of neighbors to return.
    filter: label value to restrict the data set to (rows whose "label" column
        equals it). If None, no filter is applied. The name shadows the builtin
        ``filter``; it is kept so existing keyword callers continue to work.
    data: dataframe to look the neighbors up in. If None, the module-level
        ``test_data`` frame is used, which is the original behavior.

    Returns:
    A dataframe with the nearest neighbors, in the order the model returns them.

    Raises:
    ValueError: if the model reports (via ``n_samples_fit_``) a different number
        of fitted samples than there are rows selected, i.e. the model does not
        match the filter.
    """
    data_set = test_data if data is None else data

    if filter is not None:
        # restrict the lookup frame to the requested label
        data_set = data_set[data_set["label"] == filter]

    # Fail loudly on a model/filter mismatch instead of returning unrelated rows
    # or an opaque out-of-bounds IndexError. Models that do not expose
    # n_samples_fit_ skip the check.
    fitted_rows = getattr(knn_model, "n_samples_fit_", None)
    if fitted_rows is not None and fitted_rows != len(data_set):
        raise ValueError(
            f"knn_model was fitted on {fitted_rows} rows but the selected data set "
            f"has {len(data_set)} rows (filter={filter!r}); the model does not "
            "match the filter"
        )

    # kneighbors takes a batch of queries; wrap the single embedding in a list
    _, indices = knn_model.kneighbors([input_embedding], n_neighbors=n_neighbors)

    # indices[0] holds the positions for the single query
    return data_set.iloc[indices[0]]

## Testing Endpoint

In [89]:
# No filter with the unfiltered model: should reproduce the first three rows of the
# direct nbrs_general query above.
find_knn(nbrs_general, embeddings[5], 3)

Unnamed: 0,date,url,headline,embedding,label
5,2020-07-02 03:48:26,https://www.nytimes.com/2020/07/01/climate/tru...,Inquiry Prompted by Trump’s Hurricane Dorian C...,"[0.5141559839248657, -0.18467377126216888, -0....",b
3,2021-11-15 19:51:12,https://www.nytimes.com/2021/11/15/climate/gen...,"In the Fight Against Climate Change, Young Voi...","[0.029055556282401085, -0.034522220492362976, ...",a
2,2019-04-12 09:00:08,https://www.nytimes.com/2019/04/12/books/revie...,Two New Books Dramatically Capture the Climate...,"[0.403890460729599, 0.19466587901115417, 0.065...",a


In [90]:
# Filter "a" paired with the model fitted on the label "a" rows: should match the
# direct nbrs_a query above.
find_knn(nbrs_a, embeddings[5], 3, "a")

Unnamed: 0,date,url,headline,embedding,label
3,2021-11-15 19:51:12,https://www.nytimes.com/2021/11/15/climate/gen...,"In the Fight Against Climate Change, Young Voi...","[0.029055556282401085, -0.034522220492362976, ...",a
2,2019-04-12 09:00:08,https://www.nytimes.com/2019/04/12/books/revie...,Two New Books Dramatically Capture the Climate...,"[0.403890460729599, 0.19466587901115417, 0.065...",a
9,2018-01-23 14:36:36,https://www.nytimes.com/2018/01/23/business/ec...,Fighting Climate Change? We’re Not Even Landin...,"[0.4025600254535675, -0.2381201535463333, -0.0...",a


In [91]:
# Filter "b" paired with the model fitted on the label "b" rows: should match the
# direct nbrs_b query above.
find_knn(nbrs_b, embeddings[5], 3, "b")

Unnamed: 0,date,url,headline,embedding,label
5,2020-07-02 03:48:26,https://www.nytimes.com/2020/07/01/climate/tru...,Inquiry Prompted by Trump’s Hurricane Dorian C...,"[0.5141559839248657, -0.18467377126216888, -0....",b
17,2019-12-20 13:08:19,https://www.nytimes.com/2019/12/20/climate/net...,"In ‘Strongest’ Climate Ruling Yet, Dutch Court...","[0.21071168780326843, -0.259256511926651, -0.2...",b
19,2021-12-13 22:31:24,https://www.nytimes.com/interactive/2021/12/13...,The Antarctic Is Signaling Big Climate Trouble.,"[0.23168832063674927, 0.456959992647171, -0.04...",b
