# Creating five KNN models on news-headline embeddings (general = all sources, plus one each for climateDB, Nature, ABC, NewsAPI)

## Imports

In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.neighbors import NearestNeighbors

## Load Data

In [2]:
# Consolidated headline table; each row carries its source label and an
# embedding vector (see the preview below).
NEWS_PARQUET = "../data/processed/news-consolidated-v2.parquet"
total_data = pd.read_parquet(NEWS_PARQUET)

total_data.head()

Unnamed: 0,id,source,date,headline,embedding,url,sentiment_score,sentiment_label
0,0,abc,2003-02-19,a g calls for infrastructure protection summit,"[0.42550426721572876, 0.5782315135002136, 0.09...",,0.766538,neutral
1,1,abc,2003-02-19,epa still trying to recover chemical clean up ...,"[0.33238619565963745, -0.3517177700996399, 0.5...",,0.56363,negative
2,2,abc,2003-02-19,expressions of interest sought to build livestock,"[0.4847770035266876, 0.10000099241733551, -0.0...",,0.843926,neutral
3,3,abc,2003-02-19,iraq to pay for own rebuilding white house,"[0.4847399592399597, 0.20435450971126556, 0.19...",,0.762468,neutral
4,4,abc,2003-02-19,meeting to focus on broken hill water woes,"[0.3507457375526428, 0.43837735056877136, -0.0...",,0.720201,neutral


## Creating KNN Models

In [3]:
# "General" model: index the embeddings of every row, regardless of source.
# NOTE(review): kd_tree loses its advantage over brute force as dimensionality
# grows — confirm the embedding width makes it the right choice here.
all_embeddings = total_data["embedding"].tolist()
nbrs_general = NearestNeighbors(algorithm="kd_tree", n_neighbors=20).fit(all_embeddings)

In [11]:
def fit_source_knn(data, source, n_neighbors=20, algorithm="kd_tree"):
    """Fit a NearestNeighbors index on the embeddings of one news source.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "source" column and an "embedding" column.
    source : str
        Value of the "source" column that selects the rows to index.
    n_neighbors : int, default 20
        Default number of neighbours returned by ``kneighbors``.
    algorithm : str, default "kd_tree"
        Search structure forwarded to ``NearestNeighbors``.

    Returns
    -------
    NearestNeighbors
        The fitted model. Indices it returns are positions within the
        filtered subset (use ``.iloc`` on that subset), not labels of ``data``.

    Raises
    ------
    ValueError
        If no row of ``data`` has the requested source.
    """
    embeddings = data.loc[data["source"] == source, "embedding"].tolist()
    if not embeddings:
        # Fail with a message that names the offending label instead of
        # letting sklearn complain about the shape of an empty array.
        raise ValueError(
            f"no rows with source == {source!r}; cannot fit a KNN model"
        )
    return NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm).fit(
        embeddings
    )


# One model per source; each only ever returns neighbours from its own source.
nbrs_climateDB = fit_source_knn(total_data, "climate-db")
nbrs_abcNews = fit_source_knn(total_data, "abc")
nbrs_natureNews = fit_source_knn(total_data, "nature")
nbrs_newsapi = fit_source_knn(total_data, "news-api")

In [12]:
# Sanity check: query the Nature-only model with the embedding of the row at
# position 5 of the full table. The result is a (distances, indices) pair.
query_embedding = total_data["embedding"].iloc[5]
nbrs_natureNews.kneighbors([query_embedding])

(array([[1.23071053, 1.23568965, 1.24138558, 1.25182769, 1.25409491,
         1.25728125, 1.27589656, 1.27704022, 1.28068998, 1.2916176 ,
         1.2925867 , 1.29642266, 1.29649175, 1.30634988, 1.31094491,
         1.31439346, 1.31448637, 1.32133175, 1.32578274, 1.32972157]]),
 array([[ 566, 1475, 2870, 4435, 5108, 2927,  476, 3293, 5923, 5627, 3495,
         5183, 2655, 5543, 5813, 3556, 5689, 5401, 4786, 6554]]))

In [15]:
# The model was fitted on the "nature" rows only, so the indices it returns
# are positions within that subset — hence .iloc on the same filtered frame.
data_set = total_data[total_data["source"] == "nature"]

neighbor_result = nbrs_natureNews.kneighbors([total_data["embedding"].iloc[5]])
neighbor_positions = neighbor_result[1][0]  # indices array, first (only) query
data_set.iloc[neighbor_positions].head()

Unnamed: 0,id,source,date,headline,embedding,url,sentiment_score,sentiment_label
84181,84181,nature,2014-10-05,Deep-ocean contribution to sea level and energ...,"[0.3001623749732971, 0.2106800228357315, 0.180...",https://www.nature.com/articles/nclimate2387,0.833232,neutral
85090,85090,nature,2017-12-19,Change in Land Use and Evapotranspiration in t...,"[0.4011196196079254, 0.209755077958107, -0.020...",https://www.nature.com/articles/s41598-017-180...,0.918879,neutral
86485,86485,nature,2020-09-09,Cropland expansion in the United States produc...,"[0.13177421689033508, 0.2336922138929367, 0.12...",https://www.nature.com/articles/s41467-020-180...,0.723645,negative
88050,88050,nature,2022-04-06,The conterminous United States are projected t...,"[0.2786351144313812, -0.06172405928373337, 0.3...",https://www.nature.com/articles/s43247-022-004...,0.65603,negative
88723,88723,nature,2022-08-03,Proximity to small-scale inland and coastal fi...,"[0.3552359938621521, 0.06650374084711075, -0.1...",https://www.nature.com/articles/s43247-022-004...,0.735235,positive


## Save Models

In [16]:
# Pickle each fitted model to its own file under ../data/models.
# NOTE: pickled scikit-learn estimators should be loaded with the same
# scikit-learn version that wrote them.
MODEL_DIR = Path("../data/models")
# Create the output directory up front so a fresh checkout does not fail
# with FileNotFoundError on the first open().
MODEL_DIR.mkdir(parents=True, exist_ok=True)

models_to_save = {
    "knn-general.pkl": nbrs_general,
    "knn-climateDB.pkl": nbrs_climateDB,
    "knn-abcNews.pkl": nbrs_abcNews,
    "knn-natureNews.pkl": nbrs_natureNews,
    "knn-newsapi.pkl": nbrs_newsapi,
}

for file_name, model in models_to_save.items():
    with open(MODEL_DIR / file_name, "wb") as f:
        pickle.dump(model, f)