In [None]:
import pandas as pd

In [None]:
# Dataset reference:
# https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset
from sklearn.datasets import fetch_20newsgroups

# `remove` only recognizes 'headers', 'footers' and 'quotes'. The previous
# value also listed 'filenames', which is not one of those options and was
# silently ignored, so only the headers were ever stripped. Listing just
# 'headers' keeps the loaded data identical while making that explicit.
# Add 'footers' and/or 'quotes' here if more metadata should be stripped.
bunch = fetch_20newsgroups(remove=('headers',))

print(type(bunch), bunch.keys())
# Expected: a sklearn Bunch whose keys are
# 'data', 'filenames', 'target_names', 'target', 'DESCR'

In [None]:
# One row per post: the raw text plus its integer class id.
df = pd.DataFrame({"text": bunch.data, "target": bunch.target})

# Translate each class id into its newsgroup name. Mapping over the single
# column avoids building a Series object for every row, which is what
# DataFrame.apply(axis=1) does.
df["label"] = df["target"].map(lambda target_id: bunch.target_names[target_id])

# Keep only whitespace-separated tokens that are purely alphanumeric; a token
# with any punctuation attached (e.g. "sites?") is dropped entirely.
df["clean"] = df["text"].map(
    lambda text: ' '.join(word for word in text.split() if word.isalnum())
)

In [None]:
# Show the raw text of the first post (column first, then position).
df["text"].iloc[0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting with English stop words removed and the vocabulary capped
# at 10,000 terms.
vec = TfidfVectorizer(
    stop_words="english",
    max_features=10_000,
)

# Learn the vocabulary and weights from the cleaned posts and vectorize them
# in the same call.
features = vec.fit_transform(df["clean"])

print(features.shape)  # (11314, 10000)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the TF-IDF rows using cosine distance;
# n_jobs=-1 asks scikit-learn to use all available cores for queries.
knn = NearestNeighbors(
    metric='cosine',
    n_neighbors=5,
    n_jobs=-1,
).fit(features)

# fit() returns the estimator itself; display it as the cell output.
knn

In [None]:
# Free-text queries to look up in the fitted index.
input_texts = ["any recommendations for good ftp sites?", "i need to buy a new car"]

# Vectorize the queries with the already-fitted vectorizer so they share the
# vocabulary of the indexed posts.
input_features = vec.transform(input_texts)

# D holds the distances and N the row indices of the 3 closest posts per query
# (this overrides the n_neighbors the index was constructed with).
D, N = knn.kneighbors(input_features, n_neighbors=3, return_distance=True)

for query_pos, query in enumerate(input_texts):
    print("Input text = ", query[:200], "\n")
    for rank in range(len(N[query_pos])):
        hit = N[query_pos][rank]
        # Resolve the neighbour's class id to its newsgroup name.
        hit_label = bunch.target_names[bunch.target[hit]]
        print("Distance = ", D[query_pos][rank], "Neighbor idx = ", hit, "Label = ", hit_label)
        # Preview only the first 200 characters of the matched post.
        print(bunch.data[hit][:200])
        print("-"*100)
    print("="*100)
    print()

In [None]:
# Show the full raw text of the post at position 2529.
# NOTE(review): presumably an index taken from the neighbour output above - confirm.
df["text"].iloc[2529]