In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
lemmatizer = WordNetLemmatizer()

In [3]:
def lemma_me(sentance:str):
    sentance_tokens = nltk.word_tokenize(sentance.lower())
    pos_tags = nltk.pos_tag(sentance_tokens)
    sentance_lemmas = []
    for token,pos_tag in zip(sentance_tokens,pos_tags):
        if pos_tag[1][0].lower() in ['n','v','o','r']:
            lemma = lemmatizer.lemmatize(token,pos_tag[1][0].lower())
            sentance_lemmas.append(lemma)
    return sentance_lemmas

In [4]:
text = 'Originally, vegetables were collected from the wild by hunter-gatherers. Vegetables are all plants. Vegetables can be eaten either raw or cooked.'
question = 'What Are vegetables'
sentance_tokens = nltk.sent_tokenize(text)
sentance_tokens.append(question)

In [5]:
tv = TfidfVectorizer(tokenizer=lemma_me)

In [6]:
tf = tv.fit_transform(sentance_tokens)
tf

<4x8 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [7]:
tf.toarray()

array([[0.27717414, 0.53114624, 0.        , 0.        , 0.53114624,
        0.53114624, 0.        , 0.27717414],
       [0.41988018, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.8046125 , 0.41988018],
       [0.32713399, 0.        , 0.62688384, 0.62688384, 0.        ,
        0.        , 0.        , 0.32713399],
       [0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678]])

In [8]:
value = cosine_similarity(tf[-1],tf[:-1])
value

array([[0.39198343, 0.59380024, 0.46263733]])

In [9]:
index = value.argsort()[0][-1]
index

1

In [10]:
values_flat = value.flatten()
values_flat


array([0.39198343, 0.59380024, 0.46263733])

In [11]:
values_flat.sort()
values_flat

array([0.39198343, 0.46263733, 0.59380024])

In [12]:
coeff = values_flat[-1]
coeff

0.593800244493221

In [13]:
if coeff > .3:
    print(sentance_tokens[index])
else:
    print("No matched sentance")

Vegetables are all plants.
