In [1]:
import numpy as np
import nltk 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

In [2]:
# data

# Question/answer corpus, read from a JSON file given by relative path.
# Later cells use its 'question' and 'answer' columns.
nltk_raw_data = pd.read_json("nlp_qa_corpus.json")

# spaCy English pipeline; lemmatize_text() runs text through it.
nlp = spacy.load("en_core_web_sm")

In [3]:
# stopword removal
# Stopword sets already loaded, keyed by language, so that mapping
# remove_stopwords over a whole corpus reads each NLTK list only once.
_STOPWORD_CACHE = {}


def _stopword_set(language):
    """Return the NLTK stopword list for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(text, language="english"):
    """Drop stopwords and question marks from `text`.

    The text is tokenized with NLTK's ``word_tokenize``. A token is discarded
    when its lowercased form is in the stopword list for `language`, or when
    it is exactly "?". Surviving tokens keep their original casing.

    Args:
        text: The string to filter.
        language: Name of the NLTK stopword list to use (default "english").

    Returns:
        The remaining tokens joined by single spaces; "" if none remain.
    """
    stop_words = _stopword_set(language)
    kept_tokens = [
        token
        for token in word_tokenize(text)
        if token != "?" and token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

# data = [remove_stopwords(x) for x in qa_pairs.keys()]
def lemmatize_text(text):
    """Lemmatize `text` with the spaCy pipeline `nlp`.

    Returns the ``lemma_`` of every token in the parsed document, joined by
    single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

# Shared WordNet lemmatizer instance; lemmatize_text_nltk() calls it with the
# WordNet POS produced by the get_wordnet_pos() helper.
lemmatizer = WordNetLemmatizer()

# Map POS tags from nltk to wordnet format
def get_wordnet_pos(nltk_pos_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the leading letter of the tag matters: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if nltk_pos_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_text_nltk(text):
    """Lemmatize `text` with NLTK, using each token's POS tag as a hint.

    The text is tokenized, POS-tagged, and every token is passed to the
    module-level WordNet `lemmatizer` together with the WordNet POS derived
    from its tag by get_wordnet_pos(). Returns the lemmas joined by single
    spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# Preprocess every corpus question: strip stopwords, then lemmatize.
data = [
    lemmatize_text_nltk(remove_stopwords(raw_question))
    for raw_question in nltk_raw_data['question']
]
data

['question answer NLP',
 'Natural Language Generation ( NLG )',
 'dependency parse',
 'TF-IDF',
 'corpus NLP',
 'machine translation',
 'semantic analysis',
 'n-gram',
 'knowledge graph',
 'vector space model',
 'BLEU score',
 'semantic network',
 'dialogue system',
 'regular expression NLP',
 'name entity recognition ( NER )',
 'chatbot',
 'GPT',
 'sentiment analysis',
 'knowledge graph',
 'language model evaluation',
 'GPT',
 'corpus NLP',
 'TF-IDF',
 'GPT',
 'n-gram',
 'ROUGE score',
 'distributional semantics',
 'topic modeling',
 'paraphrase',
 'knowledge graph',
 'Natural Language Processing ( NLP )',
 'few-shot learning NLP',
 'zero-shot learning NLP',
 'transfer learn NLP',
 'knowledge graph',
 'text generation',
 'text summarization',
 'context window',
 'language model evaluation',
 'word embeddings',
 'Natural Language Understanding ( NLU )',
 'word sense disambiguation',
 'question answer NLP',
 'n-gram',
 'NLP',
 'name entity recognition ( NER )',
 'n-gram',
 'zero-shot le

In [4]:
# TF-IDF vectorizer constructed with its default settings.
vectorizer =  TfidfVectorizer()
# Preprocessed question strings (stopwords removed, lemmatized).
questions = data
# Raw answers from the same frame the questions came from, so position i in
# `questions` and position i in `answers` belong to the same corpus row.
answers = nltk_raw_data['answer']


In [None]:
# Fit the TF-IDF vocabulary on the preprocessed corpus questions. Row i of the
# matrix corresponds to questions[i], and therefore to the i-th answer.
tdidf_matrix = vectorizer.fit_transform(questions)


def answer_question(question, threshold=0.2):
    """Return the stored answer whose question is most similar to `question`.

    The query is preprocessed exactly like the corpus questions (stopword
    removal followed by NLTK lemmatization) so that its tokens line up with
    the fitted TF-IDF vocabulary. It is then compared against every corpus
    question by cosine similarity.

    Args:
        question: The user's question as a string.
        threshold: Similarity the best match must exceed (strictly) for its
            answer to be returned. Defaults to 0.2.

    Returns:
        The answer at the position of the best-matching corpus question, or
        the fallback string "I do not know the answer to that question" when
        no match scores above `threshold` (this includes queries that share
        no vocabulary with the corpus, whose similarities are all zero).
    """
    # Mirror the corpus pipeline; without the lemmatization step inflected
    # query words would not match the lemmatized corpus vocabulary.
    query = lemmatize_text_nltk(remove_stopwords(question))
    user_vector = vectorizer.transform([query])

    # Shape (1, n_questions): one row for the single query.
    similarities = cosine_similarity(user_vector, tdidf_matrix)

    best_match_id = int(np.argmax(similarities[0]))
    if similarities[0, best_match_id] > threshold:
        # best_match_id is a row position, so index positionally rather than
        # by label; this stays correct even if the Series index is not 0..n-1.
        return answers.iloc[best_match_id]
    return "I do not know the answer to that question"

In [8]:
# Example query: a bare acronym rather than a full question.
answer_question("POS")

'POS tagging is the process of assigning grammatical tags (e.g., noun, verb, adjective) to each word in a text.'