# Chatbot (Cosine Similarity)
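This notebook implements the cosine-similarity portion of the chatbot: a user question is matched against a list of known questions using TF-IDF vectors, and the answer paired with the closest match is returned.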

In [19]:
import pandas as pd
# Load the question/answer pairs; later cells read the 'Questions' and 'Answers' columns.
# The path is relative, so test.csv must be in the notebook's working directory.
df = pd.read_csv("test.csv")

In [20]:
# Questions as a plain list; get_response() pairs each entry with answers_list by position.
questions_list = df['Questions'].tolist()

In [21]:
# Answers taken from the same DataFrame rows, so index i here matches index i in questions_list.
answers_list = df['Answers'].tolist()

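#### Imports
- `nltk` — text preprocessing: tokenization, stop-word removal, lemmatization and stemming
- `numpy` — picking the highest similarity score
- `sklearn` — TF-IDF vectorization and cosine similarity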

In [22]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages this notebook relies on (tokenizer, WordNet, stop-word list).
# NOTE(review): newer NLTK releases may also need 'punkt_tab' for word_tokenize — confirm
# against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ameen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ameen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ameen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import re

def preprocess(text):
    """Normalise ``text`` for TF-IDF matching, dropping English stop words.

    Steps, in order:
      1. delete every character that is neither a word character nor
         whitespace (so "don't" becomes "dont"),
      2. lowercase and tokenize with ``nltk.word_tokenize``,
      3. remove English stop words,
      4. lemmatize each token with ``WordNetLemmatizer``,
      5. stem each lemma with ``PorterStemmer``.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when
        no token survives.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Load the stop-word list once per call instead of once per token, and
    # keep it in a set so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation / symbols
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]
    return ' '.join(stemmed_tokens)

In [28]:
def preprocess_with_stopwords(text):
    """Normalise ``text`` the same way as ``preprocess`` but keep stop words.

    Punctuation (anything that is not a word character or whitespace) is
    deleted, the text is lowercased and tokenized with
    ``nltk.word_tokenize``, and every token is lemmatized and then stemmed.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The stems joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    porter = PorterStemmer()
    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    words = nltk.word_tokenize(cleaned)
    lemmas = list(map(wordnet.lemmatize, words))
    stems = []
    for lemma in lemmas:
        stems.append(porter.stem(lemma))
    return ' '.join(stems)

In [29]:
# Superseded draft of the vectorizer setup; the live version is in the following cell.
# vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
# X = vectorizer.fit_transform([preprocess(q) for q in questions_list])

# nltk is already imported above; this cell only adds the 'omw-1.4' data package.
# NOTE(review): presumably needed by WordNetLemmatizer on some NLTK versions — confirm.
import nltk
nltk.download("omw-1.4")


[nltk_data] Downloading package omw-1.4 to /Users/ameen/nltk_data...


True

In [30]:
# Fit TF-IDF on the stop-word-free questions; row i of X corresponds to questions_list[i].
# NOTE: X is only valid for the vocabulary fitted here. Refitting `vectorizer` afterwards
# changes its vocabulary and makes later transform() results incomparable with X.
vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
X = vectorizer.fit_transform([preprocess(q) for q in questions_list])

def get_response(text, threshold=0.6, verbose=True):
    """Return the stored answer whose question is most similar to ``text``.

    The lookup runs in two stages:

    1. ``text`` is normalised with ``preprocess`` (stop words removed) and
       compared against every known question through the module-level
       ``vectorizer`` and its matrix ``X``.
    2. Questions scoring above ``threshold`` are re-ranked with stop words
       kept (``preprocess_with_stopwords``). The re-ranking uses its own
       temporary vectorizer so the module-level ``vectorizer`` keeps the
       vocabulary that ``X`` was built with and the function can be called
       repeatedly.

    Parameters
    ----------
    text : str
        The user's question.
    threshold : float, default 0.6
        Minimum cosine similarity (exclusive) a question needs in stage 1
        to be considered a candidate.
    verbose : bool, default True
        Print the intermediate values of both stages.

    Returns
    -------
    str
        The answer paired with the best-matching question, or
        ``"I can't answer this question"`` when no question exceeds
        ``threshold``.
    """
    processed_text = preprocess(text)
    vectorized_text = vectorizer.transform([processed_text])
    similarities = cosine_similarity(vectorized_text, X)
    max_similarity = np.max(similarities)
    if verbose:
        print("processed_text:", processed_text)
        print("Similarities:", similarities)
        print("Max Similarity: ", max_similarity)

    if not max_similarity > threshold:
        return "I can't answer this question"

    # Work with row indices so each candidate question stays paired with the
    # answer from its own row, without a linear .index() search per candidate.
    candidate_rows = np.flatnonzero(similarities[0] > threshold)
    high_similarity_questions = [questions_list[i] for i in candidate_rows]
    target_answers = [answers_list[i] for i in candidate_rows]
    if verbose:
        print("High Similarity Q's: ", high_similarity_questions)
        print(target_answers)

    # Separate vectorizer for re-ranking: refitting the shared one would replace
    # its vocabulary and break every later comparison against X.
    rerank_vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
    Z = rerank_vectorizer.fit_transform(
        [preprocess_with_stopwords(q) for q in high_similarity_questions]
    )
    processed_text_with_stopwords = preprocess_with_stopwords(text)
    if verbose:
        print("Processed Text w Stopwords: ", processed_text_with_stopwords)
    vectorized_text_with_stopwords = rerank_vectorizer.transform([processed_text_with_stopwords])
    final_similarities = cosine_similarity(vectorized_text_with_stopwords, Z)
    closest = int(np.argmax(final_similarities))
    return target_answers[closest]
    
# Smoke test with a question that is expected to appear verbatim in the dataset.
get_response('What is data analytics?')

processed_text: data analyt
Similarities: [[1.         0.32491362 0.39723569 0.3100033  0.3100033  0.50650236
  0.37086918 0.37086918]]
Max Similarity:  1.0000000000000002
High Similarity Q's:  ['What is data analytics?']
['Data analytics is the process of examining raw data to discover meaningful patterns, draw conclusions, and make informed decisions. It involves collecting, transforming, and analyzing large sets of data to extract valuable insights and support decision-making.']
Processed Text w Stopwords:  what is data analyt


'Data analytics is the process of examining raw data to discover meaningful patterns, draw conclusions, and make informed decisions. It involves collecting, transforming, and analyzing large sets of data to extract valuable insights and support decision-making.'