In [1]:
import nltk
nltk.download('punkt', quiet = True)
nltk.download('wordnet', quiet = True)
nltk.download('stopwords',quiet = True)
nltk.download('punkt_tab',quiet = True)

True

In [2]:
import nltk
import string  
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer  #Converts text into TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity  #Computes similarity between user input and FAQs

In [3]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))  # Initialize stop words

In [4]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenize text
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word.isalpha()]  # Lemmatize and remove stopwords
    return " ".join(words)

In [5]:
def wrangle(filepath):
    df = pd.read_csv(filepath)
    
    #Drop unnecessary columns
    df = df.drop(columns = ["ID",
                        "Topic",
                       "Subtopic"
                      ])
    df["Cleaned_questions"] = df["Question"].apply(clean_text)
    
    return df

In [6]:
df = wrangle("C:/Users/HomePC/OneDrive/Desktop/chatbot/Tesco_ grocery_FAQ'S.csv")
df.head()

Unnamed: 0,Question,Answer,Cleaned_questions
0,Where Tesco delivers to,We deliver to most UK residential addresses. T...,tesco delivers
1,Delivery and Click+Collect prices,"The standard delivery charge is between £3–£7,...",delivery clickcollect price
2,Minimum order value,A £5 minimum basket charge will be added to de...,minimum order value
3,Returning an item,Please see our returns policy.,returning item
4,Slot times and options,You can choose to get your shopping delivered ...,slot time option


In [None]:
print(df.columns)

Index(['Question', 'Answer', 'Cleaned_questions'], dtype='object')


In [7]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

In [12]:
def get_response(user_query):
    """Find the most relevant FAQ answer based on cosine similarity."""
    user_query = user_query.lower()  # Normalize text
    
    # Greetings
    if user_query in ["hello", "hi"]:
        return "Hello! How can I assist you today?"
    if user_query in ["bye", "goodbye"]:
        return "Goodbye! Have a great day!"
    
    # Ensure tfidf_matrix is defined
    global tfidf_matrix
    if 'tfidf_matrix' not in globals():
        tfidf_matrix = vectorizer.fit_transform(df["Cleaned_questions"])
    
    user_query = clean_text(user_query)  # Clean user input
    user_tfidf = vectorizer.transform([user_query])  # Transform user input using TF-IDF
    similarities = cosine_similarity(user_tfidf, tfidf_matrix)  # Compute similarity
    best_match_idx = np.argmax(similarities)  # Get the index of the best match
    best_match_score = similarities[0, best_match_idx] 


    if best_match_score < 0.3:  # Adjust threshold as needed
        return "I'm sorry, I didn't understand. Can you rephrase?"
          
    return df.iloc[best_match_idx]["Answer"]  # Return the corresponding answer

In [13]:
# Fit TF-IDF vectorizer before entering loop
tfidf_matrix = vectorizer.fit_transform(df["Cleaned_questions"])

# Example usage
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    print(f"Chatbot: {get_response(user_input)}")


Chatbot: Hello! How can I assist you today?
Chatbot: Hello! How can I assist you today?
Chatbot: I'm sorry, I didn't understand. Can you rephrase?
Chatbot: I'm sorry, I didn't understand. Can you rephrase?


KeyboardInterrupt: Interrupted by user