In [1]:
import pandas as pd

# Read the complaints CSV into a DataFrame (point the path at your own copy if needed)
complaints_df = pd.read_csv(filepath_or_buffer="complaints.csv")

# Preview the top five rows as a quick check of the loaded columns
complaints_df.head(n=5)


Unnamed: 0,Category,Issue,Complaint
0,Water,No water supply,I want to report an issue regarding no water s...
1,Street Lights,Street light timer not set properly,I want to report an issue regarding street lig...
2,Street Lights,Flickering street lights,I want to report an issue regarding flickering...
3,Power,Frequent power cuts,I want to report an issue regarding frequent p...
4,Street Lights,Street lights too dim,I want to report an issue regarding street lig...


In [2]:
# Read the FAQs CSV into a DataFrame (point the path at your own copy if needed)
faqs_df = pd.read_csv(filepath_or_buffer="faqs_dataset.csv")

# Preview the top five rows as a quick check of the loaded columns
faqs_df.head(n=5)


Unnamed: 0,Category,Question,Answer
0,Permissions,What documents are required for building permi...,Building permissions require property document...
1,Income Tax,What do I get my tax return status?,Log into your account on the tax e-filing offi...
2,Certificates,What can I update my name on a birth certificate?,"To update a name, submit a correction request ..."
3,Permissions,Can I apply for building permissions online?,"Yes, log in to the GHMC official portal, uploa..."
4,Certificates,What can I update my name on a birth certificate?,"To update a name, submit a correction request ..."


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Fetch the tokenizer models and the stop-word list
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Kept as the cell's final expression so its return value is still displayed
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocessing call
lemmatizer = WordNetLemmatizer()

# Function to preprocess text (tokenize, remove stop words, lemmatize)
def preprocess_text(text):
    """Normalize one piece of free text.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped; the remaining tokens are lemmatized with
    the module-level ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value (such as ``None`` or a float
            ``NaN`` standing in for a missing cell) is treated as missing.

    Returns:
        The surviving lemmas joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    # A missing value has no ``.lower()`` and would abort the whole ``.apply``.
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(lemmas)

# Derive the cleaned complaint text and store it next to the raw column
processed_complaints = complaints_df['Complaint'].apply(preprocess_text)
complaints_df['Processed_Complaint'] = processed_complaints

# Preview the label columns together with the cleaned text
preview_columns = ['Category', 'Issue', 'Processed_Complaint']
complaints_df[preview_columns].head()

Unnamed: 0,Category,Issue,Processed_Complaint
0,Water,No water supply,want report issue regarding water supply probl...
1,Street Lights,Street light timer not set properly,want report issue regarding street light timer...
2,Street Lights,Flickering street lights,want report issue regarding flickering street ...
3,Power,Frequent power cuts,want report issue regarding frequent power cut...
4,Street Lights,Street lights too dim,want report issue regarding street light dim p...


In [6]:
# Map each new processed column to the raw FAQ column it is derived from
processed_faq_columns = {
    'Processed_Question': 'Question',
    'Processed_Answer': 'Answer',
}
for processed_name, raw_name in processed_faq_columns.items():
    faqs_df[processed_name] = faqs_df[raw_name].apply(preprocess_text)

# Preview the category alongside the cleaned question and answer text
faqs_df[['Category', *processed_faq_columns]].head()


Unnamed: 0,Category,Processed_Question,Processed_Answer
0,Permissions,document required building permission,building permission require property document ...
1,Income Tax,get tax return status,log account tax official portal check tax retu...
2,Certificates,update name birth certificate,update name submit correction request ghmc web...
3,Permissions,apply building permission online,yes log ghmc official portal upload document o...
4,Certificates,update name birth certificate,update name submit correction request ghmc web...


In [7]:
import pandas as pd

# Read both source datasets from disk
complaints_df = pd.read_csv('complaints.csv')
faqs_df = pd.read_csv('faqs_dataset.csv')

# Print a titled preview of the first rows of each dataset
dataset_previews = (
    ("Complaints Dataset Sample:", complaints_df),
    ("\nFAQs Dataset Sample:", faqs_df),
)
for heading, frame in dataset_previews:
    print(heading)
    print(frame.head())


Complaints Dataset Sample:
        Category                                Issue  \
0          Water                      No water supply   
1  Street Lights  Street light timer not set properly   
2  Street Lights             Flickering street lights   
3          Power                  Frequent power cuts   
4  Street Lights                Street lights too dim   

                                           Complaint  
0  I want to report an issue regarding no water s...  
1  I want to report an issue regarding street lig...  
2  I want to report an issue regarding flickering...  
3  I want to report an issue regarding frequent p...  
4  I want to report an issue regarding street lig...  

FAQs Dataset Sample:
       Category                                           Question  \
0   Permissions  What documents are required for building permi...   
1    Income Tax                What do I get my tax return status?   
2  Certificates  What can I update my name on a birth certificate?  

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources used by the tokenizer and the stop-word filter
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Filled on first use so the stop-word list is loaded once, not once per row
_stop_words_cache = set()


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    if not _stop_words_cache:
        _stop_words_cache.update(stopwords.words('english'))
    return _stop_words_cache


# Preprocessing function
def preprocess_text(text):
    """Lower-case ``text``, tokenize it, and drop punctuation and stop words.

    Tokens are kept only when ``str.isalnum()`` is true for them (so
    punctuation-only tokens are removed) and they are not English stop words.

    Args:
        text: The raw text. Any non-string value (such as ``None`` or a float
            ``NaN`` standing in for a missing cell) is treated as missing.

    Returns:
        The kept tokens joined by single spaces, or ``''`` when ``text`` is
        not a string.
    """
    # A missing value has no ``.lower()`` and would abort the whole ``.apply``.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase, then tokenize
    words = word_tokenize(text.lower())

    # Remove punctuation and stopwords
    stop_words = _english_stop_words()
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Join words back into a single string
    return ' '.join(words)

# Overwrite the raw text columns with their preprocessed versions.
# NOTE: this replaces the original text in place, so re-running the cell
# feeds already-preprocessed text back through preprocess_text.
text_columns_by_frame = (
    (complaints_df, ('Complaint',)),
    (faqs_df, ('Question', 'Answer')),
)
for frame, text_columns in text_columns_by_frame:
    for column in text_columns:
        frame[column] = frame[column].apply(preprocess_text)

# Print a titled preview of each frame to verify the changes
preprocessed_previews = (
    ("\nPreprocessed Complaints:", complaints_df),
    ("\nPreprocessed FAQs:", faqs_df),
)
for heading, frame in preprocessed_previews:
    print(heading)
    print(frame.head())



Preprocessed Complaints:
        Category                                Issue  \
0          Water                      No water supply   
1  Street Lights  Street light timer not set properly   
2  Street Lights             Flickering street lights   
3          Power                  Frequent power cuts   
4  Street Lights                Street lights too dim   

                                           Complaint  
0  want report issue regarding water supply probl...  
1  want report issue regarding street light timer...  
2  want report issue regarding flickering street ...  
3  want report issue regarding frequent power cut...  
4  want report issue regarding street lights dim ...  

Preprocessed FAQs:
       Category                                Question  \
0   Permissions  documents required building permission   
1    Income Tax                   get tax return status   
2  Certificates           update name birth certificate   
3   Permissions       apply building permissi

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import numpy as np

# Load Datasets
complaints_df = pd.read_csv('complaints.csv')
faqs_df = pd.read_csv('faqs_dataset.csv')

# Label each source's text, then stack both into one corpus for a unified vectorizer
complaint_texts = (
    complaints_df[['Complaint']]
    .rename(columns={'Complaint': 'Text'})
    .assign(Category='Complaint')
)
faq_texts = (
    faqs_df[['Question']]
    .rename(columns={'Question': 'Text'})
    .assign(Category='FAQ')
)
combined_data = pd.concat([complaint_texts, faq_texts])

# Text normalization applied to user input before vectorization
def preprocess_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# Step 1: Fit one TF-IDF vocabulary on complaints and FAQ questions together,
# keeping the transformed corpus as the classifier's feature matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_data['Text'])

# Project each dataset into that shared TF-IDF space
complaints_tfidf = vectorizer.transform(complaints_df['Complaint'])
faqs_tfidf = vectorizer.transform(faqs_df['Question'])

# Step 2: Train a Naive Bayes classifier that separates 'Complaint' from 'FAQ'
y = combined_data['Category']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

classifier = MultinomialNB().fit(X_train, y_train)

# Function for Finding the Most Similar FAQ Answer
def find_most_similar_faq(user_input, min_similarity=0.0):
    """Return the answer of the FAQ whose question best matches ``user_input``.

    The input is vectorized with the module-level ``vectorizer`` and compared
    by cosine similarity against ``faqs_tfidf``; the answer is read from the
    matching row of ``faqs_df``.

    Args:
        user_input: The user's question as a string.
        min_similarity: The best score must be strictly greater than this for
            a match to count. The default of 0.0 rejects only inputs that
            share no weighted term with any FAQ question.

    Returns:
        The ``'Answer'`` value of the best-matching FAQ, or the message
        ``"I'm sorry, I can't help with that."`` when no FAQ scores above
        ``min_similarity``.
    """
    user_input_tfidf = vectorizer.transform([user_input])
    # One query row goes in, so the result has a single row of per-FAQ scores.
    scores = cosine_similarity(user_input_tfidf, faqs_tfidf)[0]

    if scores.size == 0:
        return "I'm sorry, I can't help with that."

    best_match_index = int(scores.argmax())
    # When every score is 0, argmax is simply index 0: an unrelated FAQ that
    # must not be presented as the answer.
    if scores[best_match_index] <= min_similarity:
        return "I'm sorry, I can't help with that."

    return faqs_df.iloc[best_match_index]['Answer']

# Improved Complaint Handling
complaint_mode = False

# Inclusive bounds of the numeric part of a complaint ID
_COMPLAINT_ID_MIN = 1000
_COMPLAINT_ID_MAX = 9999

# IDs already handed out since this cell was run, used to avoid duplicates
_issued_complaint_ids = set()


def generate_complaint_id():
    """Return a complaint ID that has not been issued before in this session.

    IDs have the form ``GHMC`` followed by a random four-digit number between
    1000 and 9999. Every issued ID is remembered so two complaints never
    share one.

    Returns:
        A string such as ``"GHMC4821"``.

    Raises:
        RuntimeError: If all 9000 possible IDs have already been issued.
    """
    pool_size = _COMPLAINT_ID_MAX - _COMPLAINT_ID_MIN + 1
    # Without this check the retry loop below would never terminate.
    if len(_issued_complaint_ids) >= pool_size:
        raise RuntimeError("All complaint IDs have been issued.")

    while True:
        candidate = f"GHMC{random.randint(_COMPLAINT_ID_MIN, _COMPLAINT_ID_MAX)}"
        if candidate not in _issued_complaint_ids:
            _issued_complaint_ids.add(candidate)
            return candidate

def chatbot_response(user_input, threshold=0.7):
    """Produce the chatbot's reply to one user message.

    The conversation has two states, tracked by the module-level
    ``complaint_mode`` flag:

    * Normal: the message is classified as ``'Complaint'`` or ``'FAQ'``.
      A complaint switches to complaint mode and asks for details; an FAQ is
      answered via ``find_most_similar_faq``.
    * Complaint mode: the message is taken as the complaint details, an ID is
      generated with ``generate_complaint_id`` and the flag is reset.

    Args:
        user_input: The raw message typed by the user.
        threshold: Minimum class probability required to act on the
            classifier's prediction.

    Returns:
        The reply text to show to the user.
    """
    global complaint_mode

    # Preprocess the input
    user_input = preprocess_text(user_input)

    # Check if we are in the middle of complaint mode
    if complaint_mode:
        # A blank message carries no details, so ask again instead of filing.
        if not user_input.strip():
            return "Please provide further details about your issue."
        complaint_mode = False
        complaint_id = generate_complaint_id()
        return (f"Thank you for providing the details. Your complaint has been filed with ID: {complaint_id}.\n"
                "You can track it on the GHMC website.\nHow can I help you?")

    fallback = "I'm sorry, I can't help with that.\nHow can I help you?"

    input_vector = vectorizer.transform([user_input])
    # An all-zero vector means no term of the message is in the vocabulary,
    # so there is no evidence on which to pick a category.
    if input_vector.nnz == 0:
        return fallback

    # Get prediction probabilities for both categories
    prob = classifier.predict_proba(input_vector)[0]

    # Only act on the prediction when the classifier is confident enough
    if prob.max() < threshold:
        return fallback

    # The most probable class, taken from the probabilities already computed
    category = classifier.classes_[prob.argmax()]

    # If it's a Complaint
    if category == 'Complaint':
        complaint_mode = True
        return "Please provide further details about your issue."

    # If it's an FAQ
    if category == 'FAQ':
        response = find_most_similar_faq(user_input)
        return f"{response}\nHow can I help you?"

    # If it's neither
    return fallback

# Step 5: Interaction Loop
print("Welcome to GHMC Support Chatbot! Type 'exit' to quit.")
print("How can I help you?")  # This will be printed initially
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Closed input or an interrupted kernel ends the chat the same way
        # as typing 'exit', instead of leaving a traceback.
        print("\nThank you for using GHMC Support Chatbot!")
        break

    # Ignore case and surrounding whitespace so ' Exit ' also quits
    if user_input.strip().lower() == 'exit':
        print("Thank you for using GHMC Support Chatbot!")
        break

    response = chatbot_response(user_input)
    print("Chatbot:", response)


Welcome to GHMC Support Chatbot! Type 'exit' to quit.
How can I help you?


You:  how to register birth certifacte


Chatbot: Visit the GHMC website or local office, provide ID documents, and complete the birth certificate application form.
How can I help you?


You:  patholes on road


Chatbot: Please provide further details about your issue.


You:  k


Chatbot: Thank you for providing the details. Your complaint has been filed with ID: GHMC9289.
You can track it on the GHMC website.
How can I help you?


You:  rate a movie salaar


Chatbot: I'm sorry, I can't help with that.
How can I help you?
