### Data Collection and Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

#### Step 1: Read and load the dataset

In [2]:
# Relative path of the CSV file holding the utterance corpus read below
dataset_path = 'dataset/20000-Utterances-Training-dataset.csv' 
df = pd.read_csv(dataset_path)

#### Step 2: Explore and analyze the dataset

In [3]:
# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account


In [4]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21534 entries, 0 to 21533
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   flags      21534 non-null  object
 1   utterance  21534 non-null  object
 2   category   21534 non-null  object
 3   intent     21534 non-null  object
dtypes: object(4)
memory usage: 673.1+ KB


#### Step 3: Handle missing data and clean the text

In [5]:
# Function to clean the text by removing noise
def clean_text(text):
    """Normalise one utterance for downstream tokenization.

    Parameters
    ----------
    text : str or missing value
        Raw utterance. ``None`` and float NaN are treated as missing;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        Lower-cased text containing only ASCII letters, digits and single
        spaces, with no leading/trailing whitespace. Missing input yields ''.
    """
    # Missing cells would otherwise make re.sub raise TypeError
    # (text != text is only true for NaN).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Remove any non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
# Run clean_text over every value of the 'utterance' column, replacing the column
df['utterance'] = df['utterance'].map(clean_text)

#### Step 4: Preprocess the text

In [7]:
# Tokenization: download the punkt tokenizer, then split each cleaned
# utterance into a list of word tokens
nltk.download('punkt')
df['tokenized_utterance'] = df['utterance'].map(nltk.word_tokenize)

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Stopword removal: download the stopword lists and keep only the tokens
# that are not in the English list
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['filtered_utterance'] = df['tokenized_utterance'].map(
    lambda words: [word for word in words if word not in stop_words]
)

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Stemming or Lemmatization (choose one): the Porter stemmer is the active
# choice; the lemmatizer alternative is left commented out.
stemmer = PorterStemmer()
# lemmatizer = WordNetLemmatizer()

In [10]:
# Stem every token that survived stopword removal
df['processed_utterance'] = df['filtered_utterance'].map(
    lambda words: [stemmer.stem(word) for word in words]
)
# Lemmatization alternative:
# df['processed_utterance'] = df['filtered_utterance'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

#### Step 5: Encode linguistic flags

In [11]:
# Map each linguistic flag letter to a numerical code; the codes run from 1
# upward in the order the letters are listed here
linguistic_flags_mapping = dict(zip(
    ('B', 'S', 'L', 'M', 'I', 'C', 'P', 'Q', 'W', 'E', 'D', 'Z'),
    range(1, 13),
))

In [12]:
# Function to encode linguistic flags
def encode_linguistic_flags(flags):
    """Return the numerical code of each flag in ``flags``, in order.

    Codes are looked up in ``linguistic_flags_mapping``; a flag that is not
    a key of that mapping raises ``KeyError``.
    """
    codes = []
    for flag in flags:
        codes.append(linguistic_flags_mapping[flag])
    return codes

In [13]:
# Turn each flag string into a list of its individual characters before encoding
df['flags'] = df['flags'].map(list)

In [14]:
# Encode every row's flag list with encode_linguistic_flags
df['encoded_flags'] = df['flags'].map(encode_linguistic_flags)

In [15]:
# Display the first rows of the preprocessed dataset, including the new columns
df.head()

Unnamed: 0,flags,utterance,category,intent,tokenized_utterance,filtered_utterance,processed_utterance,encoded_flags
0,"[B, I, L, C]",i dont have an online account what do i have t...,ACCOUNT,create_account,"[i, dont, have, an, online, account, what, do,...","[dont, online, account, register]","[dont, onlin, account, regist]","[1, 5, 3, 6]"
1,"[B, I, L, Q, Z]",can you tell me if i can regisger two accounts...,ACCOUNT,create_account,"[can, you, tell, me, if, i, can, regisger, two...","[tell, regisger, two, accounts, single, email,...","[tell, regisg, two, account, singl, email, add...","[1, 5, 3, 8, 12]"
2,"[B, P, L, C]",i have no online account open one please,ACCOUNT,create_account,"[i, have, no, online, account, open, one, please]","[online, account, open, one, please]","[onlin, account, open, one, pleas]","[1, 7, 3, 6]"
3,"[B, I, P, L, D]",could you ask an agent how to open an account ...,ACCOUNT,create_account,"[could, you, ask, an, agent, how, to, open, an...","[could, ask, agent, open, account, please]","[could, ask, agent, open, account, pleas]","[1, 5, 7, 3, 11]"
4,"[B, L, Q, C]",i want an online account create one,ACCOUNT,create_account,"[i, want, an, online, account, create, one]","[want, online, account, create, one]","[want, onlin, account, creat, one]","[1, 3, 8, 6]"


### Intent Recognition Model

In [16]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

#### Step 1: Split the dataset into training and test sets (validation is done via cross-validation in Step 5)

In [17]:
# Features are the processed utterances; labels are the intents
X, y = df['processed_utterance'], df['intent']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Step 2: Convert processed utterances to numerical vectors using TF-IDF vectorizer

In [18]:
# Token lists are joined back into space-separated strings before vectorizing.
# The vectorizer is fitted on the training split only and reused for the test split.
tfidf_vectorizer_intent = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer_intent.fit_transform(X_train.map(' '.join))
X_test_tfidf = tfidf_vectorizer_intent.transform(X_test.map(' '.join))

#### Step 3: Define the hyperparameter grid for SVM

In [19]:
# Search space for the SVM: seven log-spaced values of the regularization
# parameter C and two kernel types
param_dist = dict(
    C=np.logspace(-3, 3, num=7),
    kernel=['linear', 'rbf'],
)

#### Step 4: Initialize the SVM classifier

In [20]:
# Base estimator for the search; C and kernel are supplied by the search below
svm_classifier = SVC()

#### Step 5: Hyperparameter Optimization using RandomizedSearchCV

In [21]:
# Sample 10 parameter settings from param_dist, scoring each with 3-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=svm_classifier,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    random_state=42,
)
random_search.fit(X_train_tfidf, y_train)

#### Step 6: Get the best hyperparameters and the best model

In [22]:
# Pull the winning parameter setting and the estimator fitted with it out of the search
best_params, best_model = (
    random_search.best_params_,
    random_search.best_estimator_,
)

#### Step 7: Evaluate the best model on the test set

In [23]:
# Predict intents for the TF-IDF features of the test split
y_pred = best_model.predict(X_test_tfidf)

In [24]:
# Compare test-set predictions with the true intents and report the result
# next to the hyperparameters chosen by the search
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Best Hyperparameters:", best_params)
print("Best Model Accuracy:", accuracy)

Best Hyperparameters: {'kernel': 'linear', 'C': 10.0}
Best Model Accuracy: 0.9881588112375204


In [25]:
# Per-intent precision, recall and F1 for a more detailed evaluation
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Classification Report:
                          precision    recall  f1-score   support

            cancel_order       1.00      1.00      1.00        10
            change_order       0.99      0.98      0.99       183
 change_shipping_address       1.00      1.00      1.00        25
  check_cancellation_fee       1.00      1.00      1.00        66
          check_invoices       0.90      0.98      0.94       208
   check_payment_methods       0.98      0.96      0.97        52
     check_refund_policy       1.00      0.98      0.99        92
               complaint       1.00      1.00      1.00       146
contact_customer_service       1.00      1.00      1.00       430
     contact_human_agent       1.00      1.00      1.00       198
          create_account       0.99      1.00      0.99       367
          delete_account       0.99      0.98      0.98       204
        delivery_options       0.97      0.97      0.97        74
         delivery_period       1.00      0.97      0

#### Step 8: Save the best model

In [26]:
import os

import joblib

# Create the output directory first: writing into a missing directory fails
os.makedirs("models", exist_ok=True)

# Save the trained model to a file
joblib.dump(best_model, "models/intent_recognition_model.pkl")

# Save the TF-IDF vectorizer to a file
joblib.dump(tfidf_vectorizer_intent, "models/tfidf_vectorizer_intent.pkl")

['models/tfidf_vectorizer_intent.pkl']

### Category Classification Model

#### Step 1: Split the dataset into training and test sets (validation is done via cross-validation in Step 5)

In [27]:
# Features are the processed utterances; labels are the high-level intent categories
X, y_category = df['processed_utterance'], df['category']
X_train, X_test, y_train_category, y_test_category = train_test_split(
    X,
    y_category,
    test_size=0.2,
    random_state=42,
)

#### Step 2: Convert processed utterances to numerical vectors using TF-IDF vectorizer

In [28]:
# Token lists are joined back into space-separated strings before vectorizing.
# The vectorizer is fitted on the training split only and reused for the test split.
tfidf_vectorizer_category = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer_category.fit_transform(X_train.map(' '.join))
X_test_tfidf = tfidf_vectorizer_category.transform(X_test.map(' '.join))

#### Step 3: Define the hyperparameter grid for SVM

In [29]:
# Search space for the SVM: seven log-spaced values of the regularization
# parameter C and two kernel types
param_dist = dict(
    C=np.logspace(-3, 3, num=7),
    kernel=['linear', 'rbf'],
)

#### Step 4: Initialize the SVM classifier

In [30]:
# Base estimator for the category search; C and kernel are supplied by the search below
svm_classifier_category = SVC()

#### Step 5: Hyperparameter Optimization using RandomizedSearchCV

In [31]:
# Sample 10 parameter settings from param_dist, scoring each with 3-fold cross-validation
random_search_category = RandomizedSearchCV(
    estimator=svm_classifier_category,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    random_state=42,
)
random_search_category.fit(X_train_tfidf, y_train_category)

#### Step 6: Get the best hyperparameters and the best model for category classification

In [32]:
# Pull the winning parameter setting and the estimator fitted with it out of the search
best_params_category, best_model_category = (
    random_search_category.best_params_,
    random_search_category.best_estimator_,
)

#### Step 7: Evaluate the best model on the test set for category classification (Using best_model_category for prediction)


In [33]:
# Predict categories for the TF-IDF features of the test split
y_pred_category = best_model_category.predict(X_test_tfidf)

In [34]:
# Compare test-set predictions with the true categories and report the result
# next to the hyperparameters chosen by the search
accuracy_category = accuracy_score(y_true=y_test_category, y_pred=y_pred_category)
print("Best Hyperparameters for Category Classification:", best_params_category)
print("Category Classification Accuracy:", accuracy_category)

Best Hyperparameters for Category Classification: {'kernel': 'linear', 'C': 1000.0}
Category Classification Accuracy: 0.9960529370791734


In [35]:
# Per-category precision, recall and F1 for a more detailed evaluation
print("Classification Report for Category Classification:")
print(classification_report(y_true=y_test_category, y_pred=y_pred_category))

Classification Report for Category Classification:
                  precision    recall  f1-score   support

         ACCOUNT       1.00      1.00      1.00       904
CANCELLATION_FEE       1.00      0.98      0.99        66
         CONTACT       1.00      1.00      1.00       628
        DELIVERY       0.99      0.96      0.98       106
        FEEDBACK       1.00      1.00      1.00       247
        INVOICES       1.00      1.00      1.00       477
      NEWSLETTER       1.00      1.00      1.00        49
           ORDER       0.98      1.00      0.99       459
         PAYMENT       1.00      1.00      1.00       946
         REFUNDS       0.99      0.99      0.99       375
        SHIPPING       1.00      0.98      0.99        50

        accuracy                           1.00      4307
       macro avg       1.00      0.99      0.99      4307
    weighted avg       1.00      1.00      1.00      4307



#### Step 8: Save the best model

In [36]:
import os

import joblib

# Create the output directory first: writing into a missing directory fails
os.makedirs("models", exist_ok=True)

# Save the trained model to a file
joblib.dump(best_model_category, "models/category_classification_model.pkl")

# Save the TF-IDF vectorizer to a file
joblib.dump(tfidf_vectorizer_category, "models/tfidf_vectorizer_category.pkl")

['models/tfidf_vectorizer_category.pkl']

### Integration with Virtual Assistant Framework

#### Step 1: Load the trained models

In [37]:
# Load the trained Intent Recognition model from the file
intent_recognition_model = joblib.load("models/intent_recognition_model.pkl")

# Load the TF-IDF vectorizer for Intent Recognition from the file
tfidf_vectorizer_intent = joblib.load("models/tfidf_vectorizer_intent.pkl")

# Load the trained Category Classification model from the file.
# predict_category reads `category_classification_model`, so it must be
# defined here for the assistant to run.
category_classification_model = joblib.load("models/category_classification_model.pkl")

# Load the TF-IDF vectorizer for Category Classification from the file
tfidf_vectorizer_category = joblib.load("models/tfidf_vectorizer_category.pkl")

In [38]:
# Canned reply for each intent label. virtual_assistant looks the predicted
# intent up in this dictionary and falls back to a generic message when the
# key is missing, so keys must match the intent labels exactly.
intent_responses = {
    "cancel_order": "Your order has been canceled successfully.",
    "change_order": "Your order has been updated.",
    "change_shipping_address": "Your shipping address has been changed.",
    "check_cancellation_fee": "The cancellation fee for your order is $10.",
    "check_invoices": "You can view and download your invoices from your account dashboard.",
    "check_payment_methods": "We accept various payment methods, including credit cards, debit cards, and PayPal.",
    "check_refund_policy": "Our refund policy allows for full refunds within 30 days of purchase.",
    "complaint": "I'm sorry to hear that you're facing an issue. Please provide more details, and we'll assist you.",
    "contact_customer_service": "You can reach our customer service team at [Phone Number] or [Email Address].",
    "contact_human_agent": "One of our human agents will be happy to assist you shortly.",
    "create_account": "You can create an online account on our website. Please visit the 'Create Account' page to get started.",
    "delete_account": "Your account has been deleted successfully.",
    "delivery_options": "You can choose from standard or express shipping options during checkout.",
    "delivery_period": "The estimated delivery time for your order is 2-3 business days.",
    "edit_account": "You can edit your account information on the 'Account Settings' page.",
    "get_invoice": "You can view and download your invoice for the latest order in your account dashboard.",
    "get_refund": "Your refund request has been processed, and the amount will be credited back to your original payment method.",
    "newsletter_subscription": "Thank you for subscribing to our newsletter!",
    "payment_issue": "Please provide more details about the payment issue, and we'll assist you.",
    "place_order": "Your order has been successfully placed.",
    "recover_password": "You can reset your password by clicking on the 'Forgot Password' link on the login page.",
    "registration_problems": "I'm sorry to hear that you're experiencing registration issues. Please provide more details, and we'll assist you.",
    "review": "Thank you for your review! Your feedback is valuable to us.",
    "set_up_shipping_address": "You can add or update your shipping address in your account settings.",
    "switch_account": "You can switch to a different account by logging out and then logging in with the desired account credentials.",
    "track_order": "You can track your order using the tracking number provided in the order confirmation email.",
    "track_refund": "Your refund request is being processed, and you will receive a confirmation email once it's completed."
}


In [39]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def preprocess_text(text):
    """Turn a raw user message into the stemmed tokens the models were trained on.

    The cleaning steps mirror the training pipeline (clean_text, tokenization,
    English stopword removal, Porter stemming), so that inference-time
    features match the fitted TF-IDF vocabularies. In particular, digits are
    kept and every character outside ASCII letters, digits and whitespace is
    dropped, exactly as done for the training utterances.

    Parameters
    ----------
    text : str
        Raw user input.

    Returns
    -------
    list of str
        Stemmed, stopword-free tokens.
    """
    # Same two regex passes as clean_text: strip non-alphanumerics, lowercase,
    # then collapse runs of whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Apply the same stemmer that produced the training tokens
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens


#### Step 2: Create a function to handle user inputs and predict the intent

In [40]:
def predict_intent(user_input):
    """Return the intent label predicted for a raw user message.

    The message is preprocessed into tokens, re-joined with spaces,
    vectorized with ``tfidf_vectorizer_intent`` and classified by
    ``intent_recognition_model``.
    """
    document = ' '.join(preprocess_text(user_input))
    features = tfidf_vectorizer_intent.transform([document])
    return intent_recognition_model.predict(features)[0]

#### Step 3: Create a function to predict the category

In [41]:
def predict_category(user_input):
    """Return the category label predicted for a raw user message.

    The message is preprocessed into tokens, re-joined with spaces,
    vectorized with ``tfidf_vectorizer_category`` and classified by
    ``category_classification_model``.
    """
    document = ' '.join(preprocess_text(user_input))
    features = tfidf_vectorizer_category.transform([document])
    return category_classification_model.predict(features)[0]

#### Step 4: Implement the conversation flow

In [42]:
def virtual_assistant():
    """Run an interactive console conversation until the user types "exit".

    Each message is classified with predict_intent and predict_category, and
    the canned reply for the predicted intent is printed. Blank messages are
    answered with the generic fallback instead of being classified, and a
    closed input stream or an interrupt ends the session like "exit" does.
    """
    fallback = "I'm sorry, I don't understand."
    print("Virtual Assistant: Hi! How can I assist you today?")

    while True:
        try:
            # Surrounding whitespace is ignored, so " exit " also quits
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or cell interrupted: leave without a traceback
            print("Virtual Assistant: Goodbye!")
            break

        if user_input.lower() == "exit":
            print("Virtual Assistant: Goodbye!")
            break

        if not user_input:
            # A blank message has no tokens to classify; answering it with a
            # predicted intent would be arbitrary.
            print("Virtual Assistant:", fallback)
            continue

        # Predict the user intent using the intent recognition model
        predicted_intent = predict_intent(user_input)

        # Predict the category for the user input using the category
        # classification model (the reply below is keyed on the intent only)
        predicted_category = predict_category(user_input)

        # Retrieve the response for the predicted intent based on the intent_responses dictionary
        response = intent_responses.get(predicted_intent, fallback)

        # Print the response
        print("Virtual Assistant:", response)

#### Step 5: Run the virtual assistant

In [None]:
# Start the interactive session; type "exit" at the prompt to end it
virtual_assistant()