In [1]:
# SETUP: Run this cell first
!pip install gensim scikit-learn numpy matplotlib

import numpy as np
import gensim.downloader as api
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


print("Setup complete!")

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
Setup complete!


In [2]:
# Load pre-trained Word2Vec embeddings (this may take 1-2 minutes)
# NOTE(review): `word_vectors` is not referenced by any later cell visible in
# this notebook -- the chatbot below trains its own Word2Vec model on the
# dataset instead. Confirm whether this download is still needed.
print("Loading pre-trained embeddings...")
word_vectors = api.load('word2vec-google-news-300')
print(f"Loaded! Vocabulary size: {len(word_vectors)} words")
print(f"Each word is represented by a vector of {word_vectors.vector_size} numbers")

Loading pre-trained embeddings...
Loaded! Vocabulary size: 3000000 words
Each word is represented by a vector of 300 numbers


# Customer Service Chatbot using Word2Vec

In [4]:
# Mount Google Drive (Colab only) so the dataset CSV under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import re

# Location of the dataset on the mounted Google Drive; edit here if the file moves.
DATA_PATH = "/content/drive/MyDrive/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"

df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message if a column used by later cells is absent
# ("instruction" is tokenized, "intent" is the label, "response" is the reply text).
REQUIRED_COLUMNS = {"instruction", "intent", "response"}
missing_columns = REQUIRED_COLUMNS - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

def clean(text):
    """Normalize a sentence and split it into a list of tokens.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    the space character is deleted (so "can't" becomes "cant"), and the
    result is split on whitespace.

    Args:
        text: The raw sentence. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for an empty CSV cell) are treated as empty.

    Returns:
        list[str]: The tokens, or an empty list when there is nothing to
        tokenize.
    """
    # Missing cells arrive as float NaN / None; .lower() would raise on them.
    if not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r"[^a-z0-9 ]", "", text)
    return text.split()

# Run clean() over every instruction and store the token lists in a "tokens" column
df["tokens"] = df["instruction"].map(clean)

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The Word2Vec model and the
# nearest-neighbour matrix below are built from train_df only, so test_df
# stays unseen until scoring. random_state fixes the shuffle.
# NOTE(review): the split is not stratified by intent; consider
# stratify=df["intent"] if per-intent balance matters -- confirm.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

In [7]:
from gensim.models import Word2Vec


# Train a Word2Vec model on the training sentences only.
#   vector_size=100 : embedding width, chosen to capture nuanced semantic
#                     meaning without overfitting
#   window=5        : context window; typical queries are short and repetitive
#   min_count=2     : words that appear fewer than 2 times are dropped from
#                     the vocabulary
#   workers=4       : number of training threads
#   seed=42         : fixes the random initialisation so reruns start from the
#                     same state. With workers > 1, thread scheduling can
#                     still cause small run-to-run differences; use workers=1
#                     if exactly repeatable vectors are required.
model = Word2Vec(
    sentences=train_df["tokens"],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    seed=42
)

In [8]:
import numpy as np

def sentence_vector(tokens, wv=None):
    """Embed a tokenized sentence as the mean of its word vectors.

    Args:
        tokens: Iterable of word strings.
        wv: Optional keyed-vector lookup supporting ``word in wv``,
            ``wv[word]`` and a ``vector_size`` attribute. Defaults to the
            notebook's trained ``model.wv``.

    Returns:
        numpy.ndarray: 1-D array of length ``wv.vector_size``. It is the
        element-wise mean of the vectors of the tokens found in the
        vocabulary, or all zeros when no token is found.
    """
    if wv is None:
        wv = model.wv

    # Out-of-vocabulary tokens are skipped rather than raising KeyError.
    vectors = [wv[word] for word in tokens if word in wv]

    # Take the width from the vectors instead of hard-coding it, so the
    # fallback always has the same length as a real embedding.
    if not vectors:
        return np.zeros(wv.vector_size)

    return np.mean(vectors, axis=0)

In [20]:
# Embed every tokenized sentence and keep the vector alongside its row.
train_df["vec"] = train_df["tokens"].apply(sentence_vector)
test_df["vec"]  = test_df["tokens"].apply(sentence_vector)

# Stack the per-row vectors into 2-D matrices (one row per sentence).
# X_train is the lookup matrix used by predict_intent().
# NOTE(review): X_test is not used by the cells visible here -- confirm.
X_train = np.vstack(train_df["vec"].values)
X_test  = np.vstack(test_df["vec"].values)

In [10]:
#Order number extraction & standardization
import re

def extract_order_number(text):
    """Return the first order-number-like token in ``text``, upper-cased.

    A candidate is a whole word made of optional letters, at least one digit,
    then any further letters or digits (e.g. ``58219``, ``ZX00122``). The
    search runs on the upper-cased text, so the returned value is always
    upper case.

    Returns:
        str | None: The first candidate found, or ``None`` when there is none.
    """
    first_hit = re.search(r'\b[A-Z]*\d+[A-Z0-9]*\b', text.upper())
    return first_hit.group(0) if first_hit is not None else None

In [11]:
# Fake order records for testing, keyed by order number.
# Keys are upper case because extract_order_number() upper-cases what it returns.
# Fields read by chatbot(): "status", "delivery", "item", "refundable".
orders = {
    "58219": {
        "status": "Shipped",
        "delivery": "Feb 25",
        "item": "Laptop",
        "refundable": False
    },
    "ZX00122": {
        "status": "Delivered",
        "delivery": "Feb 20",
        "item": "Headphones",
        "refundable": True
    },
    "ABX9123": {
        "status": "Processing",
        "delivery": "Feb 27",
        "item": "Monitor",
        "refundable": True
    }
}

In [12]:

# Intents for which chatbot() insists on a known order number before replying.
# NOTE(review): "update_address" has no dedicated branch in chatbot(), so once
# the order number is validated it falls through to the dataset response --
# confirm that is intended. Also verify these names match the dataset's
# intent labels.
ORDER_REQUIRED_INTENTS = [
    "track_order",
    "cancel_order",
    "refund_order",
    "order_status",
    "update_address"
]

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Tokens removed from the user's query before it is embedded.
STOPWORDS = ["order"]

def predict_intent(user_input):
    """Predict the intent of a user message by nearest-neighbour lookup.

    The message is stripped of its order number (if any) and of STOPWORDS,
    embedded with sentence_vector(), and compared by cosine similarity with
    every row of X_train. The intent of the most similar training sentence
    is returned.

    Args:
        user_input: Raw user message.

    Returns:
        The "intent" value of the closest row in train_df.

    Notes:
        If no token of the query is in the Word2Vec vocabulary, the query
        vector is all zeros and the chosen row is effectively arbitrary.
        NOTE(review): STOPWORDS are removed from the query only; the training
        vectors in X_train were built from unfiltered tokens -- confirm this
        asymmetry is intended.
    """
    #Extract order number
    order_num = extract_order_number(user_input)

    cleaned_input = user_input
    if order_num:
        # extract_order_number() returns the ID upper-cased, so a
        # case-sensitive str.replace() would leave e.g. "zx00122" in the
        # query. Remove every occurrence regardless of case.
        cleaned_input = re.sub(
            re.escape(order_num), "", cleaned_input, flags=re.IGNORECASE
        )

    tokens = [
        t for t in clean(cleaned_input)
        if t not in STOPWORDS
    ]

    # cosine_similarity expects 2-D input: one row for the single query.
    user_vec = sentence_vector(tokens).reshape(1, -1)

    sims = cosine_similarity(user_vec, X_train)
    # sims has a single row, so the flat argmax is the training-row position.
    best_match = sims.argmax()

    return train_df.iloc[best_match]["intent"]

In [22]:
# Smoke test: "58219" is stripped as the order number and "order" is a stopword,
# so only "cancel" is embedded and matched.
predict_intent("cancel order 58219")

'cancel_order'

In [24]:
# Score predict_intent() on the held-out rows.
# Ground-truth labels, in test_df row order.
y_true = test_df["intent"].tolist()

# Each query is rebuilt by joining the row's cleaned tokens with spaces,
# then classified exactly like a live user message.
y_pred = [
    predict_intent(" ".join(row_tokens))
    for row_tokens in test_df["tokens"]
]

In [25]:
from sklearn.metrics import accuracy_score

# Fraction of held-out queries whose predicted intent equals the true label.
acc = accuracy_score(y_true, y_pred)
print("Intent Classification Accuracy:", acc)

Intent Classification Accuracy: 0.9484651162790698


In [27]:
def chatbot(user_input):
    """Answer a customer message.

    The intent is predicted with predict_intent(). For intents listed in
    ORDER_REQUIRED_INTENTS the message must contain an order number that
    exists in ``orders``; track / cancel / refund / status intents then get
    a reply built from that order record. Every other case returns the
    first dataset response stored for the predicted intent.

    Args:
        user_input: Raw user message.

    Returns:
        str: The reply text.
    """
    order_num = extract_order_number(user_input)
    intent = predict_intent(user_input)

    # Retrieve the response: first training row carrying the predicted intent
    rows_for_intent = train_df[train_df["intent"] == intent]
    response = rows_for_intent.iloc[0]["response"]

    # Intents that do not need an order are answered straight from the dataset.
    if intent not in ORDER_REQUIRED_INTENTS:
        return response

    if not order_num:
        return "Please provide your order number so I can assist you."

    if order_num not in orders:
        return f"I couldn't find order {order_num}. Please double check it."

    order = orders[order_num]

    if intent == "track_order":
        return f"Order {order_num} is currently {order['status']} and expected by {order['delivery']}."

    if intent == "cancel_order":
        # Shipped orders can no longer be cancelled.
        if order["status"] == "Shipped":
            return f"Order {order_num} has already shipped and cannot be cancelled."
        return f"Order {order_num} has been cancelled successfully."

    if intent == "refund_order":
        if order["refundable"]:
            return f"Refund for order {order_num} has been initiated."
        return f"Order {order_num} is not eligible for refund."

    if intent == "order_status":
        return f"Order {order_num} contains a {order['item']} and is currently {order['status']}."

    # Order-required intent without a dedicated branch: use the dataset reply.
    return response

In [29]:
# End-to-end check: order 58219 has status "Shipped" in `orders`, so a
# cancel_order request is refused.
chatbot('cancel order 58219')

'Order 58219 has already shipped and cannot be cancelled.'