<b><h2 style="text-align:center;">COMM493 - Coding AI for Business</h2></b>
<h5 style="text-align:center;">Assignment 2 - Text Classification Via Natural Language Processing</h5>
<h5 style="text-align:center;">Maxwell Brookes - 20244724</h5>
<h5 style="text-align:center;">March 1st, 2025</h5>

### Introduction
**Motivation:** TODO

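**Data:** The <a href="https://www.kaggle.com/datasets/abhishek14398/automatic-ticket-classification-dataset/">Automatic Ticket Classification Dataset</a> contains customer complaint tickets in JSON form; this notebook uses each ticket's complaint text (`_source.complaint_what_happened`) and product category (`_source.product`).
A reference <a href="https://www.kaggle.com/code/abhishek14398/automatic-ticket-classification-case-study-nlp">implementation</a> is available on Kaggle.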


**Goal:** Map each ticket onto its respective department/category. You can then use this data to train any supervised model such as logistic regression, decision tree or random forest. Using this trained model, you can classify any new customer complaint support ticket into its relevant department.

### 0: Set Up Environment

In [1]:
from datetime import datetime
import pandas as pd
import json
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# SageMaker context used by every later cell: the session, the execution role,
# the session's default bucket, and the key prefix for this notebook's artifacts.
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = sagemaker_session.default_bucket()
prefix = 'complaints-classification-optimized'

# Masked-token matcher (runs of X, X-groups joined by "/", digit/X mixes),
# compiled once and case-insensitive so lower-case "xxxx" is matched as well.
REDACTION_PATTERNS = re.compile(
    r'\b(?:X+X|X{2,}(?:/X{2,})+|\d+[-/]?X+|X+[-/]?\d+|X{4,})\b',
    flags=re.IGNORECASE
)

# Ordered (compiled pattern, replacement) pairs; consumers apply them in sequence.
CLEANING_PATTERNS = [
    (re.compile(expression), substitute)
    for expression, substitute in (
        (r'(\\[nt])+', ' '),                    # runs of literal "\n"/"\t" escape text -> one space
        (r'\$ ?(\d+)', r'\1 dollars'),          # "$500" or "$ 500" -> "500 dollars"
        (r'\b(\d+)(?:st|nd|rd|th)\b', r'\1'),   # ordinals: "1st" -> "1"
        (r'[^\w\s]', ''),                       # drop anything that is not a word char or whitespace
    )
]



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


<b><h2 style="text-align:center;">DATA PREPROCESSING</h2></b>

### 1: Load dataset

In [2]:
# Read the raw complaints export and flatten its nested objects into
# dot-separated columns (e.g. `_source.product`).
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# locale's default encoding of the machine running the notebook.
with open('complaints.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.json_normalize(data)
print('Data Shape:', df.shape)

### 2: Clean columns

In [None]:
# Give the two fields this notebook needs short names.
df = df.rename(columns={
    '_source.product': 'category',
    '_source.complaint_what_happened': 'text'
})

# Discard every other column (the surviving columns keep their original order).
columns_to_keep = ['category', 'text']
df = df.drop(columns=[name for name in df.columns if name not in columns_to_keep])

df.columns

### 3: Clean rows and text

In [None]:
# null handling
# Keep only rows where both text and category are present and non-blank.
# Missing values are mapped to '' first: a NaN text yields NaN from
# `.str.strip()`, and NaN is truthy, so a bare `.astype(bool)` check would keep
# the row and it would later be written out as the literal text "nan".
has_text = df['text'].fillna('').str.strip().ne('')
has_category = df['category'].fillna('').str.strip().ne('')
df = df[has_text & has_category].copy()


# text cleaning
def clean_text_column(text_series):
    """Clean a pandas Series of complaint texts in a vectorised way.

    Every REDACTION_PATTERNS match is replaced with '[REDACTED]', then each
    (pattern, replacement) pair in CLEANING_PATTERNS is applied in order, and
    finally the text is lower-cased and stripped.

    Parameters
    ----------
    text_series : pandas.Series
        Series of strings to clean.

    Returns
    -------
    pandas.Series
        The cleaned strings, same index as the input.
    """
    # regex=True must be explicit: with the pandas 2.x default (regex=False),
    # passing a compiled pattern raises
    # "Cannot use a compiled regex as replacement pattern with regex=False".
    cleaned = text_series.str.replace(REDACTION_PATTERNS, '[REDACTED]', regex=True)
    for pattern, replacement in CLEANING_PATTERNS:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    return cleaned.str.lower().str.strip()


# Normalise the complaint text in place.
df['text'] = clean_text_column(df['text'])


# Class balancing: the target size is the 85th percentile of the per-category
# row counts, but never below 100. Categories smaller than the target are
# upsampled with replacement; larger ones are left untouched.
# NOTE(review): upsampling happens before the train/validation split in the
# next section, so copies of one row can land on both sides of that split —
# confirm whether validation metrics are meant to include such duplicates.
category_counts = df['category'].value_counts()
min_samples = int(max(100, category_counts.quantile(0.85)))
balanced_dfs = [
    members
    if len(members) >= min_samples
    else resample(members, replace=True, n_samples=min_samples, random_state=123)
    for _, members in df.groupby('category')
]

df = pd.concat(balanced_dfs, ignore_index=True)
print(f"Balanced Data Shape: {df.shape}")

# Preview
df.head()

### 4: Split and write training files

In [None]:
# Hold out 20% of the rows for validation, stratified on the category column,
# with a fixed seed so the split is repeatable.
split_options = {
    'test_size': 0.2,
    'random_state': 123,
    'stratify': df['category'],
}
train_data, validation_data = train_test_split(df, **split_options)

def format_blazingtext(df):
    """Render a DataFrame as labelled text, one record per line.

    Each row becomes ``__label__<category> <text>``, where spaces in the
    category are replaced by underscores. Rows are joined with a newline.

    All whitespace inside the text (including real line breaks and tabs) is
    collapsed to single spaces, so that a multi-line complaint cannot spill
    onto extra lines that would carry no label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'category' and 'text' columns.

    Returns
    -------
    str
        The formatted records, without a trailing newline.
    """
    lines = []
    for cat, txt in zip(df['category'], df['text']):
        label = cat.replace(' ', '_')
        single_line_text = ' '.join(str(txt).split())
        lines.append(f"__label__{label} {single_line_text}")
    return '\n'.join(lines)

# Batch write formatted data: train.txt and validation.txt in the working directory.
# The loop variable is `split_df` so the notebook-level `data` object created
# in the load cell is not overwritten, and the files are written as UTF-8
# regardless of the locale's default encoding.
for name, split_df in [('train', train_data), ('validation', validation_data)]:
    with open(f'{name}.txt', 'w', encoding='utf-8') as f:
        f.write(format_blazingtext(split_df))

<b><h2 style="text-align:center;">MODEL TRAINING</h2></b>

### 5: Upload Data to S3

In [None]:
# A minute-resolution timestamp gives every run its own key prefix in the bucket.
version = datetime.now().strftime("%Y%m%d-%H%M")
s3_prefix = f"{prefix}/{version}"

# Each local file goes under its own sub-prefix: <s3_prefix>/train and <s3_prefix>/validation.
sagemaker_session.upload_data(
    path='train.txt',
    bucket=bucket,
    key_prefix=f'{s3_prefix}/train',
)
sagemaker_session.upload_data(
    path='validation.txt',
    bucket=bucket,
    key_prefix=f'{s3_prefix}/validation',
)

### 6: Train the BlazingText Model

In [None]:
# Resolve the BlazingText container image for the current region.
# `sagemaker.image_uris.retrieve` is the SDK v2 replacement for the deprecated
# `get_image_uri` helper.
region_name = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(
    framework='blazingtext',
    region=region_name,
    version='1'
)

# Hyperparameters for supervised (text classification) mode.
# The hash-bucket count is spelled `buckets`; the former `bucket` and `threads`
# keys are not BlazingText hyperparameters — NOTE(review): verify against the
# BlazingText hyperparameter reference that unknown keys are rejected when the
# training job is created.
hyperparams = {
    "mode": "supervised",
    "epochs": 50,
    "learning_rate": 0.1,
    "min_count": 2,
    "vector_dim": 300,
    "word_ngrams": 3,
    "buckets": 200000,
    "early_stopping": True,
    "patience": 5
}

# Single-instance estimator; model artifacts go under this run's versioned prefix.
bt_estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{s3_prefix}/output',
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparams
)

# Start training on the files uploaded for this run; the 'validation' channel
# is supplied alongside 'train' (early_stopping/patience are set above).
bt_estimator.fit({'train': f's3://{bucket}/{s3_prefix}/train/train.txt',
                  'validation': f's3://{bucket}/{s3_prefix}/validation/validation.txt'})

### 7: Deploy the Model for Inference

In [None]:
# Deploy the trained model behind a real-time endpoint named after this run;
# requests and responses are exchanged as JSON.
endpoint_name = f'complaint-classifier-{version}'
bt_predictor = bt_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    endpoint_name=endpoint_name
)

# Register the endpoint's AllTraffic variant as a scalable target (1 to 3 instances).
# NOTE(review): no scaling policy is attached anywhere in this notebook —
# confirm whether one is expected for the instance count to actually change.
client = boto3.client('application-autoscaling')
client.register_scalable_target(
    ServiceNamespace='sagemaker',
    ResourceId=f'endpoint/{endpoint_name}/variant/AllTraffic',
    ScalableDimension='sagemaker:variant:DesiredInstanceCount',
    MinCapacity=1,
    MaxCapacity=3
)

### 8: Query Model

In [None]:
def clean_single_text(text):
    """Clean one complaint string the same way the training text was cleaned.

    Non-string input yields ''. Otherwise REDACTION_PATTERNS matches become
    '[REDACTED]', every CLEANING_PATTERNS pair is applied in order, and the
    result is lower-cased and stripped.
    """
    if not isinstance(text, str):
        return ''
    # Redaction first, then the ordered cleaning substitutions.
    steps = [(REDACTION_PATTERNS, '[REDACTED]'), *CLEANING_PATTERNS]
    result = text
    for regex, substitute in steps:
        result = regex.sub(substitute, result)
    return result.lower().strip()


def predict_complaint(text):
    """Classify one complaint with the deployed endpoint.

    The text is cleaned with `clean_single_text`, truncated to its first 512
    whitespace-separated tokens and sent to `bt_predictor`.

    Parameters
    ----------
    text : str
        Raw complaint text.

    Returns
    -------
    str
        The predicted label without its '__label__' prefix, or
        "Classification failed" when nothing can be classified or the
        endpoint call fails (the error is printed, not raised).
    """
    cleaned = clean_single_text(text)
    tokens = cleaned.split()[:512]  # Truncate to 512 tokens
    if not tokens:
        # Nothing left after cleaning: skip the endpoint call.
        print("Prediction error: no text to classify")
        return "Classification failed"
    # BlazingText expects each instance to be ONE string of space-separated
    # tokens, not a list of tokens.
    payload = {"instances": [" ".join(tokens)]}
    try:
        response = bt_predictor.predict(payload)
        label = response[0]['label']
        # The endpoint returns the top labels as a list; take the first one.
        if isinstance(label, (list, tuple)):
            label = label[0]
        return label.replace('__label__', '')
    except Exception as e:
        # Deliberate best-effort: report the failure and return a sentinel.
        print(f"Prediction error: {str(e)}")
        return "Classification failed"


# Smoke test: show the raw sample, its cleaned form, and the endpoint's prediction.
sample = "Unauthorized $500 charge on XXXX-1234 account"
print(f"Original text: {sample}")
cleaned_sample = clean_single_text(sample)
print(f"Cleaned text: {cleaned_sample}")
predicted_label = predict_complaint(sample)
print(f"Prediction: {predicted_label}")

### 9: Clean Up

In [None]:
# Delete endpoint to avoid ongoing charges.
# After this call `bt_predictor` can no longer serve predictions, so run it
# only once the query cell above is finished.
# NOTE(review): the scalable target registered in the deploy cell is not
# deregistered here — confirm whether that needs explicit cleanup.
bt_predictor.delete_endpoint()