In [8]:
import pandas as pd
import numpy as np
from pathlib import Path

# Path to your UTF-8 dataset
data_path = Path("..") / "data" / "raw" / "final_dataset_utf8.csv"

# Try UTF-8 first, fall back to Windows-friendly encoding if needed
try:
    df = pd.read_csv(data_path, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(data_path, encoding="latin1")  # or "cp1252"

print(f"Loaded dataset from: {data_path}")
print(f"Shape of dataset: {df.shape}")
print("Columns:", df.columns.tolist())

df.head()


Loaded dataset from: ..\data\raw\final_dataset_utf8.csv
Shape of dataset: (10400, 4)
Columns: ['SNO', 'text', 'category', 'priority']


Unnamed: 0,SNO,text,category,priority
0,1,I can't log into WiFi from my tablet. It retur...,Hardware Issue,critical
1,2,Database keeps crashing on my printer. Code ER...,Hardware Issue,high
2,3,Outlook fails to load on my laptop showing ERR42.,Hardware Issue,high
3,4,"I am unable to access Zoom on my desktop, it s...",Account/Access Issue,high
4,5,"I am unable to access CRM Portal on my laptop,...",Service Request,high


In [3]:
df = df.dropna(subset=["text", "category"])
df = df[df["text"].str.strip() != ""]


In [4]:
df["category"] = df["category"].astype(str).str.strip()

df["category"].value_counts()


category
Service Request         1679
Account/Access Issue    1669
Hardware Issue          1549
Security                1548
Network Problem         1403
Software Bug            1359
Other                   1193
Name: count, dtype: int64

In [5]:
category_mapping = {
    "hardware": "Hardware Issue",
    "hardware issue": "Hardware Issue",
    "service": "Service Request",
    "service request": "Service Request",
    "access": "Account/Access Issue",
    "account issue": "Account/Access Issue"
}

df["category"] = df["category"].str.lower().replace(category_mapping)


In [6]:
df["category"].value_counts()

category
Service Request         1679
account/access issue    1669
Hardware Issue          1549
security                1548
network problem         1403
software bug            1359
other                   1193
Name: count, dtype: int64

In [7]:
import sys
from pathlib import Path

sys.path.append(str(Path("..") / "scripts"))

from clean_text import clean_text

In [9]:
df["clean_text"] = df["text"].apply(clean_text)
df.head()

KeyboardInterrupt: 

In [None]:
import sys
from pathlib import Path

sys.path.append(str(Path("..") / "scripts"))

from clean_text import clean_text   # <-- underscore, no space

df["clean_text"] = df["text"].apply(clean_text)
df.head()


In [None]:
df["category"] = df["category"].str.strip().str.lower()

mapping = {
    "hardware issue": "Hardware Issue",
    "service request": "Service Request",
    "account/access issue": "Account/Access Issue",
    "access issue": "Account/Access Issue",
}

df["category"] = df["category"].replace(mapping)
df["category"].value_counts()
