In [3]:
import sys
from pathlib import Path

# current notebook path
current_path = Path().resolve()
print("Notebook path:", current_path)

# project root (one level UP from notebooks)
project_root = current_path.parents[0]

print("Project root:", project_root)

sys.path.insert(0, str(project_root))


Notebook path: C:\Users\zebaf\Documents\AI-Ticket-Project\notebooks
Project root: C:\Users\zebaf\Documents\AI-Ticket-Project


In [8]:
import pandas as pd
import numpy as np
from pathlib import Path

# Path to your UTF-8 dataset
data_path = Path("..") / "data" / "raw" / "final_dataset_utf8.csv"

# Try UTF-8 first, fall back to Windows-friendly encoding if needed
try:
    df = pd.read_csv(data_path, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(data_path, encoding="latin1")  # or "cp1252"

print(f"Loaded dataset from: {data_path}")
print(f"Shape of dataset: {df.shape}")
print("Columns:", df.columns.tolist())

df.head()


Loaded dataset from: ..\data\raw\final_dataset_utf8.csv
Shape of dataset: (10400, 4)
Columns: ['SNO', 'text', 'category', 'priority']


Unnamed: 0,SNO,text,category,priority
0,1,I can't log into WiFi from my tablet. It retur...,Hardware Issue,critical
1,2,Database keeps crashing on my printer. Code ER...,Hardware Issue,high
2,3,Outlook fails to load on my laptop showing ERR42.,Hardware Issue,high
3,4,"I am unable to access Zoom on my desktop, it s...",Account/Access Issue,high
4,5,"I am unable to access CRM Portal on my laptop,...",Service Request,high


In [9]:
df = df.dropna(subset=["text", "category"])
df = df[df["text"].str.strip() != ""]


In [10]:
df["category"] = df["category"].astype(str).str.strip()

df["category"].value_counts()


category
Service Request         1679
Account/Access Issue    1669
Hardware Issue          1549
Security                1548
Network Problem         1403
Software Bug            1359
Other                   1193
Name: count, dtype: int64

In [11]:
category_mapping = {
    "hardware": "Hardware Issue",
    "hardware issue": "Hardware Issue",
    "service": "Service Request",
    "service request": "Service Request",
    "access": "Account/Access Issue",
    "account issue": "Account/Access Issue"
}

df["category"] = df["category"].str.lower().replace(category_mapping)


In [6]:
df["category"].value_counts()

category
Service Request         1679
account/access issue    1669
Hardware Issue          1549
security                1548
network problem         1403
software bug            1359
other                   1193
Name: count, dtype: int64

In [7]:
import sys
from pathlib import Path

sys.path.append(str(Path("..") / "scripts"))

from clean_text import clean_text

In [13]:
df.columns.tolist()

['SNO', 'text', 'category', 'priority']

In [14]:
df['text_clean'] = df['text'].apply(clean_text)

In [15]:
df.head()

Unnamed: 0,SNO,text,category,priority,text_clean
0,1,I can't log into WiFi from my tablet. It retur...,Hardware Issue,critical,t log wifi tablet return err
1,2,Database keeps crashing on my printer. Code ER...,Hardware Issue,high,database keep crash printer code err
2,3,Outlook fails to load on my laptop showing ERR42.,Hardware Issue,high,outlook fail load laptop show err
3,4,"I am unable to access Zoom on my desktop, it s...",account/access issue,high,unable access zoom desktop show error err
4,5,"I am unable to access CRM Portal on my laptop,...",Service Request,high,unable access crm portal laptop show error x


In [None]:
import sys
from pathlib import Path

sys.path.append(str(Path("..") / "scripts"))

from clean_text import clean_text   # <-- underscore, no space

df["clean_text"] = df["text"].apply(clean_text)
df.head()


In [None]:
df["category"] = df["category"].str.strip().str.lower()

mapping = {
    "hardware issue": "Hardware Issue",
    "service request": "Service Request",
    "account/access issue": "Account/Access Issue",
    "access issue": "Account/Access Issue",
}

df["category"] = df["category"].replace(mapping)
df["category"].value_counts()


In [None]:
df.to_csv("../data/cleaned/cleaned_dataset.csv", index=False)

In [17]:
df['category'].value_counts()

category
Service Request         1679
account/access issue    1669
Hardware Issue          1549
security                1548
network problem         1403
software bug            1359
other                   1193
Name: count, dtype: int64

In [1]:
import pandas as pd

df = pd.read_csv("../data/raw/final_dataset_utf8.csv")


In [2]:
df["category"].value_counts()


category
Service Request         1679
Account/Access Issue    1669
Hardware Issue          1549
Security                1548
Network Problem         1403
Software Bug            1359
Other                   1193
Name: count, dtype: int64

In [3]:
CATEGORY_MAP = {
    "Network Problem": "Network",

    "Hardware Issue": "Hardware",

    "Software Bug": "Software",

    "Account/Access Issue": "Account",

    "Security": "Account",          # security â‰ˆ access/auth

    "Service Request": "Other",     # too broad

    "Other": "Other"
}

df["category"] = df["category"].map(CATEGORY_MAP)


In [4]:
df["category"].value_counts()


category
Account     3217
Other       2872
Hardware    1549
Network     1403
Software    1359
Name: count, dtype: int64

In [5]:
df.to_csv("../data/cleaned/final_dataset_cleaned.csv", index=False)


In [6]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["category"],
    random_state=42
)

train.to_csv("../data/splits/train.csv", index=False)
test.to_csv("../data/splits/test.csv", index=False)


In [4]:
from scripts.clean_text import clean_text


In [5]:
clean_text("My office WiFi keeps disconnecting")


'my office wifi keeps disconnecting'

In [7]:
import pandas as pd


In [8]:
df = pd.read_csv("../data/raw/final_dataset_utf8.csv")


In [9]:
df.head()


Unnamed: 0,SNO,text,category,priority
0,1,I can't log into WiFi from my tablet. It retur...,Hardware Issue,critical
1,2,Database keeps crashing on my printer. Code ER...,Hardware Issue,high
2,3,Outlook fails to load on my laptop showing ERR42.,Hardware Issue,high
3,4,"I am unable to access Zoom on my desktop, it s...",Account/Access Issue,high
4,5,"I am unable to access CRM Portal on my laptop,...",Service Request,high


In [10]:
df["text_clean"] = df["text"].astype(str).apply(clean_text)


In [11]:
df[["text", "text_clean"]].head()


Unnamed: 0,text,text_clean
0,I can't log into WiFi from my tablet. It retur...,i can t log into wifi from my tablet it return...
1,Database keeps crashing on my printer. Code ER...,database keeps crashing on my printer code err
2,Outlook fails to load on my laptop showing ERR42.,outlook fails to load on my laptop showing err
3,"I am unable to access Zoom on my desktop, it s...",i am unable to access zoom on my desktop it sh...
4,"I am unable to access CRM Portal on my laptop,...",i am unable to access crm portal on my laptop ...


In [12]:
df["category"].value_counts()


category
Service Request         1679
Account/Access Issue    1669
Hardware Issue          1549
Security                1548
Network Problem         1403
Software Bug            1359
Other                   1193
Name: count, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["category"],
    random_state=42
)

train.to_csv("../data/splits/train.csv", index=False)
test.to_csv("../data/splits/test.csv", index=False)


In [14]:
category_map = {
    "Hardware Issue": "Hardware",
    "Network Problem": "Network",
    "Software Bug": "Software",
    "Account/Access Issue": "Access",
    "Service Request": "Software",
    "Security": "Security",
    "Other": "Other"
}

df["category"] = df["category"].map(category_map)


FileNotFoundError: [Errno 2] No such file or directory: 'data/cleaned/final_dataset_cleaned.csv'