In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Read the cleaned ticket dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/cleaned/final_dataset_cleaned.csv")
df.head(5)


Unnamed: 0,SNO,text,category,priority
0,1,I can't log into WiFi from my tablet. It retur...,Hardware,critical
1,2,Database keeps crashing on my printer. Code ER...,Hardware,high
2,3,Outlook fails to load on my laptop showing ERR42.,Hardware,high
3,4,"I am unable to access Zoom on my desktop, it s...",Account,high
4,5,"I am unable to access CRM Portal on my laptop,...",Other,high


In [5]:
# Show the column labels of the loaded frame to confirm which fields are available.
df.columns


Index(['SNO', 'text', 'category', 'priority'], dtype='object')

In [6]:
import sys

# Make the project's helper scripts importable. The membership check keeps this
# cell idempotent: re-running it does not append a duplicate sys.path entry.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
if "../scripts" not in sys.path:
    sys.path.append("../scripts")

from clean_text import clean_text


In [7]:
# Run the cleaning helper over every raw ticket text and store the result
# in a new "clean_text" column next to the original.
df["clean_text"] = df["text"].map(clean_text)

# Side-by-side preview of raw vs. cleaned text.
df.loc[:, ["text", "clean_text"]].head()


Unnamed: 0,text,clean_text
0,I can't log into WiFi from my tablet. It retur...,i can t log into wifi from my tablet it return...
1,Database keeps crashing on my printer. Code ER...,database keeps crashing on my printer code err...
2,Outlook fails to load on my laptop showing ERR42.,outlook fails to load on my laptop showing err42
3,"I am unable to access Zoom on my desktop, it s...",i am unable to access zoom on my desktop it sh...
4,"I am unable to access CRM Portal on my laptop,...",i am unable to access crm portal on my laptop ...


In [8]:
# Features are the cleaned ticket text; the target is the ticket category.
X, y = df["clean_text"], df["category"]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the category
# proportions aligned between the two splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


In [9]:
# TF-IDF over unigrams and bigrams with English stop words removed.
# Terms seen in fewer than 3 documents, or in more than 90% of them, are dropped.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=3,
)

# Learn the vocabulary and IDF weights from the training split only,
# then project the held-out split with that fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [10]:
# Linear SVM over the TF-IDF features. random_state pins the RNG that the dual
# coordinate-descent solver uses to shuffle data, so a Restart & Run All
# reproduces the same fitted model; 42 matches the seed used for the split.
model = LinearSVC(random_state=42)

# Fit on the training split, then predict categories for the held-out split.
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)


In [11]:
# Overall accuracy first, then the per-category precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))


Accuracy: 0.5240384615384616
              precision    recall  f1-score   support

     Account       0.53      0.61      0.57       643
    Hardware       0.47      0.45      0.46       310
     Network       0.54      0.46      0.50       281
       Other       0.53      0.58      0.56       574
    Software       0.53      0.36      0.43       272

    accuracy                           0.52      2080
   macro avg       0.52      0.49      0.50      2080
weighted avg       0.52      0.52      0.52      2080

