In [18]:
import pandas as pd, numpy as np
import re, string, json
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Load the raw ticket export into `df` and print its schema and null counts.
# Only the final expression of a cell is rendered, so a preview such as
# `df.head()` belongs in its own cell; a mid-cell call is silently discarded.
df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xlsx")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ticket_id      1000 non-null   int64 
 1   ticket_text    945 non-null    object
 2   issue_type     924 non-null    object
 3   urgency_level  948 non-null    object
 4   product        1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [20]:
# Make sure the NLTK corpora used for text cleaning are available locally.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers read by preprocess(): a WordNet lemmatizer and the English
# stopword list as a set (fast membership tests).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Built once: the punctuation-deleting translation table is identical for
# every ticket, so there is no need to rebuild it on each call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise one ticket's text for vectorisation.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation,
    split on whitespace, drop English stopwords, lemmatize what remains.
    Relies on the module-level ``stop_words`` set and ``lemmatizer`` object.

    Parameters
    ----------
    text : str or missing value
        Raw ticket text. ``None`` / ``NaN`` / ``pd.NA`` are accepted.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when ``text`` is
        missing or nothing survives cleaning.
    """
    # str(NaN) would yield the literal word "nan", which would then be treated
    # as a real token; a missing ticket should produce no tokens at all.
    if pd.isna(text):
        return ''

    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCT_TABLE)
    # Stopwords are filtered on the surface form, before lemmatization.
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean every ticket with preprocess(). This runs before incomplete rows are
# dropped, so missing ticket_text values are passed to preprocess() as well;
# those rows are removed by the later dropna.
df['clean_text'] = df['ticket_text'].map(preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
# Keep only complete tickets: drop any row that is missing the text, either
# target label (issue_type, urgency_level) or the product. This mutates `df`
# in place and does not reset the index, so the index has gaps afterwards.
df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level', 'product'], inplace=True)

In [22]:
# TF-IDF features over the cleaned text, with the vocabulary capped at 500 terms.
# NOTE(review): the vectorizer is fitted on all remaining rows, before the
# train/test split further down, so the vocabulary and IDF weights also
# reflect the held-out rows. Consider fitting on the training rows only and
# calling transform() on the test rows.
tfidf = TfidfVectorizer(max_features=500)
X_tfidf = tfidf.fit_transform(df['clean_text'])

In [23]:
def _polarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity


# Two hand-crafted features taken from the raw (uncleaned) ticket text:
# its character count and its sentiment polarity.
df['text_len'] = df['ticket_text'].map(len)
df['sentiment'] = df['ticket_text'].map(_polarity)

# Dense array with one row per ticket, columns ordered (text_len, sentiment).
X_extra = df[['text_len', 'sentiment']].to_numpy()

In [24]:
from scipy.sparse import hstack
# Final feature matrix: the TF-IDF columns first, then the two extra columns
# from X_extra (text_len, sentiment) appended on the right. Anything that
# scores new tickets must rebuild the columns in this same order.
X_combined = hstack([X_tfidf, X_extra])

In [25]:
# Split the features and BOTH label columns in a single call: one shuffle is
# applied to all three arrays, so the issue-type (y1) and urgency (y2) labels
# are row-aligned with X_train / X_test by construction, rather than relying
# on two separate calls repeating the same seed and size.
# 80/20 split; random_state pins the shuffle for reproducibility.
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X_combined,
    df['issue_type'],
    df['urgency_level'],
    test_size=0.2,
    random_state=42,
)

In [26]:
# Issue-type classifier: random forest trained on the combined feature matrix.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, the report below and the pickled model are reproducible on a fresh
# kernel; an unseeded forest differs on every run.
issue_clf = RandomForestClassifier(random_state=42)
issue_clf.fit(X_train, y1_train)

# Per-class precision / recall / F1 on the held-out rows.
y1_pred = issue_clf.predict(X_test)
print(classification_report(y1_test, y1_pred))

                    precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00        23
   Billing Problem       1.00      1.00      1.00        19
   General Inquiry       1.00      1.00      1.00        25
Installation Issue       1.00      1.00      1.00        29
     Late Delivery       1.00      1.00      1.00        17
    Product Defect       1.00      1.00      1.00        30
        Wrong Item       1.00      1.00      1.00        23

          accuracy                           1.00       166
         macro avg       1.00      1.00      1.00       166
      weighted avg       1.00      1.00      1.00       166



In [27]:
# Urgency classifier: logistic regression with the iteration cap raised to 500,
# trained on the same feature split as the issue-type model.
urgency_clf = LogisticRegression(max_iter=500).fit(X_train, y2_train)

# Per-class precision / recall / F1 on the held-out rows.
y2_pred = urgency_clf.predict(X_test)
urgency_report = classification_report(y2_test, y2_pred)
print(urgency_report)

              precision    recall  f1-score   support

        High       0.40      0.36      0.38        66
         Low       0.29      0.35      0.32        43
      Medium       0.33      0.32      0.32        57

    accuracy                           0.34       166
   macro avg       0.34      0.34      0.34       166
weighted avg       0.35      0.34      0.34       166



In [28]:
# Words and phrases treated as complaint indicators; the list is exported to
# complaint_keywords.json at the end of the notebook. 'not working' has two
# words, so presumably consumers match by substring rather than by token
# (TODO confirm against the consuming code).
complaint_keywords = ['broken', 'error', 'not working', 'late', 'damaged', 'failed']
# Numeric dates: 1-2 digits, '/' or '-', 1-2 digits, optionally followed by a
# separator and a 2-4 digit year (e.g. 12/05 or 3-11-2024).
# NOTE(review): not referenced anywhere else in the visible notebook, and
# preprocess() deletes digits, so it would have to be applied to the raw text.
date_pattern = r'\b(?:\d{1,2}[/-]\d{1,2}(?:[/-]\d{2,4})?)\b'

In [29]:
import joblib
# Persist the fitted models and the vectorizer to the working directory.
# NOTE(review): both models were trained on X_combined, i.e. the TF-IDF
# columns followed by text_len and sentiment. Only the vectorizer is saved
# here, so whatever loads these files must recompute those two extra columns
# and append them in the same order before calling predict().
joblib.dump(issue_clf, "issue_model.pkl")
joblib.dump(urgency_clf, "urgency_model.pkl")
# Last expression of the cell: its return value (the list of written
# filenames) is what the cell displays.
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [30]:
# Distinct product names among the retained tickets, in order of first appearance.
product_list = df['product'].dropna().unique().tolist()

# Export the lookup lists as JSON files in the working directory.
json_exports = (
    ("product_list.json", product_list),
    ("complaint_keywords.json", complaint_keywords),
)
for filename, payload in json_exports:
    with open(filename, "w") as f:
        json.dump(payload, f)