1. Imports & Setup

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Reacher\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

2. Load Dataset

In [8]:
df = pd.read_csv("it_tickets_dataset.csv")
df.shape


(28587, 16)

3. Keep English Only

In [9]:
df = df[df['language'] == 'en'].copy()
df.shape


(16338, 16)

4. Prepare Text

In [10]:
df['subject'] = df['subject'].fillna("")
df['text'] = (df['subject'] + " " + df['body']).str.lower()


5. Base Sentiment Scoring (VADER)

In [11]:
sid = SentimentIntensityAnalyzer()

df['vader_score'] = df['text'].apply(
    lambda x: sid.polarity_scores(x)['compound']
)


6. Keyword-Based Sentiment Adjustment

In [12]:
negative_keywords = [
    'error', 'failed', 'not working', 'down', 'crash',
    'issue', 'problem', 'unable', 'broken'
]

positive_keywords = [
    'thanks', 'thank you', 'resolved', 'fixed',
    'working now', 'appreciate'
]

def adjust_score(text, score):
    for word in negative_keywords:
        if word in text:
            score -= 0.2
    for word in positive_keywords:
        if word in text:
            score += 0.2
    return score

df['adjusted_score'] = df.apply(
    lambda x: adjust_score(x['text'], x['vader_score']),
    axis=1
)


7. Create Sentiment Labels

In [13]:
def label_sentiment(score):
    if score <= -0.3:
        return "Negative"
    elif score >= 0.3:
        return "Positive"
    else:
        return "Neutral"

df['sentiment'] = df['adjusted_score'].apply(label_sentiment)


8. Urgency Label Mapping

In [14]:
priority_to_urgency = {
    'low': 'Low',
    'normal': 'Medium',
    'medium': 'Medium',
    'high': 'High',
    'urgent': 'High',
    'critical': 'High'
}

df['urgency'] = df['priority'].str.lower().map(priority_to_urgency)
df = df.dropna(subset=['urgency'])


9. EDA: Sentiment Distribution

In [15]:
df['sentiment'].value_counts()

sentiment
Positive    10634
Negative     3179
Neutral      2525
Name: count, dtype: int64

In [16]:
df['sentiment'].value_counts(normalize=True) * 100


sentiment
Positive    65.087526
Negative    19.457706
Neutral     15.454768
Name: proportion, dtype: float64

10. Text Length Analysis

In [17]:
df['urgency'].value_counts()


urgency
Medium    6618
High      6346
Low       3374
Name: count, dtype: int64

In [18]:
df['urgency'].value_counts(normalize=True) * 100

urgency
Medium    40.506794
High      38.841964
Low       20.651243
Name: proportion, dtype: float64

11. Text Length Analysis

In [19]:
df['text_length'] = df['text'].str.len()

df['text_length'].describe()



count    16338.000000
mean       405.538683
std        179.688949
min         19.000000
25%        255.000000
50%        416.000000
75%        566.000000
max       1189.000000
Name: text_length, dtype: float64

In [20]:
df.groupby('sentiment')['text_length'].mean()

sentiment
Negative    317.465555
Neutral     328.747327
Positive    450.101655
Name: text_length, dtype: float64

12. Sentiment vs Urgency Crosstab

In [21]:
pd.crosstab(df['sentiment'], df['urgency'], normalize='index') * 100


urgency,High,Low,Medium
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,44.416483,16.703366,38.880151
Neutral,41.108911,18.376238,40.514851
Positive,36.637201,22.371638,40.99116


13. Common Words (NLP Insight)

In [22]:
vectorizer = CountVectorizer(stop_words='english', max_features=20)
X = vectorizer.fit_transform(df['text'])

vectorizer.get_feature_names_out()


array(['analytics', 'appreciate', 'assistance', 'customer', 'data',
       'digital', 'information', 'integration', 'investment', 'issue',
       'issues', 'marketing', 'medical', 'problem', 'project', 'provide',
       'security', 'software', 'support', 'tools'], dtype=object)

14. Final Sanity Check

In [23]:
df[['text', 'sentiment', 'urgency']].isna().sum()

text         0
sentiment    0
urgency      0
dtype: int64

15. Final Sanity Check

In [24]:
df.to_csv("english_tickets_sentiment_urgency_eda.csv", index=False)

16. Top Words in Negative Tickets

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

neg_df = df[df['sentiment'] == 'Negative'] # Check for Positive and Neutral
vectorizer = CountVectorizer(stop_words='english', max_features=20)
X_neg = vectorizer.fit_transform(neg_df['text'])

vectorizer.get_feature_names_out()


array(['analytics', 'assistance', 'data', 'devices', 'digital',
       'integration', 'investment', 'issue', 'issues', 'marketing',
       'network', 'problem', 'problems', 'project', 'recent', 'resolve',
       'restarting', 'software', 'support', 'updates'], dtype=object)

17. Select Features & Labels

In [26]:
X = df['text']
y = df['sentiment']

18. Train / Test Split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

19. TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
    ngram_range=(1, 2)
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

20. Train Logistic Regression Modelm

In [None]:
from sklearn.linear_model import LogisticRegression

sentiment_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)

sentiment_model.fit(X_train_tfidf, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


21. Predictions

In [None]:
y_pred = sentiment_model.predict(X_test_tfidf)

22. Evaluation Metrics

In [31]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.77      0.81      0.79       636
     Neutral       0.46      0.65      0.54       505
    Positive       0.97      0.86      0.91      2127

    accuracy                           0.81      3268
   macro avg       0.73      0.77      0.74      3268
weighted avg       0.85      0.81      0.83      3268



23. Confusion Matrix

In [32]:
import pandas as pd

cm = confusion_matrix(y_test, y_pred, labels=sentiment_model.classes_)
pd.DataFrame(cm, index=sentiment_model.classes_, columns=sentiment_model.classes_)


Unnamed: 0,Negative,Neutral,Positive
Negative,514,115,7
Neutral,123,328,54
Positive,33,275,1819


24. Save Model & Vectorizer

In [33]:
import joblib

joblib.dump(sentiment_model, "sentiment_model.pkl")
joblib.dump(tfidf, "sentiment_tfidf.pkl")


['sentiment_tfidf.pkl']

URGENCY CLASSIFICATION — MODEL TRAINING

25. Select Features & Labels

In [None]:
X = df['text']
y = df['urgency']

26. Train / Test Split

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


27. TF-IDF Vectorization

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_urgency = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
    ngram_range=(1, 2)
)

X_train_tfidf = tfidf_urgency.fit_transform(X_train)
X_test_tfidf = tfidf_urgency.transform(X_test)


28. Train Logistic Regression Model

In [37]:
from sklearn.linear_model import LogisticRegression

urgency_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)

urgency_model.fit(X_train_tfidf, y_train)


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


29. Predictions

In [38]:
y_pred = urgency_model.predict(X_test_tfidf)


30. Evaluation Metrics

In [39]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        High       0.66      0.62      0.64      1269
         Low       0.48      0.59      0.53       675
      Medium       0.61      0.56      0.58      1324

    accuracy                           0.59      3268
   macro avg       0.58      0.59      0.58      3268
weighted avg       0.60      0.59      0.59      3268



31. Confusion Matrix

In [40]:

cm = confusion_matrix(y_test, y_pred, labels=urgency_model.classes_)
pd.DataFrame(cm, index=urgency_model.classes_, columns=urgency_model.classes_)

Unnamed: 0,High,Low,Medium
High,787,173,309
Low,97,401,177
Medium,311,265,748


32. Save Model & Vectorizer

In [41]:
joblib.dump(urgency_model, "urgency_model.pkl")
joblib.dump(tfidf_urgency, "urgency_tfidf.pkl")

['urgency_tfidf.pkl']

COMBINED PRIORITY SCORING SYSTEM

33. Load Saved Models

In [42]:

sentiment_model = joblib.load("sentiment_model.pkl")
sentiment_tfidf = joblib.load("sentiment_tfidf.pkl")

urgency_model = joblib.load("urgency_model.pkl")
urgency_tfidf = joblib.load("urgency_tfidf.pkl")

34. Prediction Functions

In [43]:
def predict_sentiment(text):
    X = sentiment_tfidf.transform([text.lower()])
    return sentiment_model.predict(X)[0]

def predict_urgency(text):
    X = urgency_tfidf.transform([text.lower()])
    return urgency_model.predict(X)[0]


35. Define Priority Weights

In [None]:
sentiment_weight = {
    'Negative': 2,
    'Neutral': 1,
    'Positive': 0
}

urgency_weight = {
    'High': 3,
    'Medium': 2,
    'Low': 1
}


36. Final Priority Score Function

In [45]:
def compute_priority(text):
    sent = predict_sentiment(text)
    urg = predict_urgency(text)

    score = sentiment_weight[sent] + urgency_weight[urg]

    if score >= 4:
        final_priority = "Critical"
    elif score == 3:
        final_priority = "High"
    elif score == 2:
        final_priority = "Medium"
    else:
        final_priority = "Low"

    return {
        "Sentiment": sent,
        "Urgency": urg,
        "Priority Score": score,
        "Final Priority": final_priority
    }


37. Test on Sample Tickets

In [46]:
sample_tickets = [
    "Server is down and users cannot access the system",
    "Please reset my password",
    "Thanks, issue has been resolved quickly",
    "Application is slow and intermittently failing"
]

for ticket in sample_tickets:
    print(ticket)
    print(compute_priority(ticket))
    print("-" * 50)


Server is down and users cannot access the system
{'Sentiment': 'Neutral', 'Urgency': 'Medium', 'Priority Score': 3, 'Final Priority': 'High'}
--------------------------------------------------
Please reset my password
{'Sentiment': 'Neutral', 'Urgency': 'High', 'Priority Score': 4, 'Final Priority': 'Critical'}
--------------------------------------------------
Thanks, issue has been resolved quickly
{'Sentiment': 'Neutral', 'Urgency': 'Medium', 'Priority Score': 3, 'Final Priority': 'High'}
--------------------------------------------------
Application is slow and intermittently failing
{'Sentiment': 'Neutral', 'Urgency': 'Low', 'Priority Score': 2, 'Final Priority': 'Medium'}
--------------------------------------------------
