In [1]:
import pandas as pd

# Load the dataset
file_path = 'customer_support_tickets.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Ticket ID           8469 non-null   int64 
 1   Customer Name       8469 non-null   object
 2   Ticket Type         8469 non-null   object
 3   Ticket Description  8469 non-null   object
 4   Ticket Priority     8469 non-null   object
dtypes: int64(1), object(4)
memory usage: 330.9+ KB


(   Ticket ID        Customer Name      Ticket Type  \
 0          1        Marisa Obrien  Technical issue   
 1          2         Jessica Rios  Technical issue   
 2          3  Christopher Robbins  Technical issue   
 3          4     Christina Dillon  Billing inquiry   
 4          5    Alexander Carroll  Billing inquiry   
 
                                   Ticket Description Ticket Priority  
 0  I'm having an issue with the {product_purchase...        Critical  
 1  I'm having an issue with the {product_purchase...        Critical  
 2  I'm facing a problem with my {product_purchase...             Low  
 3  I'm having an issue with the {product_purchase...             Low  
 4  I'm having an issue with the {product_purchase...             Low  ,
 None)

In [2]:
# Drop rows with missing values
data_cleaned = data.dropna()

# Check the cleaned data
data_cleaned.info(), data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Ticket ID           8469 non-null   int64 
 1   Customer Name       8469 non-null   object
 2   Ticket Type         8469 non-null   object
 3   Ticket Description  8469 non-null   object
 4   Ticket Priority     8469 non-null   object
dtypes: int64(1), object(4)
memory usage: 330.9+ KB


(None,
    Ticket ID        Customer Name      Ticket Type  \
 0          1        Marisa Obrien  Technical issue   
 1          2         Jessica Rios  Technical issue   
 2          3  Christopher Robbins  Technical issue   
 3          4     Christina Dillon  Billing inquiry   
 4          5    Alexander Carroll  Billing inquiry   
 
                                   Ticket Description Ticket Priority  
 0  I'm having an issue with the {product_purchase...        Critical  
 1  I'm having an issue with the {product_purchase...        Critical  
 2  I'm facing a problem with my {product_purchase...             Low  
 3  I'm having an issue with the {product_purchase...             Low  
 4  I'm having an issue with the {product_purchase...             Low  )

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import re

In [4]:
# Text preprocessing: Remove special characters and tokenize
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    return text

# Apply preprocessing to the 'Complaints' column
data_cleaned['Ticket Description'] = data_cleaned['Ticket Description'].apply(preprocess_text)

# Split the data into features (X) and target (y)
X = data_cleaned['Ticket Description']
y = data_cleaned['Ticket Type']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a pipeline with TF-IDF and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  # TF-IDF Vectorizer
    ('clf', LogisticRegression())  # Logistic Regression Classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

(0.19421487603305784,
 '                      precision    recall  f1-score   support\n\n     Billing inquiry       0.18      0.13      0.15       357\nCancellation request       0.18      0.18      0.18       327\n     Product inquiry       0.18      0.17      0.17       316\n      Refund request       0.20      0.24      0.22       345\n     Technical issue       0.23      0.25      0.24       349\n\n            accuracy                           0.19      1694\n           macro avg       0.19      0.19      0.19      1694\n        weighted avg       0.19      0.19      0.19      1694\n')

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define a pipeline with TF-IDF and Random Forest Classifier
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', RandomForestClassifier(random_state=42))
])

# Set up the parameter grid for hyperparameter tuning
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# Set up the GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline_rf, param_grid, cv=3, n_jobs=-1, verbose=2)

# Train the model with grid search
grid_search.fit(X_train, y_train)

# Get the best model after tuning
best_model = grid_search.best_estimator_

# Predict on the test set using the best model
y_pred_rf = best_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

accuracy_rf, report_rf, grid_search.best_params_



Fitting 3 folds for each of 108 candidates, totalling 324 fits


(0.2077922077922078,
 '                      precision    recall  f1-score   support\n\n     Billing inquiry       0.30      0.02      0.04       357\nCancellation request       0.18      0.11      0.14       327\n     Product inquiry       0.18      0.06      0.09       316\n      Refund request       0.21      0.50      0.30       345\n     Technical issue       0.21      0.34      0.26       349\n\n            accuracy                           0.21      1694\n           macro avg       0.22      0.21      0.16      1694\n        weighted avg       0.22      0.21      0.17      1694\n',
 {'clf__max_depth': 10,
  'clf__min_samples_leaf': 2,
  'clf__min_samples_split': 2,
  'clf__n_estimators': 300})

In [6]:
import pickle
# Assuming 'model' is your trained model
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [7]:
with open('best_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [None]:
def predict_category(inquiry, model=best_model):
  """Predicts the category of a user's inquiry using the trained model."""
  preprocessed_inquiry = preprocess_text(inquiry)
  predicted_category = model.predict([preprocessed_inquiry])[0]
  return predicted_category


# Get user input
user_inquiry = input("Please enter your inquiry: ")

# Predict the category
predicted_category = predict_category(user_inquiry)

# Output the predicted category
print("Predicted Category:", predicted_category)

In [None]:
!pip install streamlit

In [None]:
!pip install --upgrade pip
!pip install streamlit


In [None]:
import sys
!{sys.executable} -m pip install streamlit


Collecting streamlit


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-18.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<7,>=2.1.5 (f

In [None]:
import streamlit as st
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import pickle

# Save the best model
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
# Assuming `tfidf_vectorizer` is the TF-IDF vectorizer used in your pipeline
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(pipeline.named_steps['tfidf'], file)

In [None]:
# Load the saved model
with open('best_model.pkl', 'rb') as file:
    model = pickle.load(file)

# Load the vectorizer if saved separately
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

In [None]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text
