In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


import re
import warnings
warnings.filterwarnings('ignore')

## This is primarily a text classification problem

In [2]:
calls_path="/kaggle/input/textclass/calls.csv"
reasons_path="/kaggle/input/textclass/reason.csv"
test_path="/kaggle/input/textclass/test.csv"

In [3]:
df_calls=pd.read_csv(calls_path)
df_reasons=pd.read_csv(reasons_path)
df_test=pd.read_csv(test_path)

In [4]:
df_reasons["primary_call_reason"] = (
    df_reasons["primary_call_reason"]
    .str.strip()
    .str.replace(r"\s+|-|&|and", " ", regex=True)
)

# Now, let's check the unique cleaned categories
uniques = np.unique(df_reasons["primary_call_reason"])
print(f'Number of unique primary call reasons: {len(uniques)}')
print('Unique primary call reasons:', uniques)

Number of unique primary call reasons: 20
Unique primary call reasons: ['Baggage' 'Booking' 'Check In' 'Checkout' 'Communications'
 'Digital Support' 'Disability' 'ETC' 'IRROPS' 'Mileage Plus'
 'Other Topics' 'Post Flight' 'Products   Services' 'Schedule Change'
 'Seating' 'Traveler Updates' 'Unaccompanied Minor' 'Upgrade'
 'Voluntary Cancel' 'Voluntary Change']


In [5]:
df_calls = df_calls[['call_id', 'call_transcript']]
df_calls

Unnamed: 0,call_id,call_transcript
0,4667960400,\n\nAgent: Thank you for calling United Airlin...
1,1122072124,\n\nAgent: Thank you for calling United Airlin...
2,6834291559,\n\nAgent: Thank you for calling United Airlin...
3,2266439882,\n\nAgent: Thank you for calling United Airlin...
4,1211603231,\n\nAgent: Thank you for calling United Airlin...
...,...,...
71805,1563273072,\n\nAgent: Thank you for calling United Airlin...
71806,8865997781,\n\nAgent: Thank you for calling United Airlin...
71807,8019240181,\n\nAgent: Thank you for calling United Airlin...
71808,8332067080,\n\nAgent: Thank you for calling United Airlin...


In [6]:
df_reasons[:5]

Unnamed: 0,call_id,primary_call_reason
0,4667960400,Voluntary Cancel
1,1122072124,Booking
2,6834291559,IRROPS
3,2266439882,Upgrade
4,1211603231,Seating


In [7]:
df_test[:5]

Unnamed: 0,call_id
0,7732610078
1,2400299738
2,6533095063
3,7774450920
4,9214147168


In [8]:
print(len(df_calls),len(df_reasons),len(df_test))

71810 66653 5157


In [9]:
a=len(df_calls)
b=len(df_reasons)
c=len(df_test)

a==b+c

True

## Text preprocessing on transcript data

In [17]:
# Simplified preprocessing function
def preprocess_transcript(transcript):
    # Convert to lowercase
    transcript = transcript.lower()
    
    # Remove agent/customer role mentions
    transcript = re.sub(r'(agent:|customer:)', '', transcript)
    
    # Remove non-verbal sounds or actions enclosed in asterisks
    transcript = re.sub(r'\*.*?\*', '', transcript)
    
    # Remove extra newlines and trim leading/trailing spaces
    transcript = transcript.replace('\n', ' ').strip()
    
    # Remove multiple spaces
    transcript = re.sub(r'\s+', ' ', transcript)
    
    # Remove punctuation
    transcript = re.sub(r'[^\w\s]', '', transcript)
    
    # Tokenize the text
    tokens = word_tokenize(transcript)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Join tokens back into a string
    return ' '.join(tokens)

### Inner Join `df_calls` and `df_reasons` based on `call_id` column to create train split

In [13]:
df_train = pd.merge(df_calls, df_reasons, on='call_id', how='inner')
df_train[:]

Unnamed: 0,call_id,call_transcript,primary_call_reason
0,4667960400,thank you for calling united airlines customer...,Voluntary Cancel
1,1122072124,"thank you for calling united airlines, my name...",Booking
2,6834291559,thank you for calling united airlines customer...,IRROPS
3,2266439882,thank you for calling united airlines customer...,Upgrade
4,1211603231,thank you for calling united airlines customer...,Seating
...,...,...,...
66648,7569738090,thank you for calling united airlines customer...,Mileage Plus
66649,1563273072,thank you for calling united airlines customer...,Post Flight
66650,8865997781,thank you for calling united airlines customer...,Upgrade
66651,8019240181,thank you for calling united airlines customer...,Upgrade


### Inner Join `df_test` and `df_calls` based on `call_id` column to create test split

In [14]:
df_test = pd.merge(df_test, df_calls, on='call_id', how='inner')
df_test

Unnamed: 0,call_id,call_transcript
0,7732610078,thank you for calling united airlines customer...
1,2400299738,"thank you for calling united airlines, my name..."
2,6533095063,thank you for calling united airlines customer...
3,7774450920,"thank you for calling united airlines, this is..."
4,9214147168,thank you for calling united airlines customer...
...,...,...
5152,5300201106,thank you for calling united airlines customer...
5153,727694488,"thank you for calling united airlines, my name..."
5154,147487837,thank you for calling united airlines customer...
5155,5330794838,"thank you for calling united airlines, my name..."


### As we can see here, we need to use `call_transcript` data for each row/record in df_test to predict `primary_call_reason` for each row

### Hence this is a text classification problem, next we'll label encode primary_call_reason to start classification process

In [15]:
# Initialize the Label Encoder
label_encoder = LabelEncoder()

# Fit and transform 'primary_call_reason' column into numerical labels
df_train['label'] = label_encoder.fit_transform(df_train['primary_call_reason'])

# Display the first few rows with encoded labels
df_train[['primary_call_reason', 'label']]

Unnamed: 0,primary_call_reason,label
0,Voluntary Cancel,18
1,Booking,1
2,IRROPS,8
3,Upgrade,17
4,Seating,14
...,...,...
66648,Mileage Plus,9
66649,Post Flight,11
66650,Upgrade,17
66651,Upgrade,17


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
# Apply preprocessing to train and test data
df_train['preprocessed_transcript'] = df_train['call_transcript'].apply(preprocess_transcript)
df_test['preprocessed_transcript'] = df_test['call_transcript'].apply(preprocess_transcript)

In [19]:
# Encode labels
label_encoder = LabelEncoder()
df_train['label'] = label_encoder.fit_transform(df_train['primary_call_reason'])

In [20]:
# Split the training data
X_train, X_val, y_train, y_val = train_test_split(df_train['preprocessed_transcript'], df_train['label'], test_size=0.2, random_state=42)

In [21]:
# Initialize vectorizers
bow_vectorizer = CountVectorizer(max_features=5000)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [22]:
# Fit and transform the training data
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [23]:
# Transform the validation data
X_val_bow = bow_vectorizer.transform(X_val)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

In [24]:
# Initialize classifiers
nb_bow = MultinomialNB()
nb_tfidf = MultinomialNB()
lr_bow = LogisticRegression(random_state=42, max_iter=1000)
lr_tfidf = LogisticRegression(random_state=42, max_iter=1000)
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)

In [34]:
# Train classifiers
nb_bow.fit(X_train_bow, y_train)

In [35]:
nb_tfidf.fit(X_train_tfidf, y_train)

In [36]:
lr_bow.fit(X_train_bow, y_train)

In [37]:
lr_tfidf.fit(X_train_tfidf, y_train)

In [42]:
rf_bow.fit(X_train_bow, y_train)

In [43]:
rf_tfidf.fit(X_train_tfidf, y_train)

In [26]:
# Evaluate classifiers
def evaluate_classifier(clf, X, y, name):
    y_pred = clf.predict(X)
    print(f"{name} Accuracy: {accuracy_score(y, y_pred):.4f}")
    print(f"{name} Classification Report:")
    print(classification_report(y, y_pred, target_names=label_encoder.classes_))

In [27]:
print("Evaluation on Validation Set:")
evaluate_classifier(nb_bow, X_val_bow, y_val, "Naive Bayes (BoW)")
evaluate_classifier(nb_tfidf, X_val_tfidf, y_val, "Naive Bayes (TF-IDF)")
evaluate_classifier(lr_bow, X_val_bow, y_val, "Logistic Regression (BoW)")
evaluate_classifier(lr_tfidf, X_val_tfidf, y_val, "Logistic Regression (TF-IDF)")
evaluate_classifier(rf_bow, X_val_bow, y_val, "Random Forest (BoW)")
evaluate_classifier(rf_tfidf, X_val_tfidf, y_val, "Random Forest (TF-IDF)")

Evaluation on Validation Set:
Naive Bayes (BoW) Accuracy: 0.1148
Naive Bayes (BoW) Classification Report:
                     precision    recall  f1-score   support

            Baggage       0.07      0.15      0.09       604
            Booking       0.05      0.12      0.07       513
           Check In       0.03      0.01      0.01       359
           Checkout       0.03      0.01      0.02       384
     Communications       0.06      0.04      0.05       757
    Digital Support       0.02      0.04      0.03       255
         Disability       0.02      0.01      0.01        86
                ETC       0.02      0.06      0.03       197
             IRROPS       0.25      0.23      0.24      2763
       Mileage Plus       0.11      0.14      0.12      1130
       Other Topics       0.02      0.05      0.02       174
        Post Flight       0.09      0.10      0.10       848
Products   Services       0.06      0.06      0.06       658
    Schedule Change       0.03      0.0

In [28]:
best_model = rf_tfidf
best_vectorizer = tfidf_vectorizer

In [29]:
# Predict on test set
X_test_vec = best_vectorizer.transform(df_test['preprocessed_transcript'])
y_test_pred = best_model.predict(X_test_vec)

In [30]:
# Add predictions to df_test
df_test['predicted_primary_call_reason'] = label_encoder.inverse_transform(y_test_pred)

In [31]:
# Display the first few rows of df_test with predictions
print("\nFirst few rows of df_test with predictions:")
print(df_test[['call_id', 'predicted_primary_call_reason']].head())


First few rows of df_test with predictions:
      call_id predicted_primary_call_reason
0  7732610078                        IRROPS
1  2400299738                        IRROPS
2  6533095063                        IRROPS
3  7774450920                        IRROPS
4  9214147168                        IRROPS


In [32]:
# Save the updated df_test to a CSV file
df_test.to_csv('df_test_with_predictions.csv', index=False)
print("\nUpdated df_test saved to 'df_test_with_predictions.csv'")


Updated df_test saved to 'df_test_with_predictions.csv'


### These results aren't good enough, since we barely go above 20% accuracy in all of these text classification models, so we need to try using transformers based text classifiers like BERT, DistilBERT etc instead in hopes for better prediction performance