In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


import re
import warnings
warnings.filterwarnings('ignore')

## This is primarily a text classification problem, we'll try to solve this using DistilBERT instead of Bag of Words / TF-IDF since the models used contain more general information about the world due to pretraining on real world data and hence usually perform better at text classification

In [2]:
calls_path="/kaggle/input/dtatanks/callsf0d4f5a.csv"
reasons_path="/kaggle/input/dtatanks/reason18315ff.csv"
test_path="/kaggle/input/dtatanks/testbc7185d.csv"

In [3]:
df_calls=pd.read_csv(calls_path)
df_reasons=pd.read_csv(reasons_path)
df_test=pd.read_csv(test_path)

In [4]:
df_calls = df_calls[['call_id', 'call_transcript']]
df_calls

Unnamed: 0,call_id,call_transcript
0,4667960400,\n\nAgent: Thank you for calling United Airlin...
1,1122072124,\n\nAgent: Thank you for calling United Airlin...
2,6834291559,\n\nAgent: Thank you for calling United Airlin...
3,2266439882,\n\nAgent: Thank you for calling United Airlin...
4,1211603231,\n\nAgent: Thank you for calling United Airlin...
...,...,...
71805,1563273072,\n\nAgent: Thank you for calling United Airlin...
71806,8865997781,\n\nAgent: Thank you for calling United Airlin...
71807,8019240181,\n\nAgent: Thank you for calling United Airlin...
71808,8332067080,\n\nAgent: Thank you for calling United Airlin...


In [5]:
df_reasons[:5]

Unnamed: 0,call_id,primary_call_reason
0,4667960400,Voluntary Cancel
1,1122072124,Booking
2,6834291559,IRROPS
3,2266439882,Upgrade
4,1211603231,Seating


In [6]:
df_test[:5]

Unnamed: 0,call_id
0,7732610078
1,2400299738
2,6533095063
3,7774450920
4,9214147168


In [7]:
from collections import Counter

In [8]:
Counter(df_reasons['primary_call_reason'])

Counter({'IRROPS': 13057,
         'Voluntary Change': 10291,
         'Seating': 6223,
         'Mileage Plus': 5487,
         'Post-Flight': 3869,
         'Communications': 3779,
         'Products and Services': 2792,
         'Upgrade': 2682,
         'Baggage': 2616,
         'Booking': 2589,
         'Checkout': 1840,
         'Check-In': 1490,
         'Voluntary Cancel': 1304,
         'Digital Support': 996,
         'ETC': 930,
         'Traveler Updates': 772,
         'Schedule Change': 707,
         'Other Topics': 568,
         'Products & Services': 476,
         'Disability': 394,
         'Check In': 385,
         'Post Flight': 373,
         'Voluntary  Cancel': 278,
         'IRROPS  ': 254,
         'Other  Topics': 234,
         'Voluntary Change  ': 208,
         'Voluntary   Change': 200,
         'Voluntary  Change': 149,
         'Seating  ': 142,
         'Digital   Support': 136,
         'Mileage  Plus': 135,
         'Baggage  ': 130,
         'Mileage   P

### names are repeating, need to preprocess it

In [9]:
df_reasons["primary_call_reason"] = (
    df_reasons["primary_call_reason"]
    .str.strip()
    .str.replace(r"\s+|-|&|and", " ", regex=True)
)

# Now, let's check the unique cleaned categories
uniques = np.unique(df_reasons["primary_call_reason"])
print(f'Number of unique primary call reasons: {len(uniques)}')
print('Unique primary call reasons:', uniques)

Number of unique primary call reasons: 20
Unique primary call reasons: ['Baggage' 'Booking' 'Check In' 'Checkout' 'Communications'
 'Digital Support' 'Disability' 'ETC' 'IRROPS' 'Mileage Plus'
 'Other Topics' 'Post Flight' 'Products   Services' 'Schedule Change'
 'Seating' 'Traveler Updates' 'Unaccompanied Minor' 'Upgrade'
 'Voluntary Cancel' 'Voluntary Change']


In [10]:
print(len(df_calls),len(df_reasons),len(df_test))

71810 66653 5157


In [11]:
a=len(df_calls)
b=len(df_reasons)
c=len(df_test)

a==b+c

True

## Text preprocessing on transcript data

In [12]:
df_calls['call_transcript'][0]

"\n\nAgent: Thank you for calling United Airlines customer service, my name is Sarah how may I help you?\n\nCustomer: Hi, yeah I'm calling because my flight from Chicago to New York was delayed by over 3 hours! This is ridiculous, I'm missing important meetings because of this. \n\nAgent: I'm so sorry to hear about the delay, that's definitely frustrating. Umm, let me pull up your reservation and take a look at what happened. *typing sounds* Okay, it looks like there was severe weather in Chicago that caused multiple flight cancellations and delays across the board for all the airlines. Ahh shoot, yeah your original flight was scheduled to depart at 2pm but didn't actually take off until after 5pm.\n\nCustomer: Ugh this is such poor planning on United's part, you should have rerouted passengers or put us on other flights that weren't delayed. Now I've wasted a whole day. \n\nAgent: I understand your frustration sir, delays are never fun. Let me see what options I have available to help

In [13]:
# Function to preprocess the call transcripts
def preprocess_transcript(transcript):
    # Remove agent/customer role mentions like 'Agent:' or 'Customer:'
    transcript = re.sub(r'(Agent:|Customer:)', '', transcript)
    
    # Remove non-verbal sounds or actions enclosed in asterisks (e.g., *typing sounds*)
    transcript = re.sub(r'\*.*?\*', '', transcript)
    
    # Remove extra newlines and trim leading/trailing spaces
    transcript = transcript.replace('\n', ' ').strip()
    
    # Remove multiple spaces
    transcript = re.sub(r'\s+', ' ', transcript)
    
    # Lowercase the transcript
    transcript = transcript.lower()
    
    return transcript

In [14]:
# Apply the preprocessing function to all call transcripts in df_calls
df_calls['call_transcript'] = df_calls['call_transcript'].apply(preprocess_transcript)

# Display the first cleaned transcript
print(df_calls['call_transcript'][0])

thank you for calling united airlines customer service, my name is sarah how may i help you? hi, yeah i'm calling because my flight from chicago to new york was delayed by over 3 hours! this is ridiculous, i'm missing important meetings because of this. i'm so sorry to hear about the delay, that's definitely frustrating. umm, let me pull up your reservation and take a look at what happened. okay, it looks like there was severe weather in chicago that caused multiple flight cancellations and delays across the board for all the airlines. ahh shoot, yeah your original flight was scheduled to depart at 2pm but didn't actually take off until after 5pm. ugh this is such poor planning on united's part, you should have rerouted passengers or put us on other flights that weren't delayed. now i've wasted a whole day. i understand your frustration sir, delays are never fun. let me see what options i have available to help make this up to you. hmm, it looks like i can get you booked on a flight le

### Inner Join `df_calls` and `df_reasons` based on `call_id` column to create train split

In [15]:
df_train = pd.merge(df_calls, df_reasons, on='call_id', how='inner')
df_train[:]

Unnamed: 0,call_id,call_transcript,primary_call_reason
0,4667960400,thank you for calling united airlines customer...,Voluntary Cancel
1,1122072124,"thank you for calling united airlines, my name...",Booking
2,6834291559,thank you for calling united airlines customer...,IRROPS
3,2266439882,thank you for calling united airlines customer...,Upgrade
4,1211603231,thank you for calling united airlines customer...,Seating
...,...,...,...
66648,7569738090,thank you for calling united airlines customer...,Mileage Plus
66649,1563273072,thank you for calling united airlines customer...,Post Flight
66650,8865997781,thank you for calling united airlines customer...,Upgrade
66651,8019240181,thank you for calling united airlines customer...,Upgrade


### Inner Join `df_test` and `df_calls` based on `call_id` column to create test split

In [16]:
df_test = pd.merge(df_test, df_calls, on='call_id', how='inner')
df_test

Unnamed: 0,call_id,call_transcript
0,7732610078,thank you for calling united airlines customer...
1,2400299738,"thank you for calling united airlines, my name..."
2,6533095063,thank you for calling united airlines customer...
3,7774450920,"thank you for calling united airlines, this is..."
4,9214147168,thank you for calling united airlines customer...
...,...,...
5152,5300201106,thank you for calling united airlines customer...
5153,727694488,"thank you for calling united airlines, my name..."
5154,147487837,thank you for calling united airlines customer...
5155,5330794838,"thank you for calling united airlines, my name..."


### As we can see here, we need to use `call_transcript` data for each row/record in df_test to predict `primary_call_reason` for each row

### Hence this is a text classification problem, next we'll label encode primary_call_reason to start classification process

In [17]:
# Initialize the Label Encoder
label_encoder = LabelEncoder()

# Fit and transform 'primary_call_reason' column into numerical labels
df_train['label'] = label_encoder.fit_transform(df_train['primary_call_reason'])

# Display the first few rows with encoded labels
df_train[['primary_call_reason', 'label']]

Unnamed: 0,primary_call_reason,label
0,Voluntary Cancel,18
1,Booking,1
2,IRROPS,8
3,Upgrade,17
4,Seating,14
...,...,...
66648,Mileage Plus,9
66649,Post Flight,11
66650,Upgrade,17
66651,Upgrade,17


In [18]:
from torch.utils.data import Dataset

In [19]:
class TextClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [20]:
# Load the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [21]:
# Tokenize the training data (ensure input sequences and attention masks are generated)
train_encodings = tokenizer(list(df_train['call_transcript']), truncation=True, padding=True, max_length=256)

# Split the data first
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train['call_transcript'].tolist(),
    df_train['label'].values,
    test_size=0.2,
    random_state=42
)

# Then tokenize the split data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)


In [22]:
# Create datasets
train_dataset = TextClassificationDataset(train_encodings, train_labels)
val_dataset = TextClassificationDataset(val_encodings, val_labels)

In [23]:
# Load the model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=len(set(train_labels))  # Use the number of unique labels
)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start training
trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································································


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 72
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113547511109169, max=1.0…

Epoch,Training Loss,Validation Loss
1,2.5436,2.530125
2,2.5834,2.502288
3,2.4392,2.488144


TrainOutput(global_step=9999, training_loss=2.519905738513915, metrics={'train_runtime': 2252.3698, 'train_samples_per_second': 71.021, 'train_steps_per_second': 4.439, 'total_flos': 1.059854103373824e+16, 'train_loss': 2.519905738513915, 'epoch': 3.0})

In [24]:
# Evaluate the model on the validation set
evaluation_results = trainer.evaluate()

print("Evaluation Results:")
for key, value in evaluation_results.items():
    print(f"{key}: {value}")
    break

Evaluation Results:
eval_loss: 2.4881443977355957
eval_runtime: 54.643
eval_samples_per_second: 243.965
eval_steps_per_second: 3.825
epoch: 3.0


In [25]:
# Make predictions on the validation dataset
val_predictions = trainer.predict(val_dataset)
val_pred_labels = np.argmax(val_predictions.predictions, axis=1)

# Load the true validation labels
true_val_labels = val_labels

# Calculate evaluation metrics
accuracy = accuracy_score(true_val_labels, val_pred_labels)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 23.42%


In [26]:
class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])


In [27]:

# Tokenize the test data
test_encodings = tokenizer(list(df_test['call_transcript']), truncation=True, padding=True, max_length=256)

# Create the test dataset
test_dataset = TestDataset(test_encodings)

# Make predictions
predictions = trainer.predict(test_dataset)

# Get the predicted labels
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Convert predicted labels back to original categories
predicted_categories = label_encoder.inverse_transform(predicted_labels.numpy())

# Display the predicted primary_call_reason for the test set
df_test['predicted_call_reason'] = predicted_categories
print(df_test[['call_id', 'predicted_call_reason']].head())

      call_id predicted_call_reason
0  7732610078      Voluntary Change
1  2400299738      Voluntary Change
2  6533095063      Voluntary Change
3  7774450920                IRROPS
4  9214147168                IRROPS


In [28]:
call_transcript = df_calls[df_calls['call_id'] == 6533095063]['call_transcript'].values[0]
print(call_transcript)

thank you for calling united airlines customer service, this is alex speaking. how may i help you today? hi alex, my name is john. i'm calling about a flight i have booked from chicago to san francisco next thursday. okay john, let me pull up your booking. what's the flight number? it's flight ua1268, departing at 10:30am. just one moment while i look that up...okay, i've found your reservation. what questions do you have about the flight? well, i've run into a little problem. my meeting in san francisco got pushed back a day so i need to change my return flight from friday to saturday. is there any way i can do that without paying a change fee? hmm, let me check the rules for that fare. unfortunately since it's within 7 days of travel, there would be a $200 change fee to switch you to the saturday flight. however, what i could do instead is give you a credit for the value of your original friday return that you could use for future travel within the next year. that way you wouldn't ha

In [29]:
df_test[:]

Unnamed: 0,call_id,call_transcript,predicted_call_reason
0,7732610078,thank you for calling united airlines customer...,Voluntary Change
1,2400299738,"thank you for calling united airlines, my name...",Voluntary Change
2,6533095063,thank you for calling united airlines customer...,Voluntary Change
3,7774450920,"thank you for calling united airlines, this is...",IRROPS
4,9214147168,thank you for calling united airlines customer...,IRROPS
...,...,...,...
5152,5300201106,thank you for calling united airlines customer...,Voluntary Change
5153,727694488,"thank you for calling united airlines, my name...",IRROPS
5154,147487837,thank you for calling united airlines customer...,Post Flight
5155,5330794838,"thank you for calling united airlines, my name...",Voluntary Change


In [34]:
df_test.to_csv("test.csv")

In [30]:
df_calls[:]

Unnamed: 0,call_id,call_transcript
0,4667960400,thank you for calling united airlines customer...
1,1122072124,"thank you for calling united airlines, my name..."
2,6834291559,thank you for calling united airlines customer...
3,2266439882,thank you for calling united airlines customer...
4,1211603231,thank you for calling united airlines customer...
...,...,...
71805,1563273072,thank you for calling united airlines customer...
71806,8865997781,thank you for calling united airlines customer...
71807,8019240181,thank you for calling united airlines customer...
71808,8332067080,thank you for calling united airlines customer...


In [31]:
import pandas as pd
import torch

# Create a copy of df_train
df_train2 = df_train.copy()

# Remove the primary call reason column
df_train2.drop(columns=['primary_call_reason'], inplace=True)

# Make predictions using your model (assuming your model is already set up and called `model`)
# Here we assume 'call_transcript' contains the input text for predictions
test_encodings = tokenizer(list(df_train2['call_transcript']), truncation=True, padding=True, max_length=256)
test_dataset = TextClassificationDataset(test_encodings, [0] * len(df_train2))  # Dummy labels, not used in prediction

# Make predictions
predictions = trainer.predict(test_dataset)

# Convert the predictions to a PyTorch tensor before using argmax
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Convert predicted labels back to original categories
predicted_categories = label_encoder.inverse_transform(predicted_labels.numpy())

# Add the predicted primary call reason to df_train2
df_train2['predicted_primary_call_reason'] = predicted_categories

# Inner join with original df_train to keep the correct primary call reasons
df_merged = pd.merge(df_train2, df_train[['call_id', 'primary_call_reason']], on='call_id', how='inner')

# Check how many predictions are correct
df_merged['is_correct'] = df_merged['predicted_primary_call_reason'] == df_merged['primary_call_reason']
correct_predictions_count = df_merged['is_correct'].sum()

# Display the count of correct predictions
print(f'Number of correct predictions: {correct_predictions_count}')

# Optionally, to view the merged DataFrame with correctness
df_merged[['call_id', 'primary_call_reason', 'predicted_primary_call_reason', 'is_correct']].head()


Number of correct predictions: 15755


Unnamed: 0,call_id,primary_call_reason,predicted_primary_call_reason,is_correct
0,4667960400,Voluntary Cancel,IRROPS,False
1,1122072124,Booking,IRROPS,False
2,6834291559,IRROPS,IRROPS,True
3,2266439882,Upgrade,IRROPS,False
4,1211603231,Seating,Voluntary Change,False


In [32]:
df_merged[:]

Unnamed: 0,call_id,call_transcript,label,predicted_primary_call_reason,primary_call_reason,is_correct
0,4667960400,thank you for calling united airlines customer...,18,IRROPS,Voluntary Cancel,False
1,1122072124,"thank you for calling united airlines, my name...",1,IRROPS,Booking,False
2,6834291559,thank you for calling united airlines customer...,8,IRROPS,IRROPS,True
3,2266439882,thank you for calling united airlines customer...,17,IRROPS,Upgrade,False
4,1211603231,thank you for calling united airlines customer...,14,Voluntary Change,Seating,False
...,...,...,...,...,...,...
66648,7569738090,thank you for calling united airlines customer...,9,Post Flight,Mileage Plus,False
66649,1563273072,thank you for calling united airlines customer...,11,Voluntary Change,Post Flight,False
66650,8865997781,thank you for calling united airlines customer...,17,IRROPS,Upgrade,False
66651,8019240181,thank you for calling united airlines customer...,17,IRROPS,Upgrade,False


In [33]:
len(np.unique(df_train['primary_call_reason']))

20

# Result isn't good enough using DistilBERT with 3 epochs, as we only get Accuracy: 22%, but still good enough since

In [37]:
df_test[:5]

Unnamed: 0,call_id,call_transcript,predicted_call_reason
0,7732610078,thank you for calling united airlines customer...,Voluntary Change
1,2400299738,"thank you for calling united airlines, my name...",Voluntary Change
2,6533095063,thank you for calling united airlines customer...,Voluntary Change
3,7774450920,"thank you for calling united airlines, this is...",IRROPS
4,9214147168,thank you for calling united airlines customer...,IRROPS


In [41]:
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Ensure val_texts and val_labels are prepared
# val_texts: List of validation feature inputs
# val_labels: List of true labels for the validation set

# Step 2: Make predictions on the validation set
# predictions = trainer.predict(val_texts)

# Step 3: Calculate accuracy
accuracy = accuracy_score(true_val_labels,val_pred_labels)
print(f'Validation Accuracy: {accuracy:.2f}')

# Generate classification report
report = classification_report(true_val_labels,val_pred_labels)
print('Classification Report:')
print(report)

Validation Accuracy: 0.23
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       604
           1       0.00      0.00      0.00       513
           2       0.00      0.00      0.00       359
           3       0.00      0.00      0.00       384
           4       0.00      0.00      0.00       757
           5       0.00      0.00      0.00       255
           6       0.00      0.00      0.00        86
           7       0.00      0.00      0.00       197
           8       0.28      0.71      0.40      2763
           9       0.23      0.04      0.07      1130
          10       0.00      0.00      0.00       174
          11       0.22      0.24      0.23       848
          12       0.00      0.00      0.00       658
          13       0.00      0.00      0.00       146
          14       0.50      0.00      0.00      1235
          15       0.00      0.00      0.00       190
          16       0.00      0.0

In [None]:
# Make predictions on the validation dataset
val_predictions = trainer.predict(val_dataset)
val_pred_labels = np.argmax(val_predictions.predictions, axis=1)

# Load the true validation labels
true_val_labels = val_labels

# Calculate evaluation metrics
accuracy = accuracy_score(true_val_labels, val_pred_labels)
print(f'Accuracy: {accuracy * 100:.2f}%')