In [34]:
#install requirements
#The notebook relies on HuggingFace transformers for pretrained language models 
#and accelerate for compatibility and performance on Kaggle GPUs
!pip install -q transformers accelerate


In [35]:
#import necessary modules
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, r2_score

from transformers import DebertaV2Tokenizer, DebertaV2Model

#We use DeBERTa-v3-base, a transformer-based language model pretrained 
#on large-scale corpora.

In [36]:
#Device Setup and Random Seeds
#Automatically selects GPU (cuda) if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [37]:
#Load the training data
df = pd.read_csv("/kaggle/input/customer-complaints2/train_complaints.csv")
df.columns = df.columns.str.strip().str.lower()
df.head() #check the data

Unnamed: 0,complaint_id,complaint_text,primary_category,secondary_category,severity
0,1634299,Back into XXXX of 2010 during this mortgage cr...,Mortgage,"Loan modification,collection,foreclosure",2
1,5505088,I checked my credit report and I am upset on w...,"Credit reporting, credit repair services, or o...",Problem with a credit reporting company's inve...,1
2,10979675,I am writing to dispute the accuracy of the in...,Credit reporting or other personal consumer re...,Problem with a company's investigation into an...,1
3,7520351,A transaction from XXXX XXXX XXXX submitted a ...,Checking or savings account,Managing an account,1
4,5847870,I was recently alerted to an account in collec...,Debt collection,Attempts to collect debt not owed,5


In [38]:
#Encode target labels
#Convert categorical labels into numeric form.
#This step prepares labels for loss functions used during training.

primary_encoder = LabelEncoder()
secondary_encoder = LabelEncoder()

df["primary_label"] = primary_encoder.fit_transform(df["primary_category"])
df["secondary_label"] = secondary_encoder.fit_transform(df["secondary_category"])

num_primary = df["primary_label"].nunique()
num_secondary = df["secondary_label"].nunique()


In [39]:
#Prepare text tokenizer compatible with the model.
tokenizer = DebertaV2Tokenizer.from_pretrained(
    "microsoft/deberta-v3-base"
)

In [40]:
#Dataset class definition
#returns pre-tokenized inputs
#provides corresponding labels for all three tasks:
#primary category
#secondary category
#severity

class ComplaintDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=224):
        self.texts = df["complaint_text"].tolist()
        self.primary = df["primary_label"].values
        self.secondary = df["secondary_label"].values
        self.severity = df["severity"].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "primary": torch.tensor(self.primary[idx], dtype=torch.long),
            "secondary": torch.tensor(self.secondary[idx], dtype=torch.long),
            "severity": torch.tensor(self.severity[idx], dtype=torch.float),
        }


In [41]:
#Multi-Task DeBERTa Model Definition
#Define the neural network architecture.
#This multi-task setup allows shared representations across related tasks.
class MultiTaskDeBERTa(nn.Module):
    def __init__(self, num_primary, num_secondary):
        super().__init__()

        self.bert = DebertaV2Model.from_pretrained(
            "microsoft/deberta-v3-base"
           
        )

        hidden = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        #task specify heads-
        self.primary_head = nn.Linear(hidden, num_primary)
        self.secondary_head = nn.Linear(hidden, num_secondary)
        self.severity_head = nn.Linear(hidden, 1)

    def forward(self, input_ids, attention_mask):
        out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        x = out.last_hidden_state[:, 0, :]
        x = self.dropout(x)

        return {
            "primary": self.primary_head(x),
            "secondary": self.secondary_head(x),
            "severity": self.severity_head(x).squeeze(1)
        }


In [42]:
#Dataloader initialization
#Create a batch-wise data loader for training.
n_folds = 4
batch_size = 8        # IMPORTANT: DeBERTa needs smaller batch
epochs = 5

skf = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=42
)


In [43]:
#Loss function and optimizer
#Purpose is to define how the model runs


for fold, (train_idx, val_idx) in enumerate(
    skf.split(df, df["primary_label"])
):
    print(f"\n===== Fold {fold+1}/{n_folds} =====")

    train_df = df.iloc[train_idx]
    val_df   = df.iloc[val_idx]

    train_ds = ComplaintDataset(train_df, tokenizer)
    val_ds   = ComplaintDataset(val_df, tokenizer)

    train_loader = DataLoader(
        train_ds, batch_size=batch_size, shuffle=True, num_workers=0
    )
    val_loader = DataLoader(
        val_ds, batch_size=batch_size, shuffle=False, num_workers=0
    )

    model = MultiTaskDeBERTa(num_primary, num_secondary).to(device)

    #Cross-entropy loss is used for both classification tasks
    loss_p = nn.CrossEntropyLoss()
    loss_s = nn.CrossEntropyLoss()
    #Mean squared error (MSE) is used for severity regression
    loss_v = nn.MSELoss()

    #Loss weights align with the competition scoring metric.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # ---- TRAIN ----
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            p = batch["primary"].to(device)
            s = batch["secondary"].to(device)
            v = batch["severity"].to(device)

            out = model(input_ids, attention_mask)

            lp = loss_p(out["primary"], p)
            ls = loss_s(out["secondary"], s)
            lv = loss_v(out["severity"], v)

            loss = 0.3*lp + 0.5*ls + 0.2*lv
            loss.backward()
            optimizer.step()

   





===== Fold 1/4 =====

===== Fold 2/4 =====

===== Fold 4/4 =====


In [44]:
#Final model training.
#Train the final submission model on the full dataset.
'''Training loop performs:

forward pass

loss computation

backpropagation

weight updates'''


final_model = MultiTaskDeBERTa(num_primary, num_secondary).to(device)

optimizer = torch.optim.AdamW(final_model.parameters(), lr=2e-5)

full_ds = ComplaintDataset(df, tokenizer)
full_loader = DataLoader(full_ds, batch_size=batch_size, shuffle=True,num_workers=0)

final_model.train()
for epoch in range(epochs):
    for batch in full_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        p = batch["primary"].to(device)
        s = batch["secondary"].to(device)
        v = batch["severity"].to(device)

        out = final_model(input_ids, attention_mask)

        loss = (
            0.3*loss_p(out["primary"], p)
          + 0.4*loss_s(out["secondary"], s)
          + 0.2*loss_v(out["severity"], v)
        )

        loss.backward()
        optimizer.step()
torch.save(final_model.state_dict(), "final_model.pt")#save trained model. Ensures recovery in case of crash
#This is the actual learning step that produces the model used for submission.

In [45]:
#LOad the test data
test_df = pd.read_csv("/kaggle/input/customer-complaints2/test_complaints.csv")
test_df.columns = test_df.columns.str.strip().str.lower()


In [46]:
#Tokenize the test data
class TestDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["complaint_text"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

In [47]:
#Create a dataset and loader for inference.
test_loader = DataLoader(
    TestDataset(test_df, tokenizer),
    batch_size=batch_size,
    shuffle=False,#Batches are processed sequentially without shuffling.
    num_workers=0
)

final_model.eval()
#Generate predictions for all tasks
p_out, s_out, v_out = [], [], []

with torch.no_grad():
    for batch in test_loader:
        out = final_model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device)
        )
        p_out.extend(out["primary"].argmax(1).cpu().numpy())
        s_out.extend(out["secondary"].argmax(1).cpu().numpy())
        v_out.extend(out["severity"].cpu().numpy())

In [49]:
#Create the submission file
submission = pd.DataFrame({
    "complaint_id": test_df["complaint_id"],
    "primary_category": primary_encoder.inverse_transform(p_out),
    "secondary_category": secondary_encoder.inverse_transform(s_out),
    "severity": np.clip(np.rint(v_out), 1, 5).astype(int)#clipped to the valid range [1, 5]
})

submission.to_csv("submission4.csv", index=False)
