# **Classifier**

In [28]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset



## **Data Loading**

In [2]:
complaints = pd.read_excel('EaUgXb.xlsx', index_col = 0)
clients = pd.read_excel('btUTgX.xlsx', index_col=0)

In [3]:
churn = clients['Churn']

In [4]:
complaints = complaints.merge(churn, left_on='customerID', right_index=True)
complaints.drop(columns=['complaint_number'], inplace=True)
complaints.rename(columns={'Churn': 'churn'}, inplace=True)
complaints['churn'] = complaints['churn'].map({'No': 0, 'Yes': 1})

In [5]:
complaints['churn'].value_counts(normalize=True)

churn
1    0.617445
0    0.382555
Name: proportion, dtype: float64

The classes are not balanced, the majority of clients who complain then churn. This however is reasonable so we will not try to rebalance the data and leave it like this. 

In [14]:
from sklearn.manifold import TSNE

# t-SNE to 3D
tsne = TSNE(n_components=3, random_state=42)
X_embedded = tsne.fit_transform(X)

## **Classifier**
Now we build a simple classifier that, based on the complaint, tries to predict whether that client will churn or not. We try the following models:
- Logistic Regression
- CatBoost
- Random Forest
- Small Neural Network

In [20]:
y = complaints['churn'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)

y_pred = log.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7133956386292835
Confusion Matrix:
 [[ 54  69]
 [ 23 175]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.44      0.54       123
           1       0.72      0.88      0.79       198

    accuracy                           0.71       321
   macro avg       0.71      0.66      0.67       321
weighted avg       0.71      0.71      0.70       321



In [24]:
cboost = CatBoostClassifier(verbose=0)  
cboost.fit(X_train, y_train)

y_pred = cboost.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7227414330218068
Confusion Matrix:
 [[ 62  61]
 [ 28 170]]
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.50      0.58       123
           1       0.74      0.86      0.79       198

    accuracy                           0.72       321
   macro avg       0.71      0.68      0.69       321
weighted avg       0.72      0.72      0.71       321



In [25]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7071651090342679
Confusion Matrix:
 [[ 60  63]
 [ 31 167]]
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.49      0.56       123
           1       0.73      0.84      0.78       198

    accuracy                           0.71       321
   macro avg       0.69      0.67      0.67       321
weighted avg       0.70      0.71      0.70       321



In [27]:
# --- Prepare data ---
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# --- Define small neural network ---
class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(X_train.shape[1], 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            
            nn.Linear(128, 1),
            nn.Sigmoid()  
        )
    
    def forward(self, x):
        return self.net(x)

# --- Train ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NN().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 20

for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# --- Evaluation ---
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds = (outputs.cpu().numpy() > 0.5).astype(int)
        y_pred.extend(preds.flatten())
        y_true.extend(labels.numpy())

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred))


Epoch [1/20], Loss: 0.6336
Epoch [2/20], Loss: 0.5533
Epoch [3/20], Loss: 0.4847
Epoch [4/20], Loss: 0.5008
Epoch [5/20], Loss: 0.4557
Epoch [6/20], Loss: 0.4388
Epoch [7/20], Loss: 0.4213
Epoch [8/20], Loss: 0.4084
Epoch [9/20], Loss: 0.4037
Epoch [10/20], Loss: 0.3845
Epoch [11/20], Loss: 0.3827
Epoch [12/20], Loss: 0.3559
Epoch [13/20], Loss: 0.3547
Epoch [14/20], Loss: 0.3570
Epoch [15/20], Loss: 0.3374
Epoch [16/20], Loss: 0.3171
Epoch [17/20], Loss: 0.3357
Epoch [18/20], Loss: 0.3044
Epoch [19/20], Loss: 0.3140
Epoch [20/20], Loss: 0.2985
Accuracy: 0.7507788161993769
Confusion Matrix:
 [[ 81  42]
 [ 38 160]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.68      0.66      0.67       123
         1.0       0.79      0.81      0.80       198

    accuracy                           0.75       321
   macro avg       0.74      0.73      0.73       321
weighted avg       0.75      0.75      0.75       321



The small neural network is the architecture that has the higher accuracy. However, the Logistic Regression is the one that has lowest false-negative rate. Since we want to have a classifier that, given an unseen complaint, tells us whether the client is likely to churn or not (so that we can intervene), we might choose the most cautious model, i.e. the Logistic Regression. As a matter of fact, in terms of costs for the company, it is much more expensive to lose a client rather than sending him immediate support to prevent him from churn.