In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load the dataset
df = pd.read_csv('shared/complaints_25Nov21.csv')

# Fill missing values
df = df.fillna('missing')

# Selecting predictor variables and target variable
X = df[['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?']]
y = df['Consumer disputed?']

# Convert categorical variables using get_dummies
X = pd.get_dummies(X, drop_first=True)

# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Check and balance the dataset
dispute_proportion = y_train.sum() / len(y_train)
if dispute_proportion < 0.3:
    undersampler = RandomUnderSampler(random_state=123)
    X_train, y_train = undersampler.fit_resample(X_train, y_train)

# Initialize and train the XGBoost Classifier
model_xgb = XGBClassifier(random_state=123)
model_xgb.fit(X_train, y_train)

# Predict on the test set
y_pred = model_xgb.predict(X_test)

# Generate classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Calculate cost without a model
cost_base_case = y_test.sum() * 600 + (len(y_test) - y_test.sum()) * 100

# Calculate cost with the model predictions
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
cost_model_no_diligence = tp * 600 + fn * 600 + tn * 100 + fp * 100
cost_model_with_diligence = tp * 190 + fn * 600 + tn * 100 + fp * 190

print("Base case cost:", cost_base_case)
print("Cost with model without extra diligence:", cost_model_no_diligence)
print("Cost with model with extra diligence:", cost_model_with_diligence)

# Optimize the threshold to minimize total cost
thresholds = np.linspace(0, 1, 101)
costs = []

for threshold in thresholds:
    y_pred_adjusted = (model_xgb.predict_proba(X_test)[:, 1] > threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_adjusted).ravel()
    cost = tp * 190 + fn * 600 + tn * 100 + fp * 190
    costs.append(cost)

              precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.71      0.55      0.59     41452

[[17244 15260]
 [ 3346  5602]]
Base case cost: 8619200
Cost with model without extra diligence: 8619200
Cost with model with extra diligence: 7695780


In [5]:
min_cost = min(costs)
optimal_threshold = thresholds[costs.index(min_cost)]
print("Optimal threshold:", optimal_threshold)
print("Minimum cost with model:", min_cost)

Optimal threshold: 0.46
Minimum cost with model: 7589140


In [6]:
q1 = np.mean(y_test)
print(q1)

0.21586413200810575


In [7]:
# Generate classification report
report = classification_report(y_test, y_pred, target_names=le.classes_, output_dict=True)

# Recall for 'Consumer disputed?' = 'Yes'
recall_disputed_yes = report['Yes']['recall']
print(f"Recall for 'Consumer disputed?' = 'Yes': {recall_disputed_yes:.2f}")

Recall for 'Consumer disputed?' = 'Yes': 0.63


In [8]:
# Calculate the base costs for non-disputed and disputed complaints
cost_non_disputed = 100  # Cost for non-disputed complaints
cost_disputed = 600  # Cost for disputed complaints ($100 initial + $500 additional)

# Calculate the total cost without a model
total_cost_no_model = (y_test == 0).sum() * cost_non_disputed + (y_test == 1).sum() * cost_disputed

print(f"Total cost without a model: ${total_cost_no_model}")


Total cost without a model: $8619200


In [16]:
# Calculate cost with the model predictions, incorporating extra diligence
cost_with_diligence = fp * 190 + tp * 190 + fn * 600 + tn * 100

print("Cost with model with extra diligence:", cost_with_diligence)


Cost with model with extra diligence: 7875880


In [20]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Existing predictions
y_proba = model_xgb.predict_proba(X_test)[:, 1]

# Define the costs
cost_diligence = 190
cost_dispute = 600
cost_no_dispute = 100

# Initialize the list to store costs for each threshold
thresholds = np.linspace(0, 1, 101)
costs = []

# Loop through each threshold to calculate the cost
for threshold in thresholds:
    # Make predictions based on the threshold
    y_pred_threshold = (y_proba > threshold).astype(int)
    
    # Calculate confusion matrix components for the current threshold
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_threshold).ravel()
    
    # Calculate the cost for the current threshold
    cost = (tp + fp) * cost_diligence + fn * cost_dispute + tn * cost_no_dispute
    costs.append(cost)

# Identify the threshold with the minimum cost
min_cost = min(costs)
optimal_threshold = thresholds[costs.index(min_cost)]

print("Optimal threshold for decision:", optimal_threshold)
print("Minimum cost with model using optimal threshold:", min_cost)


Optimal threshold for decision: 0.46
Minimum cost with model using optimal threshold: 7589140
