In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder 

In [3]:
# load the data 
df = pd.read_csv('shared/complaints_25Nov21.csv')
print(df.head())

  Date received                  Product                   Sub-product  \
0    2016-10-26          Money transfers  International money transfer   
1    2015-03-27  Bank account or service    Other bank product/service   
2    2015-04-20  Bank account or service    Other bank product/service   
3    2013-04-29                 Mortgage   Conventional fixed mortgage   
4    2013-05-29                 Mortgage                Other mortgage   

                                      Issue Sub-issue  \
0                  Other transaction issues       NaN   
1   Account opening, closing, or management       NaN   
2  Making/receiving payments, sending money       NaN   
3  Application, originator, mortgage broker       NaN   
4  Loan modification,collection,foreclosure       NaN   

                        Consumer complaint narrative  \
0  To whom it concerns, I would like to file a fo...   
1  My name is XXXX XXXX XXXX and huband name is X...   
2  XXXX 2015 : I called to make a payment on

In [4]:
df_encoded = df.copy()

le = LabelEncoder()

# List of columns to encode
predictor_columns = ['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?']

# Apply LabelEncoder to each categorical column
for column in predictor_columns:
    df_encoded[column] = le.fit_transform(df_encoded[column].astype(str))


In [14]:
# Set X & y variables
predictor_columns = ['Product','Sub-product','Issue','State','Tags','Submitted via','Company response to consumer', 'Timely response?']
X= df_encoded[predictor_columns]
y=df_encoded['Consumer disputed?']


In [15]:
# Encode the 'Consumer disputed?' column 
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [16]:
#Split the data into a test and train set 
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=123)

In [17]:
#calculate the proportion of disputes in the test set  
dispute_proportion_test = y_test.mean()
#answer question 1
dispute_proportion_test

0.21586413200810575

In [18]:
#Question 2
#perform random undersampling 
from imblearn.under_sampling import RandomUnderSampler
undersampler = RandomUnderSampler(random_state=123)
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

In [19]:
#calculate the proportion of disputes
dispute_proportion_resampled = y_resampled.mean()
dispute_proportion_resampled

0.5

In [21]:
#Question 3
#train the XGBoost Classifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report 

# Set X & y variables
predictor_columns = ['Product','Sub-product','Issue','State','Tags','Submitted via','Company response to consumer', 'Timely response?']
X= df_encoded[predictor_columns]
y= df_encoded['Consumer disputed?']

le=LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=123)

undersampler = RandomUnderSampler(random_state=123)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

model_xgb = XGBClassifier(random_state =123)
model_xgb.fit(X_train_resampled, y_train_resampled)

In [22]:
#predict on the test set 
y_pred = model_xgb.predict(X_test)

#evaluate the recall for the "yes" class 
report = classification_report(y_test,y_pred, target_names= le.classes_)
report

'              precision    recall  f1-score   support\n\n          No       0.84      0.52      0.64     32504\n         Yes       0.27      0.63      0.38      8948\n\n    accuracy                           0.55     41452\n   macro avg       0.55      0.58      0.51     41452\nweighted avg       0.71      0.55      0.59     41452\n'

In [23]:
#question 4 
#without the model,what would the total cost to the banks to deal with the complaints in the test set? 

le = LabelEncoder()
df['Consumer disputed?'] = le.fit_transform(df['Consumer disputed?'])

_,X_test, _, y_test = train_test_split(df,df['Consumer disputed?'],test_size=0.2, random_state=123)

non_disputed_cost=100
disputed_cost=600
                                            
total_cost= (y_test==0).sum()*non_disputed_cost + (y_test==1).sum()*disputed_cost
total_cost

8619200

In [24]:
# Calculate the costs
from sklearn.metrics import confusion_matrix 
cost_of_diligence = 90
cost_of_non_disputed = 100
cost_of_disputed = 600

# Calculate the total cost based on the model predictions
cm = confusion_matrix(y_test, y_pred)
tp = cm[1, 1]  # True Positives: Disputed and predicted as disputed
fn = cm[1, 0]  # False Negatives: Disputed but not predicted as disputed
fp = cm[0, 1]  # False Positives: Not disputed but predicted as disputed
tn = cm[0, 0]  # True Negatives: Not disputed and not predicted as disputed

# Cost calculation
total_cost = tp*190 +fn*600 +fp*190 +tn*100
total_cost

7700620

In [25]:
# Cost calculation
total_cost = tp*(cost_of_diligence+cost_of_non_disputed) +fn*cost_of_disputed +fp*(cost_of_diligence+cost_of_non_disputed) +tn*cost_of_non_disputed
total_cost

7700620

In [27]:
import numpy as np

thresholds = np.linspace(0,1,101)
y_probs = model_xgb.predict_proba(X_test)[:1]
min_cost = float('inf')
optimal_thedshold = 0.5

for threshold in thresholds: 
    y_pred_adjusted = (y_probs> threshold).astype(int)
    cm_adjusted = confusion_matrix(y_test,y_pred_adjusted) 
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    cost = tp*190 +fn*600 +fp*190 +tn*100
    current_cost = calculate_cost(threshold, y_test, y_probs)
    if current_cost < min_cost:
        min_cost = current_cost
        optimal_thedshold = threshold
min_cost
optimal_thedshold

ValueError: Feature shape mismatch, expected: 8, got 18