In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# Load data
df = pd.read_csv("BankChurners.csv")
df = df.iloc[:, 1:21]  # Select columns 2-21 (0-based index)

# Initial data exploration
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nData types and missing values:")
print(df.info())
print("\nSummary statistics:")
print(df.describe())

# Clean column names (remove special characters)
df.columns = df.columns.str.replace('[^A-Za-z0-9]+', '_')

# Drop the Naive Bayes columns as they seem to be model outputs
df = df.drop(columns=['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 
                     'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'])

# Attrition analysis
plt.figure(figsize=(8,5))
sns.countplot(x='Attrition_Flag', data=df)
plt.title('Customer Attrition Distribution')
plt.show()

# Numerical features analysis
numerical_cols = ['Customer_Age', 'Dependent_count', 'Months_on_book', 
                 'Total_Relationship_Count', 'Months_Inactive_12_mon', 
                 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
                 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

plt.figure(figsize=(12,8))
df[numerical_cols].hist(bins=20, figsize=(12,10))
plt.tight_layout()
plt.show()

# Categorical features analysis
categorical_cols = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

plt.figure(figsize=(15,10))
for i, col in enumerate(categorical_cols, 1):
    plt.subplot(3,2,i)
    sns.countplot(x=col, hue='Attrition_Flag', data=df)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Correlation analysis
plt.figure(figsize=(12,8))
corr_matrix = df[numerical_cols + ['Attrition_Flag']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Age vs Attrition
plt.figure(figsize=(10,6))
sns.boxplot(x='Attrition_Flag', y='Customer_Age', data=df)
plt.title('Age Distribution by Attrition Status')
plt.show()

# Credit utilization analysis
plt.figure(figsize=(10,6))
sns.boxplot(x='Attrition_Flag', y='Avg_Utilization_Ratio', data=df)
plt.title('Credit Utilization by Attrition Status')
plt.show()

# Transaction patterns
fig, ax = plt.subplots(1,2, figsize=(15,5))
sns.boxplot(x='Attrition_Flag', y='Total_Trans_Amt', data=df, ax=ax[0])
sns.boxplot(x='Attrition_Flag', y='Total_Trans_Ct', data=df, ax=ax[1])
plt.suptitle('Transaction Patterns by Attrition Status')
plt.show()

# Convert categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns.drop('Attrition_Flag')
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Convert target variable
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Attrited Customer': 0, 'Existing Customer': 1})

# Split data
train_size = (len(bank) + 1) // 2
df_train = df.sample(n=train_size, random_state=2022)
df_test = df.drop(df_train.index)

# Prepare data for models
X_train = pd.get_dummies(df_train.drop('Attrition_Flag', axis=1), drop_first=True)
y_train = df_train['Attrition_Flag']
X_test = pd.get_dummies(df_test.drop('Attrition_Flag', axis=1), drop_first=True)
y_test = df_test['Attrition_Flag']

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
log_pred = log_reg.predict(X_test)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, log_pred):.4f}")

# LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_pred = lda.predict(X_test)
print(f"LDA Accuracy: {accuracy_score(y_test, lda_pred):.4f}")

# QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_pred = qda.predict(X_test)
print(f"QDA Accuracy: {accuracy_score(y_test, qda_pred):.4f}")

# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print(f"KNN (k=5) Accuracy: {accuracy_score(y_test, knn_pred):.4f}")


# Decision Tree
tree = DecisionTreeClassifier(random_state=2022)
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)
print(f"Decision Tree Accuracy: {accuracy_score(y_test, tree_pred):.4f}")

# Random Forest
rf = RandomForestClassifier(random_state=2022)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred):.4f}")

# Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, max_depth=4, random_state=2022)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, gb_pred):.4f}")

# Handling unknown categories version
bank_clean = bank.replace('Unknown', np.nan).dropna()

# Repeat the same modeling process with cleaned data...

NameError: name 'df' is not defined