In [None]:
# Print every column label of the DataFrame, one label per line
for label in df.columns.tolist():
    print(label)

In [None]:
# Preview the first five rows (five is the default row count of head())
df.head(n=5)

#### Legend

| Column Name                                            | Explanation                                                                   |
|-------------------------------------------------------|-------------------------------------------------------------------------------|
| CLIENTNUM                                             | Client number. Unique identifier for the customer holding the account         |
| Attrition_Flag                                        | Internal event (customer activity) variable - if the account is closed then 1 else 0 |
| Customer_Age                                          | Demographic variable - Customer's Age in Years                                |
| Gender                                                | Demographic variable - M=Male, F=Female                                       |
| Dependent_count                                       | Demographic variable - Number of dependents                                    |
| Education_Level                                       | Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.) |
| Marital_Status                                        | Demographic variable - Married, Single, Divorced, Unknown                     |
| Income_Category                                       | Demographic variable - Annual Income Category of the account holder (< $40K, $40K - $60K, $60K - $80K, $80K - $120K, > $120K, Unknown) |
| Card_Category                                         | Product Variable - Type of Card (Blue, Silver, Gold, Platinum)                 |
| Months_on_book                                       | Period of relationship with bank                                              |
| Total_Relationship_Count                             | Total no. of products held by the customer                                    |
| Months_Inactive_12_mon                               | No. of months inactive in the last 12 months                                  |
| Contacts_Count_12_mon                                | No. of Contacts in the last 12 months                                         |
| Credit_Limit                                         | Credit Limit on the Credit Card                                               |
| Total_Revolving_Bal                                  | Total Revolving Balance on the Credit Card                                    |
| Avg_Open_To_Buy                                      | Open to Buy Credit Line (Average of last 12 months)                            |
| Total_Amt_Chng_Q4_Q1                                 | Change in Transaction Amount (Q4 over Q1)                                      |
| Total_Trans_Amt                                      | Total Transaction Amount (Last 12 months)                                      |
| Total_Trans_Ct                                       | Total Transaction Count (Last 12 months)                                      |
| Total_Ct_Chng_Q4_Q1                                  | Change in Transaction Count (Q4 over Q1)                                      |
| Avg_Utilization_Ratio                                | Average Card Utilization Ratio                                                |
| Naive_Bayes_Classifier_(...)_mon_1 | Naive Bayes |
| Naive_Bayes_Classifier_(...)_mon_2 | Naive Bayes |


#### 1. Prepare data

In [2]:
# Prepare the dataset for further analysis by reading the data into a DataFrame

import pandas as pd

# Relative path of the raw churn dataset (CSV)
file_path = '../data/BankChurners.csv'

# Parse the CSV into `df`, the DataFrame the analysis cells read from
df = pd.read_csv(filepath_or_buffer=file_path)

#### 2. Data cleaning

In [None]:
# Check for missing values, duplicates, and inconsistent data. Remove or impute missing values as needed.

#### 3. Feature selection

In [None]:
# Identify relevant features for classification. Drop columns that are not useful or redundant for the analysis.

#### 4. Encoding categorical variables

In [None]:
# Convert categorical variables (e.g., Gender, Education_Level, Marital_Status, Income_Category, Card_Category) into numerical format using one-hot encoding

In [None]:
import matplotlib.pyplot as plt

# Columns explored in this cell: four numeric features plus the churn label
selected_columns = ['Customer_Age', 'Dependent_count', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Attrition_Flag']
data_subset = df[selected_columns]

# Summary statistics of the selected columns
data_statistics = data_subset.describe()

# Distribution of each numeric feature, drawn as horizontal box plots on a
# 2x2 grid. These plots are univariate: they show spread and outliers of each
# feature, not its relationship to churn.
box_plot_titles = {
    'Customer_Age': 'Customer Age Box Plot',
    'Dependent_count': 'Dependent Count Box Plot',
    'Total_Trans_Amt': 'Total Transaction Amount Box Plot',
    'Total_Trans_Ct': 'Total Transaction Count Box Plot',
}

plt.figure(figsize=(15, 10))
for position, (column, title) in enumerate(box_plot_titles.items(), start=1):
    plt.subplot(2, 2, position)
    plt.boxplot(data_subset[column], vert=False)
    plt.title(title)

# tight_layout() only adjusts the current figure, so it has to be applied to
# the box-plot grid before the next figure is created.
plt.tight_layout()

# Bar plot for churn: number of rows per Attrition_Flag value
plt.figure(figsize=(8, 5))
churn_counts = data_subset['Attrition_Flag'].value_counts()
plt.bar(churn_counts.index, churn_counts.values)
plt.title('Churn Distribution')
plt.xlabel('Churn Status')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

# Explore data statistics
print("Data Statistics:")
print(data_statistics)

# Explore churn distribution
print("\nChurn Distribution:")
print(churn_counts)


In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Select features and target variable
X = df[['Customer_Age', 'Dependent_count', 'Total_Trans_Amt', 'Total_Trans_Ct']]
y = df['Attrition_Flag']

# Hold out 20% of the rows for evaluation. stratify=y keeps the proportion of
# each Attrition_Flag value the same in the training and test sets, so the
# reported metrics do not depend on how the classes happened to be split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train a Logistic Regression model.
# The features are passed in unscaled; with the default cap of 100 solver
# iterations the fit may stop before converging, so the cap is raised.
# NOTE(review): scaling the features first would likely help further - confirm
# before changing, since later cells may rely on `model` being a plain
# LogisticRegression.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model: overall accuracy plus per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
