In [38]:
#Bank User Churn Project
#Authors: Flora and Adriana

import numpy as np
from sklearn.model_selection import train_test_split
import csv

Geography = {"France": 0.0, "Spain": 1.0, "Germany": 2.0}
Gender = {"Female": 0.0, "Male": 1}

data = [] 

with open('Churn_Modelling.csv', 'r') as read_obj:
    csv_reader = csv.reader(read_obj)
    next(csv_reader)  # Skip header row

    for row in csv_reader:
        row = row[3:]  # Skip first 3 columns
        
        row[1] = Geography[row[1]]   # Geography
        row[2] = Gender[row[2]]      # Gender

        data.append(row)

# Convert to NumPy array and cast to float32
data = np.array(data).astype(np.float32)

# Separate features and labels
X = data[:, :-1]  # All columns except the last
y = data[:, -1]   # Last column is label

# Split data
X_train, X_t, y_train, y_t = train_test_split(X, y, test_size=0.30, random_state=0)
X_validation, X_test, y_validation, y_test = train_test_split(X_t, y_t, test_size=0.5, random_state=0)

# Print shapes
print(X_train.shape)
print(X_test.shape)
print(X_validation.shape)


(7000, 10)
(1500, 10)
(1500, 10)


In [39]:
#find biases
import pandas as pd

# Retrieve gender statistics
df = pd.read_csv('Churn_Modelling.csv')
print(df['Gender'].value_counts())

#Retrive credit score statics
above_650 = (df['CreditScore'] > 650).sum()
below_or_equal_650 = (df['CreditScore'] <= 650).sum()

print(f"Users with Credit Score > 650: {above_650}")
print(f"Users with Credit Score ≤ 650: {below_or_equal_650}")

Gender
Male      5457
Female    4543
Name: count, dtype: int64
Users with Credit Score > 650: 5063
Users with Credit Score ≤ 650: 4937


In [40]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_validation_scaled = scaler.transform(X_validation)

In [41]:

#knn
from sklearn.neighbors import KNeighborsClassifier

def get_accuracy(y_true, y_predicted):
    """returns the fraction of correct predictions in y_predicted compared to y_true"""
    counter= 0
    for x in range(len(y_true)):
        if (y_true[x] == y_predicted[x]):
            counter+= 1

    accuracy= counter/ len(y_true)
    
    return accuracy


def sklearn_knn_predict(trainX, trainy, testX, distance_metric, k):
    model = KNeighborsClassifier(algorithm='brute', n_neighbors= k, metric= distance_metric)
    model.fit(trainX, trainy)
    predicted = model.predict(testX)

    return predicted

def knn_grid_search(trainX, trainy, validationX, validationy, distance_metric_list, n_neighbors_list):
    """For each metric in distance_metric_list "euclidean, manhattan, etc", and each value k in n_neighbors_list,
    trains knn classifiers with those parameters
    on the training data and computes the accuracy on the validation data.
    Returns a dictionary mapping each value of the hyperparameter pair (metric, k)
    to the accuracy with those hyperparameters on the validation data
    """
    results={}
    for k in n_neighbors_list:
        
        for d in distance_metric_list:
            accuracy= get_accuracy(validationy, sklearn_knn_predict(trainX,  trainy, validationX, d, k))
            results[d, k] = accuracy
    return results

In [42]:
#continuation

k_values= [1,3,5,7,9,11,13,15,17,19]

grid= knn_grid_search(X_train, y_train, X_validation, y_validation, ['euclidean','manhattan'], k_values)

highest = max(grid, key=grid.get)


metric, k = highest[0], highest[1]
validation_accuracy = grid[highest]
print('The best parameters are metric =', metric, 'and k =', k, 'with', validation_accuracy, 'accuracy on the validation data')



test_accuracy = get_accuracy(y_test, sklearn_knn_predict(X_train, y_train, X_test, metric, k) )
print('Test accuracy:', test_accuracy)

The best parameters are metric = euclidean and k = 19 with 0.7873333333333333 accuracy on the validation data
Test accuracy: 0.7953333333333333


In [43]:
from sklearn import ensemble 
clf = ensemble.RandomForestClassifier(random_state=0, class_weight='balanced')
clf.fit(X_train, y_train)
print(clf.predict(X_test))

print("Validation score on Random Forest:")
print(clf.score(X_validation, y_validation))

print("Test score on Random Forest:")
print(clf.score(X_test, y_test))

[0. 1. 0. ... 0. 0. 1.]
Validation score on Random Forest:
0.854
Test score on Random Forest:
0.8733333333333333


In [44]:
#Random Forest Feature Importance
import pandas as pd
feature_names = [ 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 
    'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'
]
print("X shape:", X.shape)
print("Feature importances shape:", clf.feature_importances_.shape)
print("Feature names count:", len(feature_names))
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
sorted_features = [(feature_names[i], importances[i]) for i in indices]

print("Feature Importances:")
for name, score in sorted_features:
    print("{name}:{score}".format(name=name, score=score))


X shape: (10000, 10)
Feature importances shape: (10,)
Feature names count: 10
Feature Importances:
Age:0.249691782388163
Balance:0.14511338510653754
EstimatedSalary:0.13969947380554415
CreditScore:0.1355087349020238
NumOfProducts:0.1313417475040262
Tenure:0.07783857740800411
Geography:0.04733175800436427
IsActiveMember:0.033496115204982135
Gender:0.02157456166202002
HasCrCard:0.018403864014334862


In [45]:
#Logistic regression and weight extraction
from sklearn import linear_model  
logRegres = linear_model.LogisticRegression(random_state=0, class_weight='balanced')
logRegres.fit(X_train_scaled, y_train )
print("Validation score on Logistic Regression:")
print(logRegres.score(X_validation_scaled, y_validation))

print("Test score on Logistic regression:")
print(logRegres.score(X_test_scaled, y_test))


Validation score on Logistic Regression:
0.6966666666666667
Test score on Logistic regression:
0.6986666666666667


In [46]:
# Get Logistic Regression weights
from sklearn.feature_selection import f_regression
weights = logRegres.coef_
sorted_weights = np.argsort(weights)  # Sort
print("Feature Weights:")
print(weights)
#P-values for each 
print("P-value for each feature")
print(f_regression(X_train_scaled, y_train)[1])

Feature Weights:
[[-0.09302871  0.30914394 -0.28443939  0.8086866  -0.08124474  0.21334037
  -0.08732669 -0.01431191 -0.42580662  0.03690607]]
P-value for each feature
[1.73892188e-003 1.27677977e-040 3.32366810e-019 1.03577361e-128
 2.14166317e-002 1.05879654e-024 1.42532740e-007 4.42720255e-001
 5.80328653e-033 2.30693167e-001]


In [47]:
#Assuming the data is Non-linear
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
accuracies_val = []
accuracies_test = []

# for degree in range(1,11):
#     poly = PolynomialFeatures(degree=degree)
#     model = LogisticRegression(max_iter=20000)
#     X_train_poly = poly.fit_transform(X_train)
#     X_test_poly = poly.transform(X_test)
#     X_validation_poly = poly.transform(X_validation)
#     model.fit(X_train_poly, y_train)
#     y_val = model.predict(X_validation_poly)
#     accuracies_val.append((degree, accuracy_score(y_val, y_validation)))
#     y_prediction = model.predict(X_test_poly)
#     accuracies_test.append((degree, accuracy_score(y_prediction, y_test)))
# print('Accuracies on Validation:' + str(accuracies_val))
# print('Accuracies on Validation:' + str(accuracies_test))


In [48]:
#Experiment with f1 score
from sklearn.metrics import f1_score
#predictions with random forest
y_pred_val  = clf.predict(X_validation)
y_pred_test  = clf.predict(X_test)
#calculate the scores
f1_val = f1_score(y_validation, y_pred_val)
f1_test = f1_score(y_test, y_pred_test)
print("F1 Score on validation data:", f1_val)
print("F1 Score on testing:", f1_test)

F1 Score on validation data: 0.5796545105566219
F1 Score on testing: 0.62


In [49]:
#THIS WAS JUST TO TEST OUR PREDICTIONS INSTEAD OF INPUTTING A WHOLE DATA SET FOR THINGS WE WANT PREDICTED


# Age 20
F1 = [500, 0, 1, 20, 5, 120000.0, 1, 1, 1, 175000.0]  # High balance, high salary, 1 product
F2 = [500, 0, 1, 20, 5, 30000.0, 1, 1, 1, 50000.0]    # Low balance, low salary, 1 product
F3 = [500, 0, 1, 20, 5, 120000.0, 4, 1, 1, 175000.0]  # High balance, high salary, 4 products
F4 = [500, 0, 1, 20, 5, 30000.0, 4, 1, 1, 50000.0]    # Low balance, low salary, 4 products

# Age 45
F5 = [500, 0, 1, 45, 5, 120000.0, 1, 1, 1, 175000.0]
F6 = [500, 0, 1, 45, 5, 30000.0, 1, 1, 1, 50000.0]
F7 = [500, 0, 1, 45, 5, 120000.0, 4, 1, 1, 175000.0]
F8 = [500, 0, 1, 45, 5, 30000.0, 4, 1, 1, 50000.0]

# Age 70
F9 = [500, 0, 1, 70, 5, 120000.0, 1, 1, 1, 175000.0]
F10 = [500, 0, 1, 70, 5, 30000.0, 1, 1, 1, 50000.0]
F11 = [500, 0, 1, 70, 5, 120000.0, 4, 1, 1, 175000.0]
F12 = [500, 0, 1, 70, 5, 30000.0, 4, 1, 1, 50000.0]

# Age 95
F13 = [500, 0, 1, 95, 5, 120000.0, 1, 1, 1, 175000.0]
F14 = [500, 0, 1, 95, 5, 30000.0, 1, 1, 1, 50000.0]
F15 = [500, 0, 1, 95, 5, 120000.0, 4, 1, 1, 175000.0]
F16 = [500, 0, 1, 95, 5, 30000.0, 4, 1, 1, 50000.0]


# # all_features = np.array([F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20])

# #most likely 1
# all_features= np.array([
#     [480, 2.0, 1, 30, 1, 150000.0, 3, 0, 0, 180000.0],  # Germany, inactive, high balance, no card
#     [500, 2.0, 0, 27, 2, 140000.0, 2, 0, 0, 160000.0],  # Female, high balance, low tenure
#     [510, 2.0, 1, 29, 3, 130000.0, 4, 0, 0, 170000.0],  # Many products, inactive
#     [450, 1.0, 0, 26, 1, 120000.0, 2, 0, 0, 190000.0],  # Young, low credit score, high salary
#     [499, 0.0, 1, 32, 1, 160000.0, 2, 0, 0, 200000.0],  # French male with red flags
# ])
# #most likely 0
# # all_features = np.array([
# #     [850, 0.0, 1, 35, 5,     0.0,     1, 1, 1, 70000.0],   # Very high score, no balance
# #     [820, 1.0, 0, 45, 7,     0.0,     1, 1, 1, 65000.0],   # Older Spanish woman, low balance
# #     [800, 0.0, 1, 40, 6,     1000.0,  1, 1, 1, 90000.0],   # Stable user
# #     [780, 1.0, 1, 50, 8,     0.0,     1, 1, 1, 100000.0],  # High salary, active
# #     [770, 0.0, 0, 42, 10,    500.0,   1, 1, 1, 80000.0],   # Long tenure, low complexity
# #     [760, 1.0, 0, 36, 4,     300.0,   1, 1, 1, 120000.0],  # Simple, active user
# #     [750, 0.0, 1, 38, 6,     0.0,     1, 1, 1, 75000.0],   # Clean, stable profile
# #     [740, 1.0, 0, 33, 3,     200.0,   1, 1, 1, 68000.0],   # Lower balance, fewer years
# # ])

# scaled_features = scaler.transform(all_features)
# predictions = logRegres.predict(scaled_features)
# print(predictions)



In [50]:
from sklearn.preprocessing import StandardScaler
import csv
import numpy as np

Geography = {"France": 0.0, "Spain": 1.0, "Germany": 2.0}
Gender = {"Female": 0.0, "Male": 1}

test_data = []

with open('smaller_churn_test_data_alternating_balance.csv', 'r') as read_obj:
    csv_reader = csv.reader(read_obj)
    next(csv_reader)  # Skip header

    for row in csv_reader:
        row = row[3:]  # Skip first 3 columns

        row[1] = Geography[row[1]]  # Geography
        row[2] = Gender[row[2]]     # Gender

        test_data.append(row)

# Convert to NumPy array and float
test_data = np.array(test_data).astype(np.float32)

# Apply the same scaler used during training
scaled_test_data = scaler.transform(test_data)

# Make predictions
y_pred_log = logRegres.predict(scaled_test_data)
y_pred_rf  = clf.predict(test_data)  # Random Forest doesn't need scaling

print("Logistic Regression predictions:\n", y_pred_log)
print("Random Forest predictions:\n", y_pred_rf)

Logistic Regression predictions:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1

In [51]:
# Age 20
print("Age 20: Users with different balances, salaries, and product counts")
age_20 = np.array([
    [500, 0, 1, 20, 5, 120000.0, 1, 1, 1, 175000.0],
    [500, 0, 1, 20, 5, 30000.0, 1, 1, 1, 50000.0],
    [500, 0, 1, 20, 5, 120000.0, 4, 1, 1, 175000.0],
    [500, 0, 1, 20, 5, 30000.0, 4, 1, 1, 50000.0]
])
print(clf.predict(age_20))

# Age 45
print("Age 45: Users with different balances, salaries, and product counts")
age_45 =  np.array([
    [500, 0, 1, 45, 5, 120000.0, 1, 1, 1, 175000.0],
    [500, 0, 1, 45, 5, 30000.0, 1, 1, 1, 50000.0],
    [500, 0, 1, 45, 5, 120000.0, 4, 1, 1, 175000.0],
    [500, 0, 1, 45, 5, 30000.0, 4, 1, 1, 50000.0]
])
print(clf.predict(age_45))

# Age 70
print("Age 70: Users with different balances, salaries, and product counts")
age_70 =  np.array([
    [500, 0, 1, 70, 5, 120000.0, 1, 1, 1, 175000.0],
    [500, 0, 1, 70, 5, 30000.0, 1, 1, 1, 50000.0],
    [500, 0, 1, 70, 5, 120000.0, 4, 1, 1, 175000.0],
    [500, 0, 1, 70, 5, 30000.0, 4, 1, 1, 50000.0]
])
print(clf.predict(age_70))

# Age 95
print("Age 95: Users with different balances, salaries, and product counts")
age_95 =  np.array([
    [500, 0, 1, 95, 5, 120000.0, 1, 1, 1, 175000.0],
    [500, 0, 1, 95, 5, 30000.0, 1, 1, 1, 50000.0],
    [500, 0, 1, 95, 5, 120000.0, 4, 1, 1, 175000.0],
    [500, 0, 1, 95, 5, 30000.0, 4, 1, 1, 50000.0]
])
print(clf.predict(age_95))

Age 20: Users with different balances, salaries, and product counts
[0. 0. 1. 0.]
Age 45: Users with different balances, salaries, and product counts
[0. 0. 1. 1.]
Age 70: Users with different balances, salaries, and product counts
[0. 0. 1. 1.]
Age 95: Users with different balances, salaries, and product counts
[0. 0. 1. 0.]
