In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load dataset
data = pd.read_csv('BankChurners.csv')

# Preprocess data
data_cleaned = data.drop(columns=['CLIENTNUM', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 
                                  'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'])

# Drop rows with 'Unknown' values
data_cleaned = data_cleaned[~data_cleaned['Education_Level'].isin(['Unknown'])]
data_cleaned = data_cleaned[~data_cleaned['Income_Category'].isin(['Unknown'])]
data_cleaned = data_cleaned[~data_cleaned['Marital_Status'].isin(['Unknown'])]

# Convert 'Gender' to numeric (M -> 1, F -> 0)
data_cleaned['Gender'] = data_cleaned['Gender'].map({'M': 1, 'F': 0})

# Convert 'Attrition_Flag' to binary target variable 'Churn'
data_cleaned['Churn'] = data_cleaned['Attrition_Flag'].apply(lambda x: 1 if x == 'Attrited Customer' else 0)

# Drop the 'Attrition_Flag' column as we now have 'Churn'
data_cleaned = data_cleaned.drop(columns=['Attrition_Flag'])

# Binning 'Customer_Age' into Age Group
age_bins = [18, 30, 50, 100]
age_labels = ['Young', 'Middle-Aged', 'Senior']
data_cleaned['Age_Group'] = pd.cut(data_cleaned['Customer_Age'], bins=age_bins, labels=age_labels, right=False)

# Create additional features: Avg_Open_To_Buy and Avg_Utilization_Ratio
data_cleaned['Avg_Open_To_Buy'] = data_cleaned['Credit_Limit'] - data_cleaned['Total_Revolving_Bal']
data_cleaned['Avg_Utilization_Ratio'] = data_cleaned['Total_Revolving_Bal'] / data_cleaned['Credit_Limit']

# Handle categorical variables using one-hot encoding
data_cleaned_encoded = pd.get_dummies(data_cleaned, drop_first=True)

# VIF Calculation
numerical_cols = data_cleaned_encoded.select_dtypes(include=['float64', 'int64']).columns
X = data_cleaned_encoded[numerical_cols].drop(columns=['Churn'])

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
print("VIF Data:")
print(vif_data[vif_data['VIF'] > 10])  # High VIF

# Drop collinear features based on VIF
data_cleaned_encoded_reduced = data_cleaned_encoded.drop(columns=['Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(data_cleaned_encoded_reduced.drop(columns=['Churn']), data_cleaned_encoded_reduced['Churn'], test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Predictions and Evaluation
y_pred = log_reg.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


VIF Data:
                 Feature        VIF
0           Customer_Age  75.872049
3         Months_on_book  56.943713
7           Credit_Limit        inf
8    Total_Revolving_Bal        inf
9        Avg_Open_To_Buy        inf
10  Total_Amt_Chng_Q4_Q1  13.694302
12        Total_Trans_Ct  22.857143
13   Total_Ct_Chng_Q4_Q1  11.786845
Accuracy: 0.9012

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1205
           1       0.73      0.54      0.62       212

    accuracy                           0.90      1417
   macro avg       0.83      0.75      0.78      1417
weighted avg       0.89      0.90      0.90      1417


Confusion Matrix:
[[1162   43]
 [  97  115]]


  vif = 1. / (1. - r_squared_i)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import Clustering
from sklearn.cluster import KMeans, DBSCAN
# Import preprocessing for LabelEncoder en OneHotEncoder
from sklearn import preprocessing
# Import scikit-learn metrics module
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Display options dataframes
pd.set_option('display.width',400)
pd.set_option('display.max_columns', 40)
# Display options numpy arrays
np.set_printoptions(edgeitems=10)
np.core.arrayprint._line_width = 180

credit = pd.read_csv("/home/becode/LearnAI/scikit/scikit/Unsupervised Learning/datasets_14701_19663_CC GENERAL.csv")
print(credit.head(10))
print(credit.shape)
print(credit.describe())
print(credit.info())
print(credit.isna().sum())
# replace NA's in MINIMUM_PAYMENTS and CREDIT_LIMIT, respectively 10 and 1 NA's and check afterwards
credit.MINIMUM_PAYMENTS.fillna(credit.MINIMUM_PAYMENTS.mean(), inplace=True)
credit.CREDIT_LIMIT.fillna(credit.CREDIT_LIMIT.mean(),inplace=True)
print(credit.isna().sum())

# Customer ID is irrelevant for clustering users
credit = credit.drop('CUST_ID', axis =1)

# print correlation heat map (Pearson's coeff)
"""
corr=credit.corr()
top_features=corr.index
plt.figure(figsize=(20,20))
sns.heatmap(credit[top_features].corr(),annot=True)
plt.show()
"""
# Look at data spread
sns.boxplot(data=credit)
plt.show()

# Standardizing the data
X = credit # or X = np.asarray(credit)
stan = preprocessing.StandardScaler()
X = stan.fit_transform(X)

"""
# Elbow curve : inertia vs k
n_clusters=20
cost=[]
for i in range(1,n_clusters):
    kmeans= KMeans(i)
    kmeans.fit(X)
    cost.append(kmeans.inertia_)
plt.plot(range(1,20),cost)
plt.xticks(range(1,20,2))
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
"""

# Silhouette curve : Silhouette vs k
"""
The silhouette coefficient can vary between -1 and +1: a coefficient close to +1 means that the instance is well 
inside its own cluster and far from other clusters, while a coefficient close to 0 means that it is close to a 
cluster boundary, and finally a coefficient close to -1 means that the instance may have been assigned to the wrong
cluster.
"""
"""
n_clusters=20
sil_scores=[]
for i in range(2,n_clusters):  # n_clusters can not be 1, took me a really long time to change the range from range(1, n) tp (2,n)
    kmeans = KMeans(i)
    labels = kmeans.fit_predict(X) # or kmeans.labels_ is the same
    sil_scores.append(silhouette_score(X,labels))
plt.plot(range(2,20),sil_scores)
plt.title('The Silhouette curve')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette')
plt.show()
"""
# Silhouette scores highest for k=3 but only marginal differnce, the score is too close to 0, 1 means good clustering

# Call KMeans and calculate silhouette score, can't calculate adjsuetd rand 'cause I don't have an actual y)
# Only 3 clusters
kmeans = KMeans(n_clusters=3, init='k-means++',n_init=10, max_iter=300, random_state=123 )
kmeans.fit_predict(X)
print(f"silhouette score = {silhouette_score(X,kmeans.labels_)}")
# I think I really need to take a look at the data and reorganize it, arrange the outliers !

# Facetgrid Plots per clusters:
"""
clusters = credit
clusters['Clusters'] = kmeans.labels_  # add column with cluster labels to a copy of our dataframe
print(clusters.head(10))
i=0
fig1 = plt.figure()
## number of co??
for column in clusters:
    i += 1
    grid = sns.FacetGrid(clusters, col='Clusters')
    grid.map(plt.hist,column)
    ax = fig1.add_subplot(len(clusters.columns.tolist()),1, i)
plt.show()
"""

# PCA to visualize clusters, my clustering is poor
dist = 1 - cosine_similarity(X)

pca = PCA(2)
pca.fit(dist)
X_PCA = pca.transform(dist)
print(X_PCA.shape)
x, y = X_PCA[:, 0], X_PCA[:, 1]

labels = kmeans.labels_

colors = {0: 'red',
          1: 'blue',
          2: 'green'}

names = {0: 'who make all type of purchases',
         1: 'more people with due payments',
         2: 'who purchases mostly in installments'}

df = pd.DataFrame({'x': x, 'y': y, 'label': labels})
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(20, 13))

for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5,
            color=colors[name], label=names[name], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')
    ax.tick_params(axis='y', which='both', left='off', top='off', labelleft='off')

ax.legend()
ax.set_title("Customers Segmentation based on their Credit Card usage bhaviour.")
plt.show()





"""
columns = ['BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'CREDIT_LIMIT',
           'PAYMENTS', 'MINIMUM_PAYMENTS']

for c in columns:
    Range = c + '_RANGE'
    data[Range] = 0
    data.loc[((data[c] > 0) & (data[c] <= 500)), Range] = 1
    data.loc[((data[c] > 500) & (data[c] <= 1000)), Range] = 2
    data.loc[((data[c] > 1000) & (data[c] <= 3000)), Range] = 3
    data.loc[((data[c] > 3000) & (data[c] <= 5000)), Range] = 4
    data.loc[((data[c] > 5000) & (data[c] <= 10000)), Range] = 5
    data.loc[((data[c] > 10000)), Range] = 6

columns = ['BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
           'CASH_ADVANCE_FREQUENCY', 'PRC_FULL_PAYMENT']

for c in columns:
    Range = c + '_RANGE'
    data[Range] = 0
    data.loc[((data[c] > 0) & (data[c] <= 0.1)), Range] = 1
    data.loc[((data[c] > 0.1) & (data[c] <= 0.2)), Range] = 2
    data.loc[((data[c] > 0.2) & (data[c] <= 0.3)), Range] = 3
    data.loc[((data[c] > 0.3) & (data[c] <= 0.4)), Range] = 4
    data.loc[((data[c] > 0.4) & (data[c] <= 0.5)), Range] = 5
    data.loc[((data[c] > 0.5) & (data[c] <= 0.6)), Range] = 6
    data.loc[((data[c] > 0.6) & (data[c] <= 0.7)), Range] = 7
    data.loc[((data[c] > 0.7) & (data[c] <= 0.8)), Range] = 8
    data.loc[((data[c] > 0.8) & (data[c] <= 0.9)), Range] = 9
    data.loc[((data[c] > 0.9) & (data[c] <= 1.0)), Range] = 10

columns = ['PURCHASES_TRX', 'CASH_ADVANCE_TRX']

for c in columns:
    Range = c + '_RANGE'
    data[Range] = 0
    data.loc[((data[c] > 0) & (data[c] <= 5)), Range] = 1
    data.loc[((data[c] > 5) & (data[c] <= 10)), Range] = 2
    data.loc[((data[c] > 10) & (data[c] <= 15)), Range] = 3
    data.loc[((data[c] > 15) & (data[c] <= 20)), Range] = 4
    data.loc[((data[c] > 20) & (data[c] <= 30)), Range] = 5
    data.loc[((data[c] > 30) & (data[c] <= 50)), Range] = 6
    data.loc[((data[c] > 50) & (data[c] <= 100)), Range] = 7
    data.loc[((data[c] > 100)), Range] = 8
"""
# from sklearn.metrics import silhouette_score
# silhouette_score(X, kmeans.labels_)

  np.core.arrayprint._line_width = 180


FileNotFoundError: [Errno 2] No such file or directory: '/home/becode/LearnAI/scikit/scikit/Unsupervised Learning/datasets_14701_19663_CC GENERAL.csv'