# COGS 118B - Final Project

In [None]:
#import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from the CSV file.
data = pd.read_csv('CreditCard_dataset.csv')

# Drop the customer ID column.
data.drop(columns=['CUST_ID'], inplace=True)

# Replace null values in MINIMUM_PAYMENTS with zeros.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, is
# deprecated in pandas 2.x, and leaves `data` untouched under Copy-on-Write.
data['MINIMUM_PAYMENTS'] = data['MINIMUM_PAYMENTS'].fillna(0)
data

In [None]:
# Count the missing values in each column and display the totals.
null_count = data.isnull().sum()
null_count

In [None]:
# Remove, in place, every row that still contains at least one null value.
data.dropna(axis=0, how='any', inplace=True)
data

In [None]:
# Recount the missing values per column and display the totals.
null_count = data.isnull().sum()
null_count

In [None]:
# Display the column labels of the data frame.
data.columns

In [None]:
# Pairwise correlation between all columns.
corr = data.corr()

# Hide the upper triangle (diagonal included) so that each pair of
# columns appears only once in the heatmap.
mask = np.triu(np.ones(corr.shape, dtype=bool))
sns.heatmap(corr, mask=mask)

In [None]:
# Histogram (30 bins) of the 'BALANCE' column with a KDE curve overlaid,
# drawn on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='BALANCE', bins=30, kde=True)

# Label the axes and title the figure, then render it.
plt.xlabel('Balance')
plt.ylabel('Frequency')
plt.title('Distribution of Customer Balances')
plt.show()

In [None]:
# Histogram (30 bins) of the 'PAYMENTS' column with a KDE curve overlaid,
# drawn on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='PAYMENTS', bins=30, kde=True)

# Label the axes and title the figure, then render it.
plt.xlabel('Payments')
plt.ylabel('Frequency')
plt.title('Distribution of Customer Payments')
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV

# Elbow method on the unscaled features: fit K-means for k = 1..29 and
# record the inertia of each fitted model.
K = range(1, 30)
sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k, n_init=10).fit(data)
    sum_of_squared_distances.append(km.inertia_)

# Plot inertia against k.
plt.plot(K, sum_of_squared_distances, linestyle='--', marker='o')
plt.title('Optimal k selection')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.show()

In [None]:
def distortion_measure(points: np.ndarray, centroids: np.ndarray) -> float:
    """Return the summed Euclidean distance from each point to its nearest centroid.

    Args:
        points: 2-D array-like of shape (n_points, n_features). Both NumPy
            arrays and pandas DataFrames are accepted.
        centroids: 2-D array-like of shape (n_centroids, n_features).

    Returns:
        The sum, over all points, of the distance to the closest centroid.
        Returns 0.0 when ``points`` is empty.

    Raises:
        ValueError: If ``points`` is non-empty but ``centroids`` is empty.
    """
    # np.asarray handles DataFrames and ndarrays alike, so callers are not
    # forced to pass an object exposing a ``.values`` attribute.
    pts = np.asarray(points, dtype=float)
    cents = np.asarray(centroids, dtype=float)

    if pts.size == 0:
        return 0.0
    if cents.size == 0:
        raise ValueError("centroids must contain at least one centroid")

    # Keep a running per-point minimum while iterating over centroids; this
    # is vectorized over points yet never allocates an
    # (n_points, n_centroids, n_features) intermediate.
    nearest = np.full(pts.shape[0], np.inf)
    for c in cents:
        np.minimum(nearest, np.linalg.norm(pts - c, axis=1), out=nearest)
    return float(nearest.sum())

In [None]:
# Fit K-means on the unscaled features for k = 1..29 and record the
# distortion of the data with respect to each model's centroids.
K = range(1, 30)
distortion_measure_error = []
for k in K:
    km = KMeans(n_clusters=k, n_init=10).fit(data)
    score = distortion_measure(data, km.cluster_centers_)
    distortion_measure_error.append(score)

# Plot distortion against k.
plt.plot(K, distortion_measure_error, marker='o', linestyle='--')
plt.title('Optimal k selection')
plt.xlabel('k')
plt.ylabel('Distortion Measure')
plt.show()

In [None]:
# Standardize every column with StandardScaler and wrap the result in a
# data frame that reuses the original column labels.
scaler = StandardScaler()
scale_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
scale_data

In [None]:
# Elbow method on the standardized features: fit K-means for k = 1..29
# and record the inertia of each fitted model.
K = range(1, 30)
sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k, n_init=10).fit(scale_data)
    sum_of_squared_distances.append(km.inertia_)

# Plot inertia against k.
plt.plot(K, sum_of_squared_distances, linestyle='--', marker='o')
plt.title('Optimal k selection')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.show()

In [None]:
# Fit K-means on the standardized features for k = 1..29 and record the
# distortion with respect to each model's centroids.
distortion_measure_error = []
K = range(1, 30)
for k in K:
    km = KMeans(n_clusters=k, n_init=10)
    km = km.fit(scale_data)
    # The centroids live in standardized feature space, so the distortion
    # must be measured on scale_data; comparing them against the unscaled
    # `data` mixes two coordinate systems and yields a meaningless curve.
    score = distortion_measure(scale_data, km.cluster_centers_)
    distortion_measure_error.append(score)

# Plot distortion against k.
plt.plot(K, distortion_measure_error, linestyle='--', marker='o')
plt.xlabel('k')
plt.ylabel('Distortion Measure')
plt.title('Optimal k selection')
plt.show()