In [None]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Load the raw dataset from the file named 'data' (read as CSV) and display it.
df = pd.read_csv('data')
df

In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Look at the raw 'Dt_Customer' values as they were read from the file.
df['Dt_Customer']

In [None]:
# Re-read the file into a working frame with 'Dt_Customer' parsed as dates.
# The column is stored day-first (this notebook converts it elsewhere with
# the explicit '%d-%m-%Y' format), so `dayfirst=True` is passed; without it
# the parser may read ambiguous values such as 04-09-2012 month-first.
df_copy = pd.read_csv('data', parse_dates=['Dt_Customer'], dayfirst=True)
df_copy

In [None]:
# Convert the raw day-month-year strings from `df` with an explicit format
# and store the result on the working frame, then confirm the new dtype.
parsed_dates = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')
df_copy['Dt_Customer'] = parsed_dates
df_copy.info()

In [None]:
# Descriptive statistics, transposed so that each column becomes one row.
df_copy.describe().T

In [None]:
# Count missing values per column.
df_copy.isna().sum()

In [None]:
# Drop every row that contains at least one missing value.
df_copy = df_copy.dropna()

In [None]:
# Order rows by 'Dt_Customer'; ignore_index=True renumbers the index 0..n-1,
# which also removes the gaps left by the rows dropped above.
df_copy = df_copy.sort_values(by=['Dt_Customer'], ignore_index=True)
df_copy

In [None]:
# Year against which ages are computed (fixed so results do not drift with
# the date on which the notebook is run).
REFERENCE_YEAR = 2024

# Age = reference year minus the year of birth.
df_copy['Age'] = REFERENCE_YEAR - df_copy['Year_Birth']
df_copy

In [None]:
# Names of all columns whose label starts with 'Mnt'.
is_mnt = df_copy.columns.str.startswith('Mnt')
mnt_cols = df_copy.columns[is_mnt].tolist()
mnt_cols

In [None]:
# Row-wise sum over the columns listed in `mnt_cols`.
df_copy['Total_Amount'] = df_copy[mnt_cols].sum(axis=1)

In [None]:
# Names of all columns whose label ends with 'Purchases'.
is_purchase = df_copy.columns.str.endswith('Purchases')
purc_cols = df_copy.columns[is_purchase].tolist()
purc_cols

In [None]:
# Row-wise sum over the columns listed in `purc_cols`.
df_copy['Total_Purchase'] = df_copy[purc_cols].sum(axis=1)

In [None]:
# Remove the 'ID' and 'Year_Birth' columns from the working frame.
df_copy = df_copy.drop(columns=['ID', 'Year_Birth'])

In [None]:
# Grid of 20-bin histograms, one panel per column that DataFrame.hist can plot.
df_copy.hist(grid=False, bins=20, figsize=(12, 10))
# tight_layout avoids overlapping panel titles; the trailing ';' hides the repr.
plt.tight_layout();

In [None]:
# Plot 'Total_Amount' with 'Dt_Customer' as the x-axis (DataFrame.plot default kind).
df_copy[['Dt_Customer', 'Total_Amount']].set_index('Dt_Customer').plot(figsize=(10, 8))

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns,
# drawn on an explicitly created 20x10 figure.
fig, ax = plt.subplots(figsize=(20, 10))
corr_matrix = df_copy.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# Remove the 'Dt_Customer' column from the working frame.
df_copy = df_copy.drop(columns='Dt_Customer')

In [None]:
# Partition the column names, in frame order, into numeric and non-numeric.
num_cols, cat_cols = [], []
for column in df_copy.columns:
  if pd.api.types.is_numeric_dtype(df_copy[column]):
    num_cols.append(column)
  else:
    cat_cols.append(column)

In [None]:
# Show which columns were classified as non-numeric.
cat_cols

In [None]:
# Number of distinct values in 'Education'.
df_copy['Education'].nunique()

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# non-numeric ones, then stack both outputs into a single feature matrix.
scaler = StandardScaler()
onehot = OneHotEncoder()

# sparse_threshold=0 forces a dense ndarray. OneHotEncoder emits a sparse
# matrix by default, and ColumnTransformer would return the stacked result
# as scipy-sparse whenever its overall density falls below the default
# threshold (0.3), which would break the later conversion to a DataFrame.
preprocessor = ColumnTransformer(
  [('numeric', scaler, num_cols), ('categorical', onehot, cat_cols)],
  sparse_threshold=0,
)

df_copy_scaled = preprocessor.fit_transform(df_copy)

In [None]:
# Display the transformed feature matrix.
df_copy_scaled

In [None]:
# (rows, features) of the transformed matrix.
df_copy_scaled.shape

In [None]:
# Wrap the transformed matrix in a DataFrame labelled with the transformer's
# output feature names.
df_copy_ = pd.DataFrame(df_copy_scaled, columns=preprocessor.get_feature_names_out())

In [None]:
# Elbow method: fit k-means for every k in `range_values` and record the
# inertia (within-cluster sum of squares) of each fit.
scores_1 = []

range_values = range(1, 20)

for n_clusters in range_values:
  # Fixed seed so the curve is identical on every Restart & Run All.
  kmeans = KMeans(n_clusters=n_clusters, random_state=42)
  kmeans.fit(df_copy_scaled)
  scores_1.append(kmeans.inertia_)

# Plot against the actual k values; plotting `scores_1` alone would use the
# list indices 0..18 and shift the 'Clusters' axis by one.
plt.plot(range_values, scores_1, 'bx-')
plt.xticks(range_values)
plt.xlabel('Clusters')
plt.ylabel('Scores');

In [None]:
# Final model: 10 clusters. The seed is fixed so that cluster ids (and hence
# every per-cluster plot and the saved model) are reproducible across runs.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(df_copy_scaled)
# Cluster id assigned to each training row by this fit.
labels = kmeans.labels_

In [None]:
# (n_clusters, n_features) of the fitted cluster centres.
kmeans.cluster_centers_.shape

In [None]:
# Disabled: tabulate the cluster centres with one column per transformed feature.
# cluster_centers = pd.DataFrame(data = kmeans.cluster_centers_, columns = df_copy_.columns)
# cluster_centers

In [None]:
# Assign each row to its nearest centre of the ALREADY fitted model.
# `fit_predict` would re-fit k-means from scratch, so the resulting ids could
# disagree with `labels` (taken from the earlier fit) and the model object
# would no longer be the one those labels came from.
y_kmeans = kmeans.predict(df_copy_scaled)
y_kmeans

In [None]:
# Transformed features plus a trailing 'cluster' column holding each row's
# cluster id from `labels`.
df_copy_cluster = df_copy_.assign(cluster=labels)
df_copy_cluster

In [None]:
# For every transformed feature, draw one figure with a histogram panel per
# cluster. The cluster ids come from the data instead of a hard-coded count,
# so every cluster that was actually assigned gets a panel.
cluster_ids = sorted(df_copy_cluster['cluster'].unique())

for i in df_copy_.columns:
  plt.figure(figsize = (30, 5))
  for position, j in enumerate(cluster_ids, start=1):
    plt.subplot(1, len(cluster_ids), position)
    cluster = df_copy_cluster[df_copy_cluster['cluster'] == j]
    cluster[i].hist(bins = 20)
    plt.title('{}    \nCluster {} '.format(i,j))
  # Render and release each figure as soon as it is complete, so the loop
  # does not keep one open figure per feature in memory.
  plt.show()

In [None]:
# Persist the fitted k-means model. The context manager guarantees the file
# is flushed and closed even if pickling raises.
# NOTE(review): "data" is the same path this notebook reads its input from
# with pd.read_csv — confirm the model is meant to go to a separate file,
# otherwise the input is overwritten and a re-run will fail.
with open("data", "wb") as model_file:
  pickle.dump(kmeans, model_file)