In [None]:
import os
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine, text

In [None]:
# Connection settings for the PostgreSQL database.
# Each value can be overridden with the matching libpq environment variable;
# the literals are the fallbacks used when the variable is not set.
user = os.environ.get("PGUSER", "aabounak")
# NOTE(review): a real password should not live in the notebook -- export
# PGPASSWORD instead and drop this fallback once nothing relies on it.
password = os.environ.get("PGPASSWORD", "mysecretpassword")
host = os.environ.get("PGHOST", "localhost")
port = os.environ.get("PGPORT", "5432")
database = os.environ.get("PGDATABASE", "piscineds")

def build_postgres_url(user, password, host, port, database):
    """Return a ``postgresql://`` URL for the given connection settings.

    The user name and password are percent-encoded so that reserved
    characters such as ``@``, ``:``, ``/`` or spaces cannot be misread as URL
    delimiters. Host, port and database are inserted unchanged.
    """
    # safe="" also encodes "/", which quote() leaves untouched by default.
    safe_user = quote(str(user), safe="")
    safe_password = quote(str(password), safe="")
    return f"postgresql://{safe_user}:{safe_password}@{host}:{port}/{database}"


def connect_to_postgres():
    """Create a SQLAlchemy engine from the notebook-level connection settings.

    Reads the module-level ``user``, ``password``, ``host``, ``port`` and
    ``database`` variables at call time.

    Returns:
        The object returned by ``sqlalchemy.create_engine``.
    """
    return create_engine(
        url=build_postgres_url(user, password, host, port, database)
    )

In [None]:
# Build the engine once; the query cell opens its connection from it.
engine = connect_to_postgres()

In [None]:
# Load the customers table (the query caps it at 10,000,000 rows) into memory.
with engine.connect() as connection:
    result = connection.execute(text('select * from customers limit 10000000'))
    # Read the column names inside the block so the lookup never depends on
    # the result still being usable after the connection has been released.
    columns_names = list(result.keys())
    data = result.fetchall()

# Build the frame from the fetched rows, labelled with the query's columns.
df = pd.DataFrame(data, columns=columns_names)
df.head()

In [None]:
# Select seaborn's "darkgrid" style for the plots.
sns.set_style("darkgrid")

In [None]:
# Convert the event_time column to pandas datetimes.
event_timestamps = pd.to_datetime(df['event_time'])
df['event_time'] = event_timestamps

# Keep only the rows whose event_type is 'purchase'.
is_purchase = df['event_type'].eq('purchase')
purchase_data = df[is_purchase]

In [None]:
# Work on an independent copy of the purchase rows.
purchase_data = purchase_data.copy()

# One row per user_id with the number of purchase events in 'purchases'.
grouped_data = (
    purchase_data
    .groupby('user_id')
    .size()
    .reset_index(name='purchases')
)

# Keep users with fewer than 30 purchases, largest counts first.
under_thirty = grouped_data['purchases'] < 30
freq = grouped_data[under_thirty].sort_values(by='purchases', ascending=False)

In [None]:
# Elbow method: fit k-means for k = 1..10 and plot the inertia for each k.
data = freq.copy()

# Cluster on the purchase counts only. `user_id` is an identifier, not a
# behavioural feature; scaling it alongside `purchases` would let arbitrary
# id values drive the distances that k-means minimises.
feature_columns = ['purchases']
scaled_data = StandardScaler().fit_transform(data[feature_columns])

kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "random_state": 1,  # fixed seed so the curve is the same on every run
}

sse = []
n_cluster = range(1, 11)
for k in n_cluster:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_data)
    # inertia_: sum of squared distances of samples to their closest cluster center.
    sse.append(kmeans.inertia_)

plt.plot(n_cluster, sse)
plt.xlabel("Number of clusters")
plt.ylabel("SSE (inertia)")
plt.title("The Elbow Method")
plt.show()