**Import needed modules**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

**Read data**

In [None]:
# Load the mall-customers CSV into the working frame used by every later cell.
# NOTE(review): this absolute /kaggle/input path presumably only resolves inside a Kaggle kernel — confirm before running elsewhere.
df = pd.read_csv("/kaggle/input/mall-customers/Mall_Customers.csv")

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

### **Exploratory Data Analysis**

**Univariate Analysis**

In [None]:
# Summary statistics (count, mean, std, min/max, quartiles) of the frame's columns.
df.describe()

In [None]:
# Distribution of annual income: density histogram with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# `kde=True`, density normalisation and `cut=3` is its documented replacement.
sns.histplot(data=df, x='Annual Income (k$)', kde=True, stat='density', kde_kws=dict(cut=3));

In [None]:
# List the column labels so they can be copied verbatim into later cells.
df.columns


In [None]:
# Distribution of each numeric feature, one figure per column.
# Uses `histplot(kde=True)` because `sns.distplot` is deprecated and scheduled
# for removal from seaborn.
columns = ['Age', 'Annual Income (k$)','Spending Score (1-100)']
for col in columns:
    plt.figure()  # new figure per feature so the plots do not overlay each other
    sns.histplot(data=df, x=col, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Income density split by gender.
# `fill=True` replaces the deprecated `shade=True`; column names are passed with
# `data=` so seaborn resolves both the variable and the hue from the same frame.
sns.kdeplot(data=df, x='Annual Income (k$)', hue='Gender', fill=True);

In [None]:
# Density of each numeric feature split by gender, one figure per column.
# `fill=True` replaces the deprecated `shade=True`.
columns = ['Age', 'Annual Income (k$)','Spending Score (1-100)']
for col in columns:
    plt.figure()  # new figure per feature so the plots do not overlay each other
    sns.kdeplot(data=df, x=col, hue='Gender', fill=True)

In [None]:
# Box plot of every numeric feature broken down by gender, one figure each.
columns = ['Age', 'Annual Income (k$)','Spending Score (1-100)']
for feature in columns:
    plt.figure()
    # The column is referenced by name; seaborn looks it up in `data`.
    sns.boxplot(data=df, x='Gender', y=feature)

In [None]:
# Share of each gender value, as proportions rather than raw counts.
df['Gender'].value_counts(normalize=True)

**Bivariate Analysis**

In [None]:
# Scatter of spending score against annual income, one point per row of df.
sns.scatterplot(data=df, x='Annual Income (k$)',y='Spending Score (1-100)' )

In [None]:
# Pairwise relationships between the frame's numeric columns, coloured by gender.
sns.pairplot(df,hue='Gender')

In [None]:
# Mean age, income and spending score per gender.
# Selecting several columns from a GroupBy needs a list (double brackets);
# the bare-tuple form raises in pandas >= 2.0.
df.groupby(['Gender'])[['Age', 'Annual Income (k$)',
       'Spending Score (1-100)']].mean()

In [None]:
# Pairwise correlation of the numeric columns.
# `numeric_only=True` is required on pandas >= 2.0, where the string `Gender`
# column would otherwise make `corr()` raise.
df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the numeric-column correlations.
# `numeric_only=True` keeps the string `Gender` column out of `corr()`, which
# would otherwise raise on pandas >= 2.0.
sns.heatmap(df.corr(numeric_only=True),annot=True,cmap='coolwarm')

### **Clustering**

**Univariate**

In [None]:
# k-means model for the income-only segmentation (k = 3).
# A fixed seed keeps the cluster numbering stable between runs; `n_init=10` pins
# the number of restarts so the result does not depend on the installed
# scikit-learn version's default.
clustering1 = KMeans(n_clusters=3, n_init=10, random_state=42)

In [None]:
# Fit on annual income only; the double brackets pass a one-column DataFrame rather than a Series.
clustering1.fit(df[['Annual Income (k$)']])

In [None]:
# Cluster index assigned to each fitted row.
clustering1.labels_

In [None]:
# Attach the income-cluster label to every customer row (mutates df in place), then preview.
df['Income Cluster'] = clustering1.labels_
df.head()

In [None]:
# Number of customers that landed in each income cluster.
df['Income Cluster'].value_counts()

In [None]:
# Within-cluster sum of squared distances of the fitted model (lower = tighter clusters).
clustering1.inertia_

In [None]:
# Inertia for k = 1..10 on annual income, used for the elbow plot.
# Each model is seeded (with an explicit n_init) so the curve is reproducible
# from a fresh kernel instead of changing with every run.
intertia_scores=[]
for i in range(1,11):
    kmeans=KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(df[['Annual Income (k$)']])
    intertia_scores.append(kmeans.inertia_)

In [None]:
# Show the raw inertia values collected by the loop.
intertia_scores

In [None]:
# Elbow plot: inertia against the number of clusters tried.
# The x range is derived from the list length so it cannot drift from the loop
# bounds, and the axes are labelled so the figure reads on its own.
plt.plot(range(1, len(intertia_scores) + 1), intertia_scores)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow curve');

In [None]:
# Re-list the column labels.
df.columns

In [None]:
# Mean age, income and spending score within each income cluster.
# Selecting several columns from a GroupBy needs a list (double brackets);
# the bare-tuple form raises in pandas >= 2.0.
df.groupby('Income Cluster')[['Age', 'Annual Income (k$)',
       'Spending Score (1-100)']].mean()

**Bivariate**

In [None]:
# Segment customers on income and spending score together (k = 5).
# A fixed seed and explicit n_init keep the cluster numbering stable between
# runs, so the per-cluster tables and plots stay comparable.
clustering2 = KMeans(n_clusters=5, n_init=10, random_state=42)
clustering2.fit(df[['Annual Income (k$)','Spending Score (1-100)']])
# Store the label on each customer row (mutates df in place), then preview.
df['Spending and Income Cluster'] =clustering2.labels_
df.head()

In [None]:
# Elbow analysis for the income + spending-score feature pair, k = 1..10.
# Each model is seeded (with an explicit n_init) so the curve is reproducible.
intertia_scores2=[]
for i in range(1,11):
    kmeans2=KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans2.fit(df[['Annual Income (k$)','Spending Score (1-100)']])
    intertia_scores2.append(kmeans2.inertia_)
# x range derived from the list so it always matches the number of fits.
plt.plot(range(1, len(intertia_scores2) + 1), intertia_scores2)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow curve: income and spending score');

In [None]:
# Cluster centres of the bivariate model as a two-column frame:
# 'x' and 'y' are the first and second coordinates of each centre.
centers = pd.DataFrame(clustering2.cluster_centers_, columns=['x', 'y'])

In [None]:
# Income vs. spending score coloured by bivariate cluster, with the cluster
# centres drawn as black stars; the figure is also written to disk.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(x=centers['x'], y=centers['y'], s=100, c='black', marker='*')
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Spending and Income Cluster',
    palette='tab10',
    ax=ax,
)
fig.savefig('clustering_bivaraiate.png')

In [None]:
# Gender mix within each bivariate cluster; normalize='index' makes every row sum to 1.
pd.crosstab(df['Spending and Income Cluster'],df['Gender'],normalize='index')

In [None]:
# Mean age, income and spending score within each bivariate cluster.
# Selecting several columns from a GroupBy needs a list (double brackets);
# the bare-tuple form raises in pandas >= 2.0.
df.groupby('Spending and Income Cluster')[['Age', 'Annual Income (k$)',
       'Spending Score (1-100)']].mean()

**Multivariate**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardise the features before multivariate clustering.
scale = StandardScaler()

In [None]:
# Preview the frame, which by now also carries the cluster label columns.
df.head()

In [None]:
# One-hot encode the non-numeric columns into a separate frame; drop_first removes
# the first level of each, so Gender is expected to become a single Gender_Male indicator.
dff = pd.get_dummies(df,drop_first=True)
dff.head()

In [None]:
# List the encoded frame's columns to pick the model inputs from.
dff.columns

In [None]:
# Restrict the encoded frame to the four model inputs, in this fixed order
# (identifier and cluster-label columns are left out).
dff = dff.loc[:, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Gender_Male']]
dff.head()

In [None]:
# Fit the scaler on the selected features and standardise them.
# NOTE: this rebinds dff from a DataFrame to the value returned by fit_transform,
# so the column labels are not carried forward from here.
dff = scale.fit_transform(dff)

In [None]:
# Wrap the already-standardised values in a DataFrame for display and modelling.
# The scaler is deliberately not refit here: a second `fit_transform` on
# standardised data leaves the values effectively unchanged but overwrites the
# scaler's learned mean/scale with ~0/1, so `scale` could no longer map results
# back to the original units.
dff = pd.DataFrame(dff)
dff.head()

In [None]:
# Elbow analysis on the standardised multivariate features, k = 1..10.
# Each model is seeded (with an explicit n_init) so the curve is reproducible.
intertia_scores3=[]
for i in range(1,11):
    kmeans3=KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans3.fit(dff)
    intertia_scores3.append(kmeans3.inertia_)
# x range derived from the list so it always matches the number of fits.
plt.plot(range(1, len(intertia_scores3) + 1), intertia_scores3)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow curve: standardised features');

In [None]:
# Display the customer frame including the cluster label columns added earlier.
df

In [None]:
# Persist the labelled customers to the working directory.
# The row index is written too (pandas default), as an unnamed first column.
df.to_csv('Clustering.csv')