## Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Function


In [None]:
# Remove Outliers
def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the fences, with their original index.
        Rows whose ``col`` value is missing are dropped, because a missing
        value fails both comparisons.
    """
    values = df[col]
    # Series.quantile skips missing values; np.quantile would return NaN as soon
    # as one value is missing, which turns both fences into NaN and filters out
    # every row.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return df[(values >= lower_bound) & (values <= upper_bound)]

Import Data


In [None]:
# Load dataset into a DataFrame; the relative path is resolved against the
# current working directory of the kernel.
data = pd.read_csv("Mall_Customers.csv")

Explore Data


In [None]:
# Preview the first rows of the dataset
data.head()

In [None]:
# Preview the last rows of the dataset
data.tail()

In [None]:
# Summary statistics of the columns
data.describe()

In [None]:
# Data type of each column
data.dtypes

Cleaning of the Data


In [None]:
# Normalise the raw column headers to snake_case and express income in dollars.
COLUMN_RENAMES = {
    "CustomerID": "customer_id",
    "Age": "age",
    "Gender": "gender",
    "Annual Income (k$)": "annual_income",
    "Spending Score (1-100)": "spending_score",
}

# The raw "Annual Income (k$)" header only exists before this cell has run.
# Guarding on it makes the cell safe to re-run: without the guard every re-run
# would multiply annual_income by 1000 again.
if "Annual Income (k$)" in data.columns:
    # rename() returns a new frame, so no inplace mutation is needed
    data = data.rename(columns=COLUMN_RENAMES)
    # convert income from k$ to $
    data["annual_income"] = data["annual_income"] * 1000

In [None]:
# Column names, dtypes and non-null counts
data.info()

In [None]:
# Check for outliers: annual income spread per gender, with the group means printed
sns.boxplot(data=data, x="gender", y="annual_income")

# The f-strings use double quotes outside and plain variables inside: reusing the
# outer quote character inside {...} is a SyntaxError on Python versions before 3.12.
male_income_mean = data.loc[data["gender"] == "Male", "annual_income"].mean()
female_income_mean = data.loc[data["gender"] == "Female", "annual_income"].mean()
print(f"M Mean = {male_income_mean}")
print(f"F Mean = {female_income_mean}")
plt.show()

In [None]:
# Density estimate of annual income over all customers
sns.kdeplot(x="annual_income", data=data)
plt.show()

As the data are normally distributed and the number of outliers is tiny, I decided to keep them: they are meaningful observations and have only a small impact on the insights.


In [None]:
# Check for outliers: spending score spread per gender, with the group means printed
sns.boxplot(data=data, x="gender", y="spending_score")

# The f-strings use double quotes outside and plain variables inside: reusing the
# outer quote character inside {...} is a SyntaxError on Python versions before 3.12.
male_score_mean = data.loc[data["gender"] == "Male", "spending_score"].mean()
female_score_mean = data.loc[data["gender"] == "Female", "spending_score"].mean()
print(f"M Mean = {male_score_mean}")
print(f"F Mean = {female_score_mean}")
# Render the figure explicitly, as every other plotting cell in the notebook does
plt.show()

### Visualization to investigate meaningful insights


In [None]:
# Pairwise relationships between the numeric features.
# customer_id is only a row identifier, so it is excluded to avoid panels that
# carry no information; errors="ignore" keeps the cell working if the column is
# absent.
sns.pairplot(data.drop(columns="customer_id", errors="ignore"))
plt.show()

In [None]:
# Customers per gender: drawn as bars, with the exact counts printed as a table
sns.countplot(x="gender", data=data)
gender_counts = data["gender"].value_counts()
print(gender_counts)
plt.show()

In [None]:
# Density estimate of customer age
sns.kdeplot(x="age", data=data)
plt.show()

Group ages into brackets to make visualization easier


In [None]:
# Sorted distinct values of the age column
np.sort(data["age"].unique())

In [None]:
# Bucket the raw ages into labelled brackets for the grouped plots.
#
# A single pd.cut assignment replaces the chained
# `data["age"].replace(..., inplace=True)` calls: those act on a temporary
# Series, which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
#
# Bins are right-inclusive: (17, 25], (25, 35], (35, 45], (45, 55], (55, inf).
# The open-ended last bin also covers ages above 70, which the original ranges
# left unbinned. Ages of 17 or below fall outside every bin and become NaN.
AGE_BIN_EDGES = [17, 25, 35, 45, 55, np.inf]
# The last label is "56+" rather than "55+", because 55 belongs to "46-55".
AGE_BIN_LABELS = ["18-25", "26-35", "36-45", "46-55", "56+"]

# Only bin while the column is still numeric, so re-running the cell is a no-op.
if pd.api.types.is_numeric_dtype(data["age"]):
    data["age"] = pd.cut(data["age"], bins=AGE_BIN_EDGES, labels=AGE_BIN_LABELS)

In [None]:
# Number of customers in each age group
sns.countplot(x="age", data=data)
plt.show()

Investigation of Annual Income


In [None]:
# Density of annual income; label and title are set on the Axes seaborn returns
sns.kdeplot(x="annual_income", data=data).set(
    xlabel="Annual Income",
    title="Annual Income Distribution",
)
plt.show()

In [None]:
# Annual income per age group, drawn as bars
sns.barplot(x="age", y="annual_income", data=data).set(
    xlabel="Age",
    ylabel="Annual Income",
    title="Annual Income Distribution by Age Group",
)
plt.show()

In [None]:
# Mean annual income per gender: drawn as bars and printed as a table
sns.barplot(x="gender", y="annual_income", estimator="mean", data=data).set(
    xlabel="Gender",
    ylabel="Annual Income",
    title="Annual Income Distribution by Gender",
)
income_by_gender = data.groupby("gender")["annual_income"].mean()
print(income_by_gender)
plt.show()

In [None]:
# Annual income against spending score, one colour per gender
sns.scatterplot(data=data, x="spending_score", y="annual_income", hue="gender")
plt.xlabel("Spending Score")
plt.ylabel("Annual Income")
plt.title("Distribution of Annual Income & Spending Score")
# A bare plt.legend() rebuilds the legend from the labelled artists and drops
# the title seaborn set for the hue variable, so the title is passed explicitly.
plt.legend(title="Gender")
plt.show()

Investigation of Spending Score Column


In [None]:
# Density of spending score; label and title are set on the Axes seaborn returns
sns.kdeplot(x="spending_score", data=data).set(
    xlabel="Spending Score",
    title="Spending Score Distribution",
)
plt.show()

In [None]:
# Spending score per age group, drawn as bars
sns.barplot(x="age", y="spending_score", data=data).set(
    xlabel="Age",
    ylabel="Spending Score",
    title="spending score Distribution by Age Group",
)
plt.show()

In [None]:
# Mean spending score per gender: drawn as bars and printed as a table
sns.barplot(data=data, x="gender", y="spending_score", estimator="mean")
# Print the Series directly. The original wrapped it in an f-string that reused
# the outer single quotes inside {...}, which is a SyntaxError before Python 3.12;
# the printed text is the same either way.
print(data.groupby("gender")["spending_score"].mean())
plt.xlabel("Gender")
plt.ylabel("Spending Score")
plt.title("Spending Score Distribution by Gender")
plt.show()