In [1]:
import pandas as pd
import chardet

In [2]:
# Read the raw shopping CSV (decoded as ISO-8859-1) and preview the first rows.
file_path = 'shopping_data.csv'
df_shopping = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")
df_shopping.head(n=5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [3]:
# Display the column labels of the loaded frame.
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [4]:
# Display the dtype pandas inferred for each column.
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [5]:
# Report how many missing values each column contains.
null_counts = df_shopping.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [6]:
# Discard every row that has a missing value in any column.
df_shopping = df_shopping.dropna(axis="index", how="any")

In [7]:
# Count rows that are exact repeats of an earlier row and report the total.
n_duplicate_rows = df_shopping.duplicated().sum()
print(f' Duplicated Rows: {n_duplicate_rows}')

 Duplicated Rows: 0


In [8]:
# Drop the CustomerID column, then preview the remaining columns.
# Reassigning the result (rather than mutating in place) keeps the step
# chainable; the frame bound to df_shopping ends up with the same contents.
df_shopping = df_shopping.drop(columns=['CustomerID'])
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [9]:
# Encode the card-member flag as an integer: "Yes" -> 1, everything else -> 0.
def change_string(member):
    """Return 1 when ``member`` equals the string "Yes", otherwise 0.

    The comparison is exact (case- and whitespace-sensitive), so any other
    value -- including "No", "yes", or a missing value -- maps to 0.
    """
    return 1 if member == "Yes" else 0

# Replace the Yes/No labels with their 1/0 encoding, element by element.
card_member_flags = df_shopping["Card Member"].map(change_string)
df_shopping["Card Member"] = card_member_flags
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [10]:
# Rescale Annual Income by a factor of 1/1000.
# Note: re-running this cell divides the already-scaled values again.
df_shopping["Annual Income"] = df_shopping["Annual Income"].div(1000)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [11]:
# Build a copy of the frame with snake_case column names; df_shopping itself
# keeps its original labels because rename returns a new frame.
snake_case_names = {
    'Card Member': 'card_member',
    'Age': 'age',
    'Annual Income': 'annual_income',
    'Spending Score (1-100)': 'spending_score',
}
df_shop = df_shopping.rename(columns=snake_case_names)
print(df_shop)

     card_member   age  annual_income  spending_score
0              1  19.0           15.0            39.0
1              1  21.0           15.0            81.0
2              0  20.0           16.0             6.0
3              0  23.0           16.0            77.0
4              0  31.0           17.0            40.0
..           ...   ...            ...             ...
198            0  35.0          120.0            79.0
199            0  45.0          126.0            28.0
200            1  32.0          126.0            74.0
201            1  32.0          137.0            18.0
202            1  30.0          137.0            83.0

[200 rows x 4 columns]


In [12]:
# Preview df_shopping; the earlier rename was assigned to df_shop, so this
# frame still carries the original column labels.
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [None]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): this saves df_shopping (original column labels); the
# snake_case copy df_shop is not written anywhere -- confirm that is intended.
file_path = "shopping_data_cleaned.csv"
df_shopping.to_csv(path_or_buf=file_path, index=False)