## Import Libraries

In [4]:
import pandas as pd

## Loading the Data

In [5]:
# Path is relative to the notebook's working directory.
expenses_csv = 'datasets/expenses.csv'

# Read the raw expenses table into `df`; the cleaning cells below work on it.
df = pd.read_csv(expenses_csv)

# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Data Cleaning

### Todo List

- [x] Check for missing values
- [x] Examine data types and convert if necessary
- [x] Remove duplicate entries
- [x] Handle outliers in numerical columns
- [x] Encode categorical variables
- [x] Normalize or standardize numerical features

In [None]:
# Missing values: count the null entries in each column of `df`.
# A column reporting 0 has no gaps to impute or drop.
null_counts = df.isna().sum()
print("Missing values per column:", null_counts, sep="\n")

Missing values per column:
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [None]:
# Data types: list the dtype pandas inferred for every column, to spot
# columns that still need conversion or encoding (e.g. `object` columns).
column_dtypes = df.dtypes
print("Data types:")
print(column_dtypes)

Data types:
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


In [None]:
# Convert binary variables to 0/1
#
# The cell is written to be safe to re-run: a column that is already numeric
# is left alone (mapping 0/1 through the label dict again would turn every
# value into NaN), and a label that is not in the mapping raises instead of
# silently becoming NaN.
binary_codes = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
}

for col, codes in binary_codes.items():
    if pd.api.types.is_numeric_dtype(df[col]):
        # Already encoded by a previous run of this cell.
        continue
    encoded = df[col].map(codes)
    # Values that were present in the raw column but have no code.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in '{col}': {unmapped.tolist()}")
    df[col] = encoded

# Convert region to dummy variables
# drop_first=True omits one region level, which becomes the implicit baseline.
# The guard keeps a re-run from failing once 'region' has been replaced by
# its dummy columns.
if 'region' in df.columns:
    df = pd.get_dummies(df, columns=['region'], drop_first=True)

print("Encoded columns:")
print(df.columns.tolist())

In [8]:
# Remove duplicates: drop rows that exactly repeat an earlier row across all
# columns (the first occurrence is kept), reporting the shape before and after.
shape_before = df.shape
df = df.drop_duplicates(keep='first')

print(f"Original shape: {shape_before}")
print(f"After removing duplicates: {df.shape}")

Original shape: (1338, 7)
After removing duplicates: (1337, 7)


In [13]:
# Handle outliers in numerical columns
#
# This cell only *reports* how many values per column fall outside the
# 1.5 * IQR fences; it does not modify `df`.
import numpy as np

numerical_cols = ['age', 'bmi', 'children', 'charges']

for col in numerical_cols:
    values = df[col]

    # First and third quartiles, then the interquartile range between them.
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1

    # Fences: anything strictly below/above these counts as an outlier.
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    is_outlier = values.lt(lower_bound) | values.gt(upper_bound)
    outliers = df.loc[is_outlier]
    print(f"{col}: {len(outliers)} outliers")

age: 0 outliers
bmi: 9 outliers
children: 0 outliers
charges: 139 outliers
