## 6.1 Feature Engineering
In this notebook, we will apply the feature engineering techniques that we studied in class: log transformation, normalization, and standardization.

In [None]:
# Import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Download the housing-prices dataset from Kaggle and locate its CSV file

import kagglehub
import os

# Download latest version; `path` is the local folder holding the downloaded files
path = kagglehub.dataset_download("yasserh/housing-prices-dataset")
print("Path to dataset files:", path)

# `filename` is the list of entries found in the downloaded folder
filename = os.listdir(path)
print(filename)

# Full path of the CSV file that the next cell reads
filepath = os.path.join(path, "Housing.csv")
print(filepath)

Path to dataset files: /home/cgraiff/.cache/kagglehub/datasets/yasserh/housing-prices-dataset/versions/1
Path to dataset files: /home/cgraiff/.cache/kagglehub/datasets/yasserh/housing-prices-dataset/versions/1
['Housing.csv']
/home/cgraiff/.cache/kagglehub/datasets/yasserh/housing-prices-dataset/versions/1/Housing.csv


In [6]:
# Read the downloaded CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=filepath)
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [None]:
# Log transformation, standardization and normalization only apply to numerical variables,
# so keep just the numeric columns of `df`.
# The explicit .copy() makes `numerical_vars` an independent frame: a new column is
# written into it later in the notebook, and on older pandas versions assigning into
# the result of select_dtypes can raise a SettingWithCopyWarning.

numerical_vars = df.select_dtypes(include=[np.number]).copy()

print(numerical_vars)

        price  area  bedrooms  bathrooms  stories  parking
0    13300000  7420         4          2        3        2
1    12250000  8960         4          4        4        3
2    12250000  9960         3          2        2        2
3    12215000  7500         4          2        2        3
4    11410000  7420         4          1        2        2
..        ...   ...       ...        ...      ...      ...
540   1820000  3000         2          1        1        2
541   1767150  2400         3          1        1        0
542   1750000  3620         2          1        1        0
543   1750000  2910         3          1        1        0
544   1750000  3850         3          1        2        0

[545 rows x 6 columns]


#### 6.1.1 Log transform
- Helpful with skewed data, and helpful to reduce effect of outliers
- Values should be **positive**
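- You can add 1 to the value before taking the log (i.e. compute `log(x + 1)`), so that zeros do not produce `log(0)` and non-negative inputs always give a non-negative result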

In [9]:
# Natural log of (price + 1); the +1 offset keeps the argument positive even for a zero price
numerical_vars["price_log"] = np.log(numerical_vars["price"] + 1)
numerical_vars.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,price_log
0,13300000,7420,4,2,3,2,16.403275
1,12250000,8960,4,4,4,3,16.321037
2,12250000,9960,3,2,2,2,16.321037
3,12215000,7500,4,2,2,3,16.318175
4,11410000,7420,4,1,2,2,16.250001


##### How to handle negative values

In [None]:
# The housing dataset has no negative values, so build a small sample
# that mixes negative numbers, small numbers and large numbers

data = pd.Series(
    [87, 45, 654, 69430, -12, -34, 6, 5, 7, 8954],
    name="samples",
).to_frame()
data

Unnamed: 0,samples
0,87
1,45
2,654
3,69430
4,-12
5,-34
6,6
7,5
8,7
9,8954


In [None]:
# Shift the samples so that the smallest one becomes 1 (and log(1) = 0),
# then take the natural log of the shifted, strictly positive values

data["log"] = np.log(data["samples"].sub(data["samples"].min()).add(1))
data

Unnamed: 0,samples,log
0,87,4.804021
1,45,4.382027
2,654,6.535241
3,69430,11.148578
4,-12,3.135494
5,-34,0.0
6,6,3.713572
7,5,3.688879
8,7,3.73767
9,8954,9.103757


### 6.2 Normalization

Normalization scales all values between 0 and 1. 
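> The effect of outliers can increase: min-max normalization is anchored to the extreme values, so a single outlier squeezes all the other values into a narrow part of the [0, 1] range, and the **standard deviation also decreases**. If your dataset contains outliers, **make sure you handle them before** normalizing.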

In [13]:
# Min-max normalization: (price - min) / (max - min), so the cheapest house maps to 0
# and the most expensive one to 1.
# NOTE(review): the column name "price_normalizes" looks like a typo for
# "price_normalized"; it is kept unchanged here because it is the name shown in the saved outputs.
df["price_normalizes"] = df["price"].sub(df["price"].min()).div(
    df["price"].max() - df["price"].min()
)
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price_normalizes
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,1.000000
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,0.909091
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,0.909091
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,0.906061
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,0.836364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished,0.006061
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished,0.001485
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished,0.000000
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished,0.000000


### 6.4 Standardization
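Standardization rescales the values to zero mean and unit standard deviation, so it **takes into account the standard deviation**. Because the result is not squeezed into a fixed range set by the minimum and maximum, the effect of outliers is smaller than with normalization.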

In [16]:
# Standardization (z-score): subtract the mean price and divide by the standard deviation.
# Series.std() uses its default settings here (pandas documents the default as the
# sample standard deviation, ddof=1).
df["price_standardized"] = df["price"].sub(df["price"].mean()).div(df["price"].std())
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price_normalizes,price_standardized
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,1.000000,4.562174
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,0.909091,4.000809
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,0.909091,4.000809
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,0.906061,3.982096
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,0.836364,3.551716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished,0.006061,-1.575421
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished,0.001485,-1.603676
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished,0.000000,-1.612845
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished,0.000000,-1.612845
