In [1]:
# Feature Scaling & Encoding

# Objective: Learn to scale numerical features and encode categorical features for better model performance.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment.
#     2. Feature Scaling: Apply scaling methods (StandardScaler or MinMaxScaler) to specified numerical columns.
#     3. Feature Encoding: Apply encoding methods (One-Hot Encoding or Label Encoding) to specified categorical columns.
#     4. Verify Changes: Check the data to ensure proper scaling and encoding. 


# Task:
#     Dataset: customer_data.csv (create or obtain it yourself; it must include the columns Age, Annual_Income, and Region)
#     Columns to scale: Age , Annual_Income
#     Column to encode: Region
#     Steps:
#         1. Load customer_data.csv .
#         2. Use MinMaxScaler on Age and Annual_Income .
#         3. Perform One-Hot Encoding on Region .
#         4. Verify by assessing the transformed dataset.



import pandas as pd

# Build a small synthetic customer table so the exercise is self-contained.
# Ages run 25..70 in steps of 5, incomes 40k..130k in steps of 10k, and the
# four regions repeat in a fixed order across the ten rows.
region_cycle = ['East', 'West', 'North', 'South']
row_count = 10

data = {
    'Age': list(range(25, 25 + 5 * row_count, 5)),
    'Annual_Income': list(range(40000, 40000 + 10000 * row_count, 10000)),
    'Region': [region_cycle[i % len(region_cycle)] for i in range(row_count)],
}

# Materialise the table and persist it (without the index column) so the
# loading step below can read it back from disk.
df = pd.DataFrame(data)
df.to_csv('customer_data.csv', index=False)

print("CSV file 'customer_data.csv' created successfully.")

# Task:
#     Dataset: customer_data.csv (create or obtain it yourself; it must include the columns Age, Annual_Income, and Region)
#     Columns to scale: Age , Annual_Income
#     Column to encode: Region
#     Steps:
#         1. Load customer_data.csv .
#         2. Use MinMaxScaler on Age and Annual_Income .
#         3. Perform One-Hot Encoding on Region .
#         4. Verify by assessing the transformed dataset.


import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns handled by each preprocessing step.
numeric_cols = ['Age', 'Annual_Income']
categorical_cols = ['Region']

# Step 1: read the CSV from disk and show the first rows of the raw data.
df = pd.read_csv('customer_data.csv')
print("Original DataFrame:", df.head(), sep="\n")

# Step 2: fit a MinMaxScaler on the numeric columns, then overwrite those
# columns in place with the scaled values.
scaler = MinMaxScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# Step 3: one-hot encode the categorical column with pandas; every category
# keeps its own indicator column (no level is dropped).
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

# Step 4: inspect the transformed table, then the summary statistics of the
# scaled columns so the value range can be checked.
print("\nTransformed DataFrame:", df_encoded.head(), sep="\n")
print("\nSummary Statistics after Scaling:", df_encoded[numeric_cols].describe(), sep="\n")
    
    

CSV file 'customer_data.csv' created successfully.
Original DataFrame:
   Age  Annual_Income Region
0   25          40000   East
1   30          50000   West
2   35          60000  North
3   40          70000  South
4   45          80000   East

Transformed DataFrame:
        Age  Annual_Income  Region_East  Region_North  Region_South  \
0  0.000000       0.000000         True         False         False   
1  0.111111       0.111111        False         False         False   
2  0.222222       0.222222        False          True         False   
3  0.333333       0.333333        False         False          True   
4  0.444444       0.444444         True         False         False   

   Region_West  
0        False  
1         True  
2        False  
3        False  
4        False  

Summary Statistics after Scaling:
             Age  Annual_Income
count  10.000000      10.000000
mean    0.500000       0.500000
std     0.336406       0.336406
min     0.000000       0.000000
25%    