In [1]:
# Feature Scaling & Encoding

# Objective: Learn to scale numerical features and encode categorical features for better model performance.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment.
#     2. Feature Scaling: Apply scaling methods (StandardScaler or MinMaxScaler) to specified numerical columns.
#     3. Feature Encoding: Apply encoding methods (One-Hot Encoding or Label Encoding) to specified categorical columns.
#     4. Verify Changes: Check the data to ensure proper scaling and encoding. 


# Task:
#     Dataset: customer_data.csv (obtain it yourself; it must include the columns Age, Annual_Income and Region)
#     Columns to scale: Age , Annual_Income
#     Column to encode: Region
#     Steps:
#         1. Load customer_data.csv .
#         2. Use MinMaxScaler on Age and Annual_Income .
#         3. Perform One-Hot Encoding on Region .
#         4. Verify by assessing the transformed dataset.

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Step 1: Load the Dataset ---
# IMPORTANT: Replace this section with your actual code to load customer_data.csv
# Example: df = pd.read_csv('customer_data.csv')

# Build a synthetic stand-in for customer_data.csv with two numerical columns
# and one categorical column (no local file is read here).
# A dedicated, seeded generator keeps the sample -- and every statistic printed
# from it -- reproducible from run to run without touching NumPy's global RNG state.
rng = np.random.default_rng(42)
n_customers = 100
data = {
    'CustomerID': range(1, n_customers + 1),
    'Age': rng.integers(18, 65, size=n_customers),  # upper bound is exclusive: ages 18-64
    'Annual_Income': rng.normal(loc=60000, scale=20000, size=n_customers),  # income with some variance
    'Region': rng.choice(
        ['North', 'South', 'East', 'West'],
        size=n_customers,
        p=[0.3, 0.2, 0.25, 0.25],
    ),  # categorical feature
}
df = pd.DataFrame(data)

# The normal draw can produce negative or implausibly small incomes;
# floor the column at 10,000 in one vectorized pass.
df['Annual_Income'] = df['Annual_Income'].clip(lower=10000)

# Report the raw data before any transformation so it can be compared
# against the scaled/encoded result later.
print("Original Data (first 5 rows):")
print(df.head())
print("\nOriginal Data Description:")
print(df.describe())
print("\nOriginal Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()
print(f"\nOriginal DataFrame shape: {df.shape}")

# Column groups used by the preprocessing step:
#   categorical_features -> label columns that are one-hot encoded
#   numerical_features   -> continuous columns that are rescaled
categorical_features = ['Region']
numerical_features = ['Age', 'Annual_Income']

# --- Steps 2 & 3: MinMaxScaler on Age / Annual_Income, One-Hot Encoding on Region ---

# Each entry is a (name, transformer, columns) triple; the name becomes the
# prefix of the generated output column names.
min_max_step = ('num', MinMaxScaler(), numerical_features)
one_hot_step = (
    'cat',
    # Dense output (sparse_output=False) so the result can be placed straight
    # into a DataFrame; handle_unknown='ignore' avoids an error if transform
    # later meets a category that was not present during fit.
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features,
)

# ColumnTransformer routes each column group to its own transformer.
# remainder='passthrough' keeps every column not listed above (e.g. CustomerID)
# instead of dropping it.
preprocessor = ColumnTransformer(
    [min_max_step, one_hot_step],
    remainder='passthrough',
)

# A single-step pipeline: only preprocessing for now, but a model could be
# appended as a later step without changing the calling code.
pipeline = Pipeline([('preprocessor', preprocessor)])

# Fit the scaler/encoder on df and transform it in one call.
X_transformed = pipeline.fit_transform(df)

# --- Step 4: Verify by assessing the transformed dataset ---

# Ask the fitted ColumnTransformer (the pipeline's only step) for the names of
# the columns it produced; this only works after fit_transform has run.
transformed_feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out(df.columns)

# Rebuild a labelled DataFrame from the transformed array. Reusing df.index keeps
# the rows aligned with the original frame even when its index is not 0..n-1.
df_transformed = pd.DataFrame(X_transformed, columns=transformed_feature_names, index=df.index)

# The transformer returns one homogeneous array, so pass-through columns
# (e.g. the integer CustomerID) come back with the array's common dtype.
# Restore each pass-through column's original dtype from df.
_passthrough_prefix = 'remainder__'
for _name in transformed_feature_names:
    if _name.startswith(_passthrough_prefix):
        _source = _name[len(_passthrough_prefix):]
        df_transformed[_name] = df_transformed[_name].astype(df[_source].dtype)

# Report the transformed data so the effect of scaling and encoding can be
# checked against the original report.
print("\nTransformed Data (first 5 rows):")
print(df_transformed.head())
print("\nTransformed Data Description:")
# describe() shows min/max per column, which makes the scaling and encoding visible
print(df_transformed.describe())
print("\nTransformed Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df_transformed.info()
print(f"\nTransformed DataFrame shape: {df_transformed.shape}")

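# --- Interpretation ---
# Observe the 'num__Age' and 'num__Annual_Income' columns in the transformed data
# (the 'num__' prefix comes from the transformer name).
# Their values should now lie between 0 and 1: MinMaxScaler maps each column's minimum to 0 and its maximum to 1.
# Observe the new columns created for 'Region' (e.g., 'cat__Region_East', 'cat__Region_North', etc.).
# These are the one-hot encoded columns: each row has a 1 in the column
# corresponding to its original region and 0 in the others.
# The 'remainder__CustomerID' column should be present with its values unchanged because remainder='passthrough'.
# Note: the transformer returns a single numeric array, so this column may show up as float
# rather than int unless its dtype is restored afterwards.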




    
    
    

Original Data (first 5 rows):
   CustomerID  Age  Annual_Income Region
0           1   53   45089.846087   West
1           2   18   54265.388660   West
2           3   59   41984.666315  North
3           4   64   55339.601596  South
4           5   31   35971.049873  North

Original Data Description:
       CustomerID         Age  Annual_Income
count  100.000000  100.000000     100.000000
mean    50.500000   42.300000   60176.515177
std     29.011492   12.968883   20598.013550
min      1.000000   18.000000   10000.000000
25%     25.750000   32.000000   44737.849244
50%     50.500000   43.000000   61852.482997
75%     75.250000   53.000000   74392.238432
max    100.000000   64.000000  123689.374162

Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CustomerID     100 non-null    int64  
 1   Age            100 non-null   