In [1]:
# Feature Scaling & Encoding

# Objective: Learn to scale numerical features and encode categorical features for better model performance.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment.
#     2. Feature Scaling: Apply scaling methods (StandardScaler or MinMaxScaler) to specified numerical columns.
#     3. Feature Encoding: Apply encoding methods (One-Hot Encoding or Label Encoding) to specified categorical columns.
#     4. Verify Changes: Check the data to ensure proper scaling and encoding. 


# Task:
#   Dataset: car_features.csv (obtain it yourself; it must include the columns Mileage, Horsepower, and Fuel_Type)
    # Columns to scale: Mileage , Horsepower
    # Column to encode: Fuel_Type
    # Steps:
    #     1. Load car_features.csv .
    #     2. Scale Mileage and Horsepower using StandardScaler.
    #     3. Encode Fuel_Type using Label Encoding.
    #     4. Confirm scaling and encoding by checking these columns.
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Step 1: Load the Dataset ---
# IMPORTANT: once the real file is available, replace the synthetic data below with
# e.g.: df = pd.read_csv('car_features.csv')

# Build a synthetic stand-in for car_features.csv (100 cars) for demonstration.
# A locally seeded Generator makes the sample identical on every run, so the
# statistics printed below are reproducible, and it leaves NumPy's global
# random state untouched.
rng = np.random.default_rng(42)
data = {
    'CarID': range(1, 101),
    'Mileage': rng.integers(10, 50, size=100),      # km/l; upper bound is exclusive
    'Horsepower': rng.integers(80, 300, size=100),  # upper bound is exclusive
    'Fuel_Type': rng.choice(
        ['Petrol', 'Diesel', 'Electric', 'Hybrid'],
        size=100,
        p=[0.4, 0.3, 0.2, 0.1],  # sampling probability of each fuel type
    ),
}
df = pd.DataFrame(data)

print("Original Data (first 5 rows):")
print(df.head())
print("\nOriginal Data Description:")
print(df.describe())
print("\nOriginal Data Info:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df.info()
print(f"\nOriginal DataFrame shape: {df.shape}")

# Columns to transform
numerical_features = ['Mileage', 'Horsepower']
categorical_features = ['Fuel_Type']  # LabelEncoder works on one column at a time

# --- Step 2: Scale Mileage and Horsepower using StandardScaler ---
# --- Step 3: Encode Fuel_Type using Label Encoding ---

# The ColumnTransformer only scales the numerical features; every other column
# (CarID, Fuel_Type) is passed through untouched. LabelEncoder expects a 1-D
# input, so it is applied separately to Fuel_Type further below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
    ],
    remainder='passthrough',
)

# Single-step pipeline wrapping the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform; the result is a NumPy array, not a DataFrame
X_transformed_partial = pipeline.fit_transform(df)

# Output column names, prefixed per transformer: 'num__<col>' for the scaled
# features and 'remainder__<col>' for the passed-through columns
transformed_partial_feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out(df.columns)

# Rebuild a DataFrame from the array. Because the string Fuel_Type column is
# passed through next to the numbers, the array has dtype=object and every
# column would come out as 'object'. infer_objects() restores float/int dtypes
# where possible, so the scaled columns are real numbers again and show up in
# describe().
df_transformed_partial = pd.DataFrame(
    X_transformed_partial,
    columns=transformed_partial_feature_names,
).infer_objects()

# --- Apply Label Encoding to the 'Fuel_Type' column ---
# LabelEncoder maps each category to an integer, which imposes an arbitrary
# order. For a nominal feature like Fuel_Type, One-Hot Encoding is usually
# preferred, but the task specifically requested Label Encoding.
label_encoder = LabelEncoder()

# Locate the passed-through Fuel_Type column (it carries the 'remainder__' prefix)
fuel_type_col_name = [col for col in transformed_partial_feature_names if col.endswith('Fuel_Type')][0]

# Replace the category strings with their integer codes
df_transformed_partial[fuel_type_col_name] = label_encoder.fit_transform(df_transformed_partial[fuel_type_col_name])

# Rename the encoded column for clarity
df_transformed_partial = df_transformed_partial.rename(columns={fuel_type_col_name: 'Fuel_Type_LabelEncoded'})

# --- Step 4: Confirm scaling and encoding by checking these columns ---

print("\nTransformed Data (first 5 rows):")
print(df_transformed_partial.head())

print("\nTransformed Data Description:")
# describe() only summarises numeric columns. Cast the scaled columns to float
# on a copy so they are always included, even if they are stored as 'object'.
scaled_columns = [f"num__{name}" for name in numerical_features]
check_frame = df_transformed_partial.astype({col: float for col in scaled_columns})
print(check_frame.describe())

# StandardScaler divides by the population standard deviation (ddof=0), while
# describe() reports the sample standard deviation (ddof=1). The 'std' row
# above is therefore slightly greater than 1; the figure below should be 1.
print("\nPopulation std (ddof=0) of scaled columns:")
print(check_frame[scaled_columns].std(ddof=0))

print("\nTransformed Data Info:")
# info() prints its report itself and returns None; do not wrap it in print()
df_transformed_partial.info()
print(f"\nTransformed DataFrame shape: {df_transformed_partial.shape}")

# Show which integer code stands for which fuel type
fuel_type_codes = {str(label): code for code, label in enumerate(label_encoder.classes_)}
print(f"\nFuel_Type label mapping: {fuel_type_codes}")

# --- Interpretation ---
# 'num__Mileage' and 'num__Horsepower' should have a mean close to 0 and a
# population standard deviation close to 1 (characteristics of StandardScaler).
# 'Fuel_Type_LabelEncoded' holds integer codes; the code of each fuel type is
# its position in label_encoder.classes_ (printed above).
# 'remainder__CarID' should be present with its original values.



    
    
    

Original Data (first 5 rows):
   CarID  Mileage  Horsepower Fuel_Type
0      1       26         232    Petrol
1      2       13         102    Petrol
2      3       16          82    Petrol
3      4       11         256    Petrol
4      5       13         132    Petrol

Original Data Description:
            CarID    Mileage  Horsepower
count  100.000000  100.00000  100.000000
mean    50.500000   29.12000  190.420000
std     29.011492   11.57259   62.016808
min      1.000000   10.00000   82.000000
25%     25.750000   20.75000  135.750000
50%     50.500000   29.50000  191.500000
75%     75.250000   39.25000  238.500000
max    100.000000   49.00000  294.000000

Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CarID       100 non-null    int64 
 1   Mileage     100 non-null    int64 
 2   Horsepower  100 non-null    int64 
 3   Fuel