### Practice Exercise (Data Preprocessing)

In [None]:
# Import data processing and ML preprocessing tools
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Read the car insurance CSV, then report its shape, its first row, and the missing-value count per column
df = pd.read_csv('car_insurance.csv')

# Build the three report sections up front, then emit them in order
overview = (
    f'Shape of the Dataset:\n {df.shape}\n',
    f'Dataset:\n {df.head(1)}\n',
    f'Missing Values:\n {df.isnull().sum()}\n',
)
for section in overview:
    print(section)

# Features: every column between the leading ID and the trailing target,
# with MaritalStatus and Education removed, converted to a NumPy array
feature_frame = df.iloc[:, 1:-1].drop(columns=['MaritalStatus', 'Education'])
X = feature_frame.to_numpy()

# Target: ClaimAmount, selected with double brackets so it stays a one-column 2-D array
y = df[['ClaimAmount']].to_numpy()

# Column positions inside X, grouped by type so each group gets a suitable imputation strategy:
# int_col -> numerical columns, str_col -> categorical (string) columns
int_col = [0, 3, 4, 7, 8, 9, 10]
str_col = [1, 2, 5, 6, 11]

# Fill missing numerical feature values with the mean of their column
# NOTE(review): the imputers in this cell are fitted on the full dataset, before the
# train/test split later in the notebook, so test rows influence the fill values.
impute_mean = SimpleImputer(missing_values=np.nan,strategy='mean')
X[:,int_col] = impute_mean.fit_transform(X[:,int_col])

# Fill missing target values with the mean of the observed targets, using a dedicated imputer.
# Calling fit_transform on impute_mean a second time would refit it on y and throw away the
# per-column feature means learned above, leaving impute_mean unusable for feature data.
# NOTE(review): imputing the target manufactures labels; consider dropping rows whose
# ClaimAmount is missing instead.
impute_target = SimpleImputer(missing_values=np.nan,strategy='mean')
y = impute_target.fit_transform(y)

# Fill missing categorical values with the most frequent category of their column
impute_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:,str_col] = impute_frequent.fit_transform(X[:,str_col])

Shape of the Dataset:
 (25, 16)

Dataset:
   PolicyID   Age Gender  ... DrivingExperience Region ClaimAmount
0     P001  35.0   Male  ...              10.0  Urban      2500.0

[1 rows x 16 columns]

Missing Values:
 PolicyID             0
Age                  4
Gender               3
MaritalStatus        0
Education            0
Occupation           0
Income               2
VehicleAge           3
VehicleType          0
FuelType             5
EngineSize           1
Mileage              1
PreviousClaims       1
DrivingExperience    2
Region               0
ClaimAmount          3
dtype: int64



In [None]:
# One-hot encode the categorical columns; remainder='passthrough' keeps the numerical
# columns and places them after the encoded ones.
# sparse_threshold=0 forces a dense result: with the default threshold (0.3) ColumnTransformer
# returns a SciPy sparse matrix whenever the stacked output's density falls below it, and
# np.array() would then wrap that matrix in a 0-d object array instead of a 2-D matrix.
ct = ColumnTransformer(
    transformers=[('Encode',OneHotEncoder(),str_col)],
    remainder='passthrough',
    sparse_threshold=0,
)

# After encoding every column is numeric, so store the matrix as float64 instead of
# leaving it as an object array (X mixed strings and numbers before this step).
X = np.array(ct.fit_transform(X), dtype=float)

In [None]:
# Hold out 30% of the rows for evaluation and train on the remaining 70%;
# the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=69)
X_train, X_test, y_train, y_test = split_parts
X_train

array([[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 45.0, 82000.0, 2.0, 1.8,
        32000.0, 0.0, 20.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 37.80952380952381, 48000.0,
        2.0, 1.2, 22000.0, 0.0, 5.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 32.0, 88000.0,
        3.6818181818181817, 2.5, 28000.0, 1.0, 8.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 29.0, 78000.0, 1.0, 1.4,
        18000.0, 0.0, 6.0],
   

In [None]:
# Standardize the last 7 columns (mean 0, std 1 on the training rows) so they contribute on a comparable scale.
# The scaler is fitted on the training rows only and then reused, unchanged, on the test rows.
scaled_cols = slice(-7, None)
sc = StandardScaler()

X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

X_test

array([[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 2.317635813395896,
        -0.07624405688074296, 4.334688078995431, -0.061230036682127315,
        4.515184756233843, 4.530915699976328, 2.598178058231066],
       [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, -0.6260579018924216,
        -0.5165836043511299, -0.4208805380309438, -0.8240347684777873,
        -0.36420775152223434, -0.604122093330177, -0.5155653588134753],
       [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.246147643378191,
        -0.7521140599748253, 0.5302331853743312, -0.4426324025799571,
        0.4062226444392

In [None]:
# Show the preprocessed, scaled feature matrices and the training targets in one report
report_sections = [
    f'Scaled X_train set:\n {X_train} \n',
    f'Scaled X_test set:\n {X_test}\n',
    f'y_train set:\n {y_train}',
]
print('\n'.join(report_sections))

Scaled X_train set:
 [[1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0
  0.9003018023311504 0.05541893073498715 -0.4208805380309438
  -0.061230036682127315 -0.23580268552865336 -0.604122093330177
  0.868320604317432]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
  0.11635515152244515 -1.0885861394372471 -0.4208805380309438
  -1.2054371343756172 -0.5568153505126059 -0.604122093330177
  -0.8615368495962021]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
  -0.517032208733595 0.25730217841244024 0.3789196384689465
  1.2736782439602774 -0.36420775152223434 0.42288546533112387
  -0.5155653588134753]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.