## Data Preprocessing

### Import libraries

In [None]:
# Import data processing and ML preprocessing libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder , StandardScaler
from sklearn.model_selection import train_test_split

### Loading the Dataset


In [None]:
# Load the raw dataset and split it into a feature matrix and a target vector.
# `to_numpy()` is used rather than `.values` because it always returns a NumPy
# ndarray, which the later cells rely on for in-place slice assignment.
df = pd.read_csv('Data.csv')
X = df.iloc[:, :-1].to_numpy()  # every column except the last -> features
y = df.iloc[:, -1].to_numpy()   # last column only -> target label

### Identify and handle missing values


In [None]:
# Report the number of missing entries per column of the raw frame.
missing_data = df.isna().sum()
print(f'Missing Data:\n {missing_data}')

# Replace NaNs in feature columns 1 and 2 with the mean of each column.
# NOTE(review): the imputer is fitted on the full feature matrix, i.e. before
# the train/test split performed further down.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_block = imputer.fit_transform(X[:, 1:3])
X[:, 1:3] = imputed_block

Missing Data:
 Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


#### Encoding the independent variable

In [None]:
# One-hot encode the categorical feature in column 0; all remaining columns are
# passed through unchanged and appended after the encoded columns.
# `sparse_threshold=0` forces a dense NumPy array as output. With the default
# threshold (0.3) a feature with many categories would make ColumnTransformer
# return a SciPy sparse matrix, and the slice assignment used later for
# feature scaling would then fail.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = ct.fit_transform(X)

#### Encoding the dependent variable

In [None]:
# Map the target labels in y to integer codes.
# The fitted encoder stays available as `lb` (e.g. for inverse_transform).
lb = LabelEncoder().fit(y)
y = lb.transform(y)

### Split the data

In [None]:
# Hold out 20% of the rows for evaluation and keep the remaining 80% for training.
# A fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Feature Scaling

In [None]:
# Standardize the columns from index 3 onwards (zero mean, unit variance).
# Columns 0-2 are left untouched; this assumes the one-hot encoding produced
# exactly three leading indicator columns - adjust the index if that changes.
st = StandardScaler()

# Learn mean and standard deviation from the training rows only ...
st.fit(X_train[:, 3:])

# ... then apply the same statistics to both splits so the test set is scaled
# with training-set parameters.
X_train[:, 3:] = st.transform(X_train[:, 3:])
X_test[:, 3:] = st.transform(X_test[:, 3:])

In [None]:
# Show the preprocessed train/test features and labels.
# One print call; each section is followed by a blank line.
print(
    f'X_train : {X_train}\n',
    f'X_test: {X_test}\n',
    f'y_train{y_train}\n',
    f'y_test{y_test}\n',
    sep='\n',
)

X_train : [[1.0 0.0 0.0 -0.7529426005471074 -0.6260377781240922]
 [1.0 0.0 0.0 1.008453807952985 1.013042950055349]
 [1.0 0.0 0.0 1.7912966561752484 1.8325833141450698]
 [0.0 1.0 0.0 -1.7314961608249366 -1.0943465576039326]
 [1.0 0.0 0.0 -0.3615211764359758 0.4276569757055486]
 [0.0 1.0 0.0 0.22561095973072173 0.05040823668012205]
 [0.0 0.0 1.0 -0.16581046438040992 -0.274806193514212]
 [0.0 0.0 1.0 -0.013591021670525248 -1.328500947343853]]

X_test: [[0.0 1.0 0.0 50.00000000000001 83000.0]
 [0.0 0.0 1.0 27.000000000000004 48000.0]]

y_train[1 0 1 0 1 1 0 0]

y_test[0 1]

