# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# pandas provides the DataFrame used to hold the raw table
import pandas as pd

# Read the CSV file 'Data.csv' into a DataFrame
dataset = pd.read_csv('Data.csv')

# Split the table column-wise and convert both parts to NumPy arrays:
#   X -> every row, every column except the last (.iloc[:, :-1]) = features
#   y -> every row, only the last column        (.iloc[:, -1])  = target
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [None]:
# Show the feature matrix X as extracted from the dataset
print(X)

In [None]:
# Show the target vector y as extracted from the dataset
print(y)

## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: each missing entry (np.nan) is replaced by the mean of
# the column it belongs to
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Work on columns 1 and 2 of X (the slice 1:3). The imputer learns the
# per-column means and fills in the gaps in a single step; the result is
# written back into the same slice of X
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Show X after the missing values in columns 1 and 2 have been imputed
print(X)

## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column index 0 of X and keep every other column unchanged.
# - 'encoder' is just a name for this transformation step
# - OneHotEncoder() is the transformation applied
# - [0] means it is applied to column index 0
# - remainder='passthrough' leaves the rest of the columns unchanged
# - sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse
#   matrix, and with the default threshold (0.3) ColumnTransformer returns a
#   SciPy sparse matrix whenever the stacked output is sparse enough (e.g. a
#   column with many categories). np.array() on a sparse matrix yields a 0-d
#   object array rather than a 2-D table, which breaks the later steps.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the column transformer on X, apply it, and store the result as a
# NumPy array
X = np.array(ct.fit_transform(X))

In [None]:
# Show X after column 0 has been one-hot encoded
print(X)

### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the categorical labels in y into numeric form: first learn the
# set of distinct labels, then replace y with its encoded values
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Show y after label encoding
print(y)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set (test_size=0.2);
# random_state=1 fixes the shuffling so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Show the training-set features produced by the split
print(X_train)

In [None]:
# Show the test-set features produced by the split
print(X_test)

In [None]:
# Show the training-set targets produced by the split
print(y_train)

In [None]:
# Show the test-set targets produced by the split
print(y_test)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features so each column has mean = 0 and standard
# deviation = 1. The scaler learns its statistics from the training set
# only; those same statistics are then applied to both the training set
# and the test set.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Note: many ML models perform better when features are on the same scale.
# Without scaling, models might wrongly give more importance to features
# with larger numbers.

In [None]:
# Show the training-set features after standard scaling
print(X_train)

In [None]:
# Show the test-set features after standard scaling
print(X_test)