### Data Preprocessing Exercises

In [None]:
# Import all the libraries we need for data preprocessing
import pandas as pd  # pandas: helps us work with data in tables (like Excel)
import numpy as np  # numpy: helps us do mathematical operations on data
from sklearn.compose import ColumnTransformer  # ColumnTransformer: applies different transformations to different columns
from sklearn.impute import SimpleImputer  # SimpleImputer: fills in missing values with a strategy (like average)
from sklearn.model_selection import train_test_split  # train_test_split: splits our data into training and testing sets
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler  # OneHotEncoder: converts text categories to numbers, LabelEncoder: converts labels to numbers, StandardScaler: scales numbers to similar ranges

In [None]:
# Read the CSV file into a pandas DataFrame (a table of rows and columns)
df = pd.read_csv('loan.csv')  # 'loan.csv' is looked up relative to the notebook's working directory
print('Loading the dataset:\n')  # Banner printed above the preview
df.head(5)  # Preview the first 5 rows; as the cell's last expression it is rendered as a table

Loading the dataset:



Unnamed: 0,CustomerID,Gender,Married,Education,Income,LoanAmount,CreditScore,PropertyArea,LoanApproved
0,101,Male,Yes,Graduate,55000.0,150000.0,720.0,Urban,Yes
1,102,Female,Yes,Graduate,48000.0,130000.0,680.0,Semiurban,Yes
2,103,Male,No,Not Graduate,32000.0,70000.0,,Rural,No
3,104,Male,Yes,Graduate,65000.0,200000.0,750.0,Urban,Yes
4,105,Female,,Graduate,42000.0,95000.0,690.0,Semiurban,Yes


In [None]:
# Split the table into the model inputs (X) and the value we want to predict (y)
# X = every column between the first one (the ID) and the last one (the target)
# y = the last column only
last_col = df.shape[1] - 1  # Position of the final column (the target)
X = df.iloc[:, 1:last_col].values  # All rows; columns 1 up to, but not including, the last one
y = df.iloc[:, last_col].values  # All rows; only the last column

In [None]:
# Group the feature columns by type so each group can be processed in its own way
# Categorical = text labels that name a category; numerical = real quantities
categorical_columns = [
    'Gender',
    'Married',
    'Education',
    'PropertyArea',
]
numerical_columns = [
    'Income',
    'LoanAmount',
    'CreditScore',
]

In [None]:
# Handle missing data - fill in the empty values in our dataset
missing_data = df.isnull().sum()  # Count how many missing values are in each column
print(f'Missing data in each column:\n {missing_data}')  # Show the count of missing values

# Use SimpleImputer to fill missing numerical values with their average
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  # Create an imputer that uses the mean (average)
X[:,3:6] = imputer.fit_transform(X[:,3:6])  # Fill missing values in columns 3-5 with their average

# Use SimpleImputer to fill missing categorical values with the most frequent value
impute = SimpleImputer(missing_values=np.nan, strategy='most_frequent')  # Create an imputer that uses the most common value
X[:,0:1] = impute.fit_transform(X[:,0:1])  # Fill missing values in column 0 with the most frequent value

Missing data in each column:
 CustomerID      0
Gender          1
Married         2
Education       0
Income          2
LoanAmount      2
CreditScore     2
PropertyArea    0
LoanApproved    0
dtype: int64


In [None]:
# Convert categorical text features into numerical format so machine learning models can understand them
# ColumnTransformer applies different transformations to different columns
ct = ColumnTransformer(transformers=[('arson',OneHotEncoder(),[0,1,2,6])],remainder='passthrough')  # OneHotEncoder converts categories (like "Male"/"Female") into 0s and 1s
X = np.array(ct.fit_transform(X))  # Apply the transformation and convert to numpy array

# Convert the target labels (y) from text to numbers using LabelEncoder
le = LabelEncoder()  # Create a label encoder
y = le.fit_transform(y)  # Transform labels like "Yes"/"No" into 0/1

In [None]:
# Split the data into training and testing sets
# Training set: used to teach the machine learning model (80% of data)
# Testing set: used to test how well the model learned (20% of data)
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=7)  # Split data: 80% train, 20% test; random_state=7 ensures reproducible results

In [None]:
# Scale/normalize the numerical features so they are on a similar scale
# This helps machine learning models work better (e.g., making values between -1 and 1)
sc = StandardScaler()  # Create a scaler that will standardize the values
X_train[:,10:] = sc.fit_transform(X_train[:,10:])  # Scale columns 10 onwards in training data and learn the scaling parameters

# Apply the same scaling to the test data using the parameters learned from training data
X_test[:,10:] = sc.transform(X_test[:,10:])  # Scale the test data using the same parameters (important: don't fit on test data)

In [None]:
# Show the finished arrays so we can check the preprocessing by eye
# These are now ready to be used by a machine learning model!
for label, values in (('X_train', X_train), ('X_test', X_test), ('y_train', y_train)):
    print(f'{label}:\n {values}')  # Name on one line, array contents starting on the next
print(f'y_test: {y_test}')  # The test labels are printed on the same line as their name

X_train:
 [[0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 -1.2543801660254887
  -1.0908012177934066 -0.288225028525472]
 [0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 2.0548780384093366
  2.1464152995289614 1.765850283935534]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.1817130170311336
  0.03518713605785183 0.19508680970064743]
 [1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 -0.2553588212904471
  -0.24630995240496278 -0.5902949274167958]
 [0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 -0.1592990766043854
  -0.9500526735619993 -2.1610584016516823]
 [1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 -0.3177976553363872
  -0.3870584966363701 -0.7866403616961566]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 -0.005603485106686711
  -0.10556140817355547 -0.1976040588580742]
 [1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 -1.6914520043470695
  -1.3019240341405176 -0.288225028525472]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.8061013574905347
  0.7389298572148884 0.7841231125387299]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.4304896