In [None]:
#SIMPLE IMPUTER

# Import necessary libraries
import pandas as pd
from sklearn.impute import SimpleImputer

'''scikit-learn (commonly abbreviated as sklearn) is a powerful and widely-used open-source Python library for machine learning.
It provides simple, efficient tools for data mining and analysis, and is built on top of other essential libraries like NumPy, SciPy, and Matplotlib.'''

# Sample DataFrame with missing values (None)
data = {'Age': [25, 30, None, 35, 40, None],
        'Salary': [50000, 60000, 65000, None, 70000, 72000]}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Columns to impute. Each strategy below is fitted on the ORIGINAL `df` and
# written to its own copy. Overwriting `df` in place after the first
# imputation would leave no missing values for the second imputer to fill,
# so the "median" output would silently just repeat the mean-imputed frame.
columns_to_impute = ['Age', 'Salary']

# Method 1: Imputing missing values with the mean
# Create an imputer object with the strategy as 'mean'
mean_imputer = SimpleImputer(strategy='mean')
'''SimpleImputer: This is a part of scikit-learn, and it allows you to specify a strategy (like mean, median, most frequent) to fill missing values.
Mean Strategy: Replaces missing values with the mean of the column.
Median Strategy: Replaces missing values with the median.'''

# Fit on the original data and store the result in a separate copy,
# leaving `df` (with its missing values) available for the next strategy.
df_mean = df.copy()
df_mean[columns_to_impute] = mean_imputer.fit_transform(df[columns_to_impute])
'''It’s a combination of two actions: fitting a transformation (i.e., calculating the necessary parameters, like mean or scale factor)
and applying that transformation to the data in a single step.'''

print("\nDataFrame after imputing missing values with mean:")
print(df_mean)

# Method 2: Imputing missing values with median (just for example)
# For this sample the Age mean and median happen to coincide (32.5), but the
# Salary fill value differs: mean 63400 vs. median 65000.
median_imputer = SimpleImputer(strategy='median')

df_median = df.copy()
df_median[columns_to_impute] = median_imputer.fit_transform(df[columns_to_impute])

print("\nDataFrame after imputing missing values with median:")
print(df_median)

In [None]:
#HANDLING CATEGORICAL DATA
'''Handling categorical data is a crucial step in machine learning, as most algorithms work with numerical data.
There are several techniques to convert categorical data into numerical format, with three popular ones being:

get_dummies() (from pandas)
LabelEncoder (from sklearn)
OneHotEncoder (from sklearn)'''

# 1. pandas.get_dummies()
'''A function in pandas that converts categorical variable(s) into dummy/indicator variables. 
Each category becomes a new column with binary values (0 or 1).
Best for categorical features that do not have an ordinal relationship (e.g., colors, names).'''

import pandas as pd

# Sample DataFrame with categorical data
data = pd.DataFrame({'Color': ['Red', 'Blue', 'Green']})

# Convert the categorical 'Color' column into dummy variables:
# one indicator column per distinct value, named 'Color_<value>'.
# 'prefix' adds a label to column names to indicate it's related to 'Color'.
# dtype=int forces 0/1 output; without it pandas >= 2.0 produces boolean
# columns that print as True/False instead of the documented 0/1.
dummies = pd.get_dummies(data['Color'], prefix='Color', dtype=int)
print(dummies)

# 2. LabelEncoder()
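'''A class in scikit-learn that converts categorical labels into numeric format by assigning each unique category an integer value.
Integers are assigned in sorted (alphabetical) order of the categories, so a natural order such as 'low' < 'medium' < 'high' is NOT preserved.
It is intended for encoding target labels (y); for ordinal input features, use OrdinalEncoder with an explicit category order instead.'''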

from sklearn.preprocessing import LabelEncoder

# Sample data: a flat list of colour names
data = ['Red', 'Blue', 'Green']

# Step 1: build the encoder and let it learn the set of distinct labels
label_encoder = LabelEncoder().fit(data)

# Step 2: map every label to the integer code learned in step 1
encoded_labels = label_encoder.transform(data)
print(encoded_labels)

# 3. OneHotEncoder()
'''A class in scikit-learn that converts categorical variables into a format that can be provided to ML algorithms to improve predictions.
It creates a new binary column for each category.
Similar to get_dummies, best for nominal categorical features without ordinal relationships.'''

from sklearn.preprocessing import OneHotEncoder

# Sample data: one single-feature row per colour (a list of one-element lists)
data = [[colour] for colour in ('Red', 'Blue', 'Green')]

# Build the encoder (sparse_output=False yields a dense array rather than a
# sparse matrix) and let it learn the categories present in the data
encoder = OneHotEncoder(sparse_output=False).fit(data)

# Apply the learned categories to produce the one-hot encoded rows
one_hot_encoded = encoder.transform(data)
print(one_hot_encoded)

In [None]:
#SPLITTING DATA (TRAIN/TEST SPLIT)
'''A train/test split is a common technique in machine learning used to evaluate the performance of a model. The dataset is divided into two parts:

Training set: Used to train the model. The model learns patterns and relationships from this subset of data.
Testing set: Used to evaluate how well the trained model performs on unseen data. This helps assess its generalization capability.

The typical ratio for splitting data is:

80% training and 20% testing, 
or
70% training and 30% testing.'''

from sklearn.model_selection import train_test_split
import numpy as np

# Toy dataset: eight samples with two features each, plus alternating 0/1 targets
X = np.arange(1, 17).reshape(8, 2)  # [[1, 2], [3, 4], ..., [15, 16]]
y = np.tile([0, 1], 4)              # [0, 1, 0, 1, 0, 1, 0, 1]

# Partition features and targets together so the rows stay aligned
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
'''test_size=0.2: 20% of the data will be held out for testing.
random_state=42: Ensures the split is reproducible.'''

# Show each of the four resulting partitions
for title, subset in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(title, subset)