![ML Pipeline](https://media.geeksforgeeks.org/wp-content/uploads/20241022160725494723/supervised-machine-learning.webp)


# Step 1: Import All Essential Libraries

1. Import Libraries

1.1 Import pandas, numpy, matplotlib, seaborn, and sklearn here; joblib is imported later, in Step 5, where the model is saved.

1.2 These libraries help with data handling, visualization, training, and saving models.

1.3 Always do this step first to make all tools ready.

In [None]:
# Data Handling
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Machine Learning Models (you can expand later)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# Model Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, r2_score, mean_squared_error


# Step 2: Load and Inspect the Dataset

2. Load and Inspect Dataset


2.1 Load data using pd.read_csv() or similar functions.

2.2 Check data shape using df.shape and view top rows using df.head().

2.3 Use df.info() and df.describe() to see data types and basic statistics.

2.4 Check for missing values with df.isnull().sum().

2.5 Check for duplicate rows using df.duplicated().sum().

In [None]:
# Load the dataset and print a quick overview: shape, columns, dtypes,
# summary statistics, missing values, duplicates, categorical cardinality
# and a correlation heatmap of the numeric columns.

# Path to the CSV file to analyse -- replace with your own dataset.
data_path = "your_dataset.csv"

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(" File not found. Check your path or filename.")
    # Every statement below needs `df`; re-raise so the cell stops here with
    # the real cause instead of failing on the next line with a NameError.
    raise
else:
    print(" Dataset loaded successfully!\n")

# ---- Basic Overview ----
print(" Shape:", df.shape)
print("\n Columns:", df.columns.tolist())

print("\n Data Types:")
print(df.dtypes)

print("\n Summary Statistics (Numerical Features):")
# NOTE: `display` is not imported in this file -- assumes a Jupyter/IPython session.
display(df.describe())

# ---- Missing Values ----
print("\n Missing Values per Column:")
print(df.isnull().sum())

# ---- Duplicate Rows ----
print("\n Duplicate Rows:", df.duplicated().sum())

# ---- Unique Values (for categorical inspection) ----
print("\n Unique Values (first 5 categorical columns):")
for col in df.select_dtypes(include='object').columns[:5]:
    print(f"{col}: {df[col].nunique()} unique values")

# ---- Correlation (for numeric features only) ----
corr_matrix = df.corr(numeric_only=True)
if corr_matrix.empty:
    # With no numeric columns there is nothing to plot; skip the heatmap
    # rather than handing seaborn an empty matrix.
    print("\n No numeric columns available for a correlation heatmap.")
else:
    plt.figure(figsize=(10,6))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
    plt.title(" Correlation Heatmap")
    plt.show()


# Step 3: Data Preprocessing (Cleaning + Encoding + Scaling)


3. Data Preprocessing


3.1 Handle Missing Values – remove with df.dropna() or fill with fillna().

3.2 Encode Categorical Data – use LabelEncoder() or pd.get_dummies().

3.3 Split Data – use train_test_split() to make training and testing sets.

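3.4 Scale Features – use StandardScaler() to standardize numeric features (zero mean, unit variance) so they are on a comparable scale; fit it on the training set only and reuse it to transform the test set.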

In [None]:
# Clean `df`, encode its categorical columns, separate features from the
# target, split into train/test sets and standardize the features.
# Produces: X, y, X_train, X_test, y_train, y_test, scaler, label_encoders.

# ---- Handle Missing Values ----
print("\n Handling Missing Values...")

# Option 1: Drop rows with missing values
df = df.dropna()

# (Alternative Option)
# Fill missing values instead of dropping. Assign the result back rather than
# calling fillna(..., inplace=True) on a column selection: that form is a
# chained assignment, which newer pandas versions warn about and which may
# leave `df` unchanged.
# df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

print(" Missing values handled successfully!")

# ---- Encode Categorical Columns ----
print("\n Encoding Categorical Columns...")

from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, so every mapping can be looked up later
# (e.g. label_encoders['target'].inverse_transform(predictions)).
label_encoders = {}

# `label_enc` is kept for compatibility: as before, after the loop it refers
# to the encoder fitted on the last object column (or an unfitted encoder
# when there are no object columns).
label_enc = LabelEncoder()

for col in df.select_dtypes(include='object').columns:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

print(" Categorical columns encoded!")

# ---- Feature & Target Split ----
print("\n Splitting features and target variable...")

# Replace 'target' with your actual target column name
target_col = 'target'
X = df.drop(columns=[target_col])
y = df[target_col]

print(" Data split complete.")
print("Feature shape:", X.shape)
print("Target shape:", y.shape)

# ---- Train-Test Split ----
print("\n Splitting into Train & Test Sets...")

from sklearn.model_selection import train_test_split
# 70% train / 30% test; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(" Train-Test split complete!")
print(f"Training samples: {X_train.shape[0]} | Testing samples: {X_test.shape[0]}")

# ---- Feature Scaling ----
print("\n Scaling Numerical Features...")

# The scaler is fitted on the training set only and then reused on the test
# set, so no test-set statistics leak into training. Keep `scaler`: any new
# data must go through scaler.transform() before being passed to a model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(" Feature scaling done successfully!")

# Outlier Removal Example (using IQR)
# NOTE: X, y and the train/test split were already derived from `df` earlier
# in this cell, so filtering `df` here does not change them. Re-run the split
# afterwards (or move this step before it) if the models should be trained on
# the filtered rows.
# Quantiles are only defined for numeric data, so restrict the check to
# numeric columns; a row is dropped when ANY numeric value lies more than
# 1.5 * IQR outside the [Q1, Q3] interval of its column.
numeric_df = df.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = ((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)
# Take an explicit copy so the column assignments below operate on an
# independent frame.
df = df[~outlier_mask].copy()


# ---- Encoding Categorical Columns ---
# Example categorical columns (replace with yours)
# df['Color'], df['Country']

#  Label Encoding → use for tree models (DecisionTree, RandomForest, XGBoost)
le = LabelEncoder()
for col in ['Color', 'Country']:
    # The example column names are placeholders; skip the ones this dataset
    # does not have instead of failing with a KeyError.
    if col not in df.columns:
        print(f" Skipping '{col}': column not found in the dataset.")
        continue
    df[col] = le.fit_transform(df[col])
print(" Label Encoding done!")

#  One-Hot Encoding → use for linear models (Logistic, SVM, KNN)
# df = pd.get_dummies(df, columns=['Color', 'Country'], drop_first=True)
# print(" One-Hot Encoding done!")

# Step 4: Model Training & Evaluation (Full Detailed Version)


4. Model Training and Testing


4.1 Choose task type: classification or regression.

4.2 Select algorithms:
- Classification → Logistic Regression, Decision Tree, Random Forest, SVM, KNN.
- Regression → Linear Regression, Decision Tree Regressor, Random Forest Regressor, SVR, KNN Regressor.

4.3 Train model using fit(X_train, y_train).

4.4 Predict results using predict(X_test).

4.5 Check performance using accuracy or R² score.

In [None]:
# STEP 4: Model Training & Evaluation
# Fits five baseline models for the chosen task on (X_train, y_train), scores
# each one on (X_test, y_test) and prints the score. All scores are collected
# in `scores` and the fitted estimators in `models` so they can be compared.
print("\n Starting Model Training & Evaluation...")

#  Choose your task type
task_type = "classification"   # change to "regression" if needed


if task_type == "classification":
    # ====== CLASSIFICATION MODELS ======
    print("\n Training Classification Models...\n")
    metric_label, score_fn = "Accuracy", accuracy_score
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "SVM": SVC(),
        "KNN": KNeighborsClassifier(),
    }
else:
    # ====== REGRESSION MODELS ======
    print("\n Training Regression Models...\n")
    metric_label, score_fn = "R²", r2_score
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(),
        "Random Forest": RandomForestRegressor(),
        "SVR": SVR(),
        "KNN": KNeighborsRegressor(),
    }

# name -> test-set score (accuracy or R², higher is better for both), e.g.
# max(scores, key=scores.get) gives the name of the best-scoring model.
scores = {}

# Dicts preserve insertion order, so models are trained and reported in the
# order listed above. After the loop `model` and `pred` refer to the last
# entry (KNN), which is what the following steps pick up by default.
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    scores[name] = score_fn(y_test, pred)
    print(f"{name} {metric_label}:", scores[name])


# Step 5: Saving Model & Predicting New Data (All Models Listed)

5. Save and Predict


5.1 Save trained model using joblib.dump(model, 'my_model.pkl').

5.2 Load it later using joblib.load('my_model.pkl').

5.3 Make predictions on new data using predict(), after applying the same encoding and scaling that were applied to the training data.

In [None]:
# STEP 5: Save Model & Predict New Data
# Persists the selected model with joblib, reloads it, and predicts one
# example row after applying the same scaling that was used for training.
print("\n Saving Model & Making Predictions...")

import joblib  # used for saving and loading ML models


# ✅ Choose the model you want to save & use
# Uncomment only ONE of the following
# (if none is uncommented, the `model` left over from Step 4 is used)

# model = LogisticRegression(max_iter=1000)       # For classification
# model = DecisionTreeClassifier()                # For classification
# model = RandomForestClassifier()                # For classification
# model = SVC()                                   # For classification
# model = KNeighborsClassifier()                  # For classification

# model = LinearRegression()                      # For regression
# model = DecisionTreeRegressor()                 # For regression
# model = RandomForestRegressor()                 # For regression
# model = SVR()                                   # For regression
# model = KNeighborsRegressor()                   # For regression


#  Train the selected model again (you can skip if already trained)
model.fit(X_train, y_train)

# ---- Save the trained model ----
joblib.dump(model, "my_model.pkl")
print(" Model saved as 'my_model.pkl'")

# ---- Load the saved model ----
# NOTE: joblib files are pickle-based and can execute code when loaded;
# only load files you created yourself or otherwise trust.
loaded_model = joblib.load("my_model.pkl")
print("Model loaded successfully!")

# ---- Predict using new data ----
# Example: replace values with your real input data
# (must match the same number of features as X_train, in the same column
# order, with categorical values already encoded as in Step 3)
sample_data = [[5.1, 3.5, 1.4, 0.2]]  # example for 4 features

# The model was trained on features standardized by `scaler` in Step 3, so a
# raw row has to go through the same fitted scaler before predict().
# Labelling the row with the training column names also fails early, with a
# clear error, if the number of values does not match the feature count.
# To predict in a fresh session, persist `scaler` alongside the model.
sample_frame = pd.DataFrame(sample_data, columns=X.columns)
sample_scaled = scaler.transform(sample_frame)
prediction = loaded_model.predict(sample_scaled)

print("\n Prediction for sample data:", prediction)


# Step 6: Model Evaluation (All Metrics + Confusion Matrix)

6. Model Evaluation


6.1 For classification:
- Use accuracy, precision, recall, F1-score, and confusion matrix.

6.2 For regression:
- Use R² score, Mean Squared Error (MSE), and Root Mean Squared Error (RMSE).

6.3 Compare all models.

6.4 The model with higher accuracy or R² and lower error is the best.

In [None]:
# STEP 6: Model Evaluation (Accuracy, Precision, Recall, F1, R2, Confusion Matrix)
# Scores the current `model` on the held-out test set and prints the metrics
# that belong to the selected task type.
print("\n Evaluating Model Performance...")

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    r2_score,
    mean_squared_error,
)

#  Choose your task
task_type = "classification"   # or "regression"

# `model` must already be fitted (Step 4 or Step 5). To evaluate a different
# estimator, assign it to `model` and call model.fit(X_train, y_train) first.
y_pred = model.predict(X_test)

if task_type != "classification":
    # ----- REGRESSION METRICS -----
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5  # RMSE is the square root of the MSE

    for line in (
        "\n Regression Metrics:",
        f"R² Score: {r2:.3f}",
        f"MSE     : {mse:.3f}",
        f"RMSE    : {rmse:.3f}",
    ):
        print(line)
else:
    # ----- CLASSIFICATION METRICS -----
    # Per-class scores are averaged with weights proportional to class support.
    averaging = {"average": 'weighted'}

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **averaging)
    rec = recall_score(y_test, y_pred, **averaging)
    f1 = f1_score(y_test, y_pred, **averaging)

    for line in (
        "\n Classification Metrics:",
        f"Accuracy : {acc:.3f}",
        f"Precision: {prec:.3f}",
        f"Recall   : {rec:.3f}",
        f"F1-Score : {f1:.3f}",
    ):
        print(line)

    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nDetailed Report:\n", classification_report(y_test, y_pred))
