In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pickle

class Data:
    """
    Base class that exposes the notebook's shared libraries on every instance.

    Subclasses reach the tools through ``self`` (for example ``self.pd`` or
    ``self.StandardScaler``) instead of using the module-level imports directly.
    """

    def __init__(self):
        # The references are bound per instance on purpose: a plain function stored
        # as a *class* attribute would become a bound method when accessed through
        # an instance and silently receive ``self`` as its first argument.

        # Data handling and plotting modules (pandas, matplotlib.pyplot, seaborn).
        self.pd, self.plt, self.sns = pd, plt, sns

        # scikit-learn helpers: dataset splitting, feature scaling, label encoding.
        self.train_test_split = train_test_split
        self.StandardScaler, self.LabelEncoder = StandardScaler, LabelEncoder

        # Support Vector Regression estimator and the metrics used to score it.
        self.SVR = SVR
        self.mean_squared_error, self.r2_score = mean_squared_error, r2_score

        # Standard-library serializer.
        self.pickle = pickle


Data Preprocessing: 
The DataPreprocessing class prepares raw data for machine learning by handling missing values, encoding categorical features, and scaling numerical data.

In [57]:
class DataPreprocessing(Data):
    """
    DataPreprocessing Class

    Overview:
        This class extends the base Data class and provides preprocessing helpers
        for a CSV dataset: loading the file, renaming columns, inspecting the
        dataset, handling missing values, and printing basic statistics.

    Purpose:
        To clean and explore a dataset before modelling by automating repetitive
        tasks such as renaming columns, checking nulls, displaying stats, and
        handling missing values.

    Notes:
        All ``show_*`` / ``describe_*`` / ``check_*`` methods only read ``self.df``
        and print to stdout; they return None. Only ``read_data``,
        ``rename_columns`` and ``handle_missing_values`` modify ``self.df``.
        The methods expect data to be loaded (``self.df`` is None until
        ``read_data`` has run).
    """

    def __init__(self, file_path):
        """
        Set up the preprocessor and optionally load the dataset.

        Parameters:
            file_path (str): Path of the CSV file. When it is falsy (None or an
                             empty string) nothing is loaded and ``self.df``
                             stays None until ``read_data`` is called.
        """
        super().__init__()  # Attach the shared library references from Data
        self.file_path = file_path  # Store the file path
        self.df = None  # DataFrame placeholder
        self.scaler = self.StandardScaler()  # Scaler instance kept for later use
        if file_path:
            self.read_data(file_path)  # Automatically load data if a path is given

    def read_data(self, file_path):
        """
        Reads a CSV file from the given path and loads it into ``self.df``.

        Parameters:
            file_path (str): Path of the CSV file; also stored on ``self.file_path``.
        """
        self.file_path = file_path
        self.df = self.pd.read_csv(file_path)
        print("Data loaded successfully from:", file_path)

    def rename_columns(self):
        """Renames the long column names to shorter ones, modifying ``self.df`` in place."""
        self.df.rename(columns={
            "Student_ID": "Std_ID",
            "Study_Hours_Per_Day": "Stu_Hours",
            "Extracurricular_Hours_Per_Day": "Extra_Hours",
            "Sleep_Hours_Per_Day": "Sleep_Hours",
            "Physical_Activity_Hours_Per_Day": "Physical_Hours"
        }, inplace=True)
        print("Columns renamed successfully.")
        print("New columns:", self.df.columns.tolist())

    def show_columns(self):
        """Displays the list of column names in the dataset."""
        print("Dataset Columns:", self.df.columns.tolist())

    def show_shape(self):
        """Prints the shape of the dataset as (rows, columns)."""
        print("Dataset Shape:", self.df.shape)

    def show_info(self):
        """Displays column names, non-null counts, dtypes, and memory usage."""
        print("Dataset Info:")
        # DataFrame.info() writes its report to stdout itself and returns None,
        # so it must not be wrapped in print() (that would emit a stray "None").
        self.df.info()

    def describe_data(self):
        """Prints descriptive statistics for all columns, including categorical ones."""
        print("Descriptive Statistics:")
        print(self.df.describe(include="all"))

    def show_min_max(self):
        """Prints the "min" and "max" rows of ``DataFrame.describe()``."""
        desc = self.df.describe()
        print("Minimum values:\n", desc.loc["min"])
        print("Maximum values:\n", desc.loc["max"])

    def show_mean_mode_median(self, column="Grades"):
        """
        Displays mean, mode, and median for a specified column.

        Parameters:
            column (str): Column name to compute statistics for. Default is 'Grades'.

        When several values tie for the mode, the first one returned by
        ``Series.mode()`` is shown. If the column has no non-null values the
        mode is reported as None instead of raising.
        """
        series = self.df[column]
        modes = series.mode()
        # mode() is empty for an empty / all-null column; guard the positional lookup.
        first_mode = modes.iloc[0] if not modes.empty else None
        print(f"Statistics for '{column}':")
        print("Mean:", series.mean())
        print("Mode:", first_mode)
        print("Median:", series.median())

    def check_nulls(self):
        """Displays the number of missing values in each column."""
        print("Null value summary:\n", self.df.isnull().sum())

    def handle_missing_values(self, strategy="drop"):
        """
        Handles missing values in the dataset, modifying ``self.df`` in place.

        Parameters:
            strategy (str): 'drop' to remove rows with missing values,
                            'fill' to fill with column means (numeric only;
                            missing values in non-numeric columns are left as is).
                            Case and surrounding whitespace are ignored.

        Any other strategy leaves the data untouched and prints a hint, in the
        same way ``basic_data_load`` reports an invalid mode.
        """
        strategy = str(strategy).lower().strip()  # Normalize input like basic_data_load does
        if strategy == "drop":
            self.df.dropna(inplace=True)  # Drop rows with any nulls
            print("Missing values dropped.")
        elif strategy == "fill":
            # Fill only numeric columns with their mean values
            self.df.fillna(self.df.mean(numeric_only=True), inplace=True)
            print("Missing values filled with column means.")
        else:
            print("Invalid strategy! Please choose 'drop' or 'fill'.")

    def basic_data_load(self, numbers=5, mode="head"):
        """
        Displays a sample of the dataset.

        Parameters:
            numbers (int): Number of rows to display.
            mode (str): 'head', 'tail', or 'sample' to specify which part of the
                        dataset to show. Case and surrounding whitespace are ignored.

        For 'sample', a request for more rows than the dataset holds is reduced
        to the dataset size, because sampling without replacement cannot return
        more rows than exist.
        """
        mode = mode.lower().strip()  # Normalize input
        if mode == "head":
            print(f"First {numbers} rows:")
            print(self.df.head(numbers))
        elif mode == "tail":
            print(f"Last {numbers} rows:")
            print(self.df.tail(numbers))
        elif mode == "sample":
            count = min(numbers, len(self.df))  # Never ask for more rows than available
            print(f"Random {count} rows:")
            print(self.df.sample(count))
        else:
            print("Invalid mode! Please choose 'head', 'tail', or 'sample'.")


In [58]:
# Build the preprocessor; because a path is supplied, the constructor loads the CSV immediately
data_processor = DataPreprocessing("data.csv")

# --- Column names -------------------------------------------------------------
# Step 1: shorten the verbose column names (changes the internal DataFrame in place)
data_processor.rename_columns()
# Step 2: print the resulting list of column names
data_processor.show_columns()

# --- Structure and summary statistics ------------------------------------------
# Step 3: print the dataset shape as (rows, columns)
data_processor.show_shape()
# Step 4: print per-column non-null counts and dtypes
data_processor.show_info()
# Step 5: print descriptive statistics for every column
data_processor.describe_data()
# Step 6: print the minimum and maximum of each numeric column
data_processor.show_min_max()
# Step 7: print mean, mode, and median of the 'Grades' column
data_processor.show_mean_mode_median("Grades")

# --- Missing values ------------------------------------------------------------
# Step 8: print how many nulls each column contains
data_processor.check_nulls()
# Step 9: resolve missing values; "drop" removes incomplete rows, while "fill"
#         would instead replace numeric gaps with the column mean
data_processor.handle_missing_values("drop")

# --- Row previews --------------------------------------------------------------
# Step 10: first 5 rows (these are also the method's defaults)
data_processor.basic_data_load(mode="head", numbers=5)
# Step 11: last 10 rows
data_processor.basic_data_load(mode="tail", numbers=10)
# Step 12: 7 randomly chosen rows
data_processor.basic_data_load(mode="sample", numbers=7)


Data loaded successfully from: data.csv
Columns renamed successfully.
New columns: ['Std_ID', 'Stu_Hours', 'Extra_Hours', 'Sleep_Hours', 'Social_Hours_Per_Day', 'Physical_Hours', 'Stress_Level', 'Gender', 'Grades']
Dataset Columns: ['Std_ID', 'Stu_Hours', 'Extra_Hours', 'Sleep_Hours', 'Social_Hours_Per_Day', 'Physical_Hours', 'Stress_Level', 'Gender', 'Grades']
Dataset Shape: (149, 9)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Std_ID                149 non-null    int64  
 1   Stu_Hours             149 non-null    float64
 2   Extra_Hours           149 non-null    float64
 3   Sleep_Hours           149 non-null    float64
 4   Social_Hours_Per_Day  149 non-null    float64
 5   Physical_Hours        149 non-null    float64
 6   Stress_Level          149 non-null    object 
 7   Gender                149 non-