In [20]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR

class Data:
    """Base class that exposes the notebook's shared libraries as instance attributes.

    Subclasses reach the libraries through ``self`` (for example ``self.pd``
    or ``self.StandardScaler``) instead of using the module-level names.
    """

    def __init__(self):
        # Attribute name -> imported object. The objects are bound on the
        # instance (not on the class) so that plain functions such as
        # train_test_split stay ordinary callables rather than bound methods.
        shared_tools = {
            "pd": pd,                                  # pandas: data manipulation and analysis
            "plt": plt,                                # matplotlib.pyplot: plotting
            "sns": sns,                                # seaborn: statistical visualization
            "train_test_split": train_test_split,      # split data into train and test sets
            "StandardScaler": StandardScaler,          # feature scaling class
            "LabelEncoder": LabelEncoder,              # categorical label encoding class
            "SVR": SVR,                                # Support Vector Regression model class
            "mean_squared_error": mean_squared_error,  # MSE metric
            "r2_score": r2_score,                      # R² metric
            "pickle": pickle,                          # object serialization module
        }
        for attr_name, tool in shared_tools.items():
            setattr(self, attr_name, tool)


Data Preprocessing: 
The DataPreprocessing class prepares raw data for machine learning by handling missing values, encoding categorical features, and scaling numerical data.

In [21]:
class DataPreprocessing(Data):
    """Load a CSV file into a DataFrame and offer inspection and cleaning helpers.

    The file is read as soon as an instance is created. Most methods print
    their result instead of returning it. ``rename_columns`` and
    ``handle_missing_values`` modify ``self.df``.
    """

    def __init__(self, file_path):
        """Store the path, create a scaler and read the CSV immediately.

        Args:
            file_path: Location of the CSV file handed to ``pandas.read_csv``.
        """
        super().__init__()
        self.file_path = file_path
        self.df = None
        # Created up front; none of the methods defined in this class use it.
        self.scaler = self.StandardScaler()
        self.read_data()

    # 1. Read data
    def read_data(self):
        """Read ``self.file_path`` into ``self.df`` and print a confirmation."""
        self.df = self.pd.read_csv(self.file_path)
        print(f"‚úÖ Data loaded successfully from: {self.file_path}")

    # 2. Rename columns
    def rename_columns(self):
        """Normalize column labels: strip, title-case, and remove underscores.

        Example: ``" study_hours_per_day"`` becomes ``"StudyHoursPerDay"``.
        """
        # str() keeps this working when a label is not a string (e.g. an int).
        self.df.columns = [
            str(col).strip().title().replace("_", "") for col in self.df.columns
        ]
        print("‚úÖ Columns renamed successfully.")

    # 3. Show all columns
    def show_columns(self):
        """Print the list of column labels."""
        print("üìã Columns in dataset:\n", self.df.columns.tolist())

    # 4. Show shape
    def show_shape(self):
        """Print the ``(rows, columns)`` shape of the dataset."""
        print("üìä Dataset shape:", self.df.shape)

    # 5. Show dataset info
    def show_info(self):
        """Print the DataFrame summary (dtypes and non-null counts)."""
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would emit an extra "None" line.
        self.df.info()

    # 6. Describe dataset
    def describe_data(self):
        """Print the descriptive statistics produced by ``DataFrame.describe``."""
        print("üìà Dataset Description:\n", self.df.describe())

    # 7. Show min & max
    def show_min_max(self):
        """Print the per-column minimum and maximum values."""
        print("Minimum values:\n", self.df.min())
        print("\nMaximum values:\n", self.df.max())

    # 8. Mean, Mode, Median
    def show_mean_mode_median(self, column):
        """Print the mean, mode and median of one column.

        Args:
            column: Label of the column to summarize.

        The mode is reported as ``None`` when the column has no non-null
        values. When several values tie, the first one returned by
        ``Series.mode`` is shown.
        """
        series = self.df[column]
        mean_val = series.mean()
        modes = series.mode()
        # mode() is empty for an all-null column; indexing it would raise.
        mode_val = modes.iloc[0] if not modes.empty else None
        median_val = series.median()
        print(f"Column: {column}")
        print(f"Mean: {mean_val}")
        print(f"Mode: {mode_val}")
        print(f"Median: {median_val}")

    # 9. Check null values
    def check_nulls(self):
        """Print the number of null values in each column."""
        print("üßæ Null Values Summary:\n", self.df.isnull().sum())

    # 10. Handle missing values
    def handle_missing_values(self, strategy="fill"):
        """Fill or drop missing values in ``self.df``.

        Args:
            strategy: ``"fill"`` replaces nulls with the column mean for
                numeric columns and with the column mode for every other
                column; ``"drop"`` removes each row that contains a null.
                Case and surrounding whitespace are ignored. Any other value
                prints an error message and leaves the data untouched.
        """
        strategy = str(strategy).lower().strip()
        if strategy == "fill":
            dtype_checks = self.pd.api.types
            for col in self.df.columns:
                column = self.df[col]
                if not column.isnull().any():
                    continue
                if dtype_checks.is_numeric_dtype(column) and not dtype_checks.is_bool_dtype(column):
                    fill_value = column.mean()
                else:
                    modes = column.mode()
                    if modes.empty:
                        # Every value is null, so there is nothing to fill with.
                        continue
                    fill_value = modes.iloc[0]
                # Assign the result back: a chained `df[col].fillna(..., inplace=True)`
                # acts on a temporary object and does not update the frame
                # when pandas Copy-on-Write is active.
                self.df[col] = column.fillna(fill_value)
            print("‚úÖ Missing values filled with mean/mode.")
        elif strategy == "drop":
            # In-place on the frame itself, so the DataFrame object stays the same.
            self.df.dropna(inplace=True)
            print("üóëÔ∏è Rows with missing values dropped.")
        else:
            print("Invalid strategy! Choose from: fill, drop.")

    # 11. Display data preview
    def basic_data_load(self, numbers=5, mode="head", random_state=None):
        """Print a preview of the dataset.

        Args:
            numbers: Number of rows to show.
            mode: ``"head"``, ``"tail"`` or ``"sample"``; case and surrounding
                whitespace are ignored. Any other value prints an error message.
            random_state: Optional seed forwarded to ``DataFrame.sample`` so
                that a ``"sample"`` preview can be reproduced.
        """
        mode = mode.lower().strip()
        if mode == "head":
            print(self.df.head(numbers))
        elif mode == "tail":
            print(self.df.tail(numbers))
        elif mode == "sample":
            # sample() raises when asked for more rows than exist, so cap the
            # request at the number of available rows.
            sample_size = min(numbers, len(self.df))
            print(self.df.sample(sample_size, random_state=random_state))
        else:
            print("‚ùå Invalid mode! Choose from: head, tail, sample.")


In [22]:
# The CSV location can be overridden with the STUDENT_DATA_CSV environment
# variable so the notebook can run on another machine; the original local
# path is kept as the fallback.
file_path = os.environ.get(
    "STUDENT_DATA_CSV",
    r"C:\Users\Administrator\Desktop\AI(LAB)\Task 10\data.csv",
)

# Step 1: Create object (the CSV is read inside the constructor)
data_processor = DataPreprocessing(file_path)

# Step 2: Rename columns
data_processor.rename_columns()

# Step 3: Show all columns
data_processor.show_columns()

# Step 4: Show dataset shape
data_processor.show_shape()

# Step 5: Show info about dataset
data_processor.show_info()

# Step 6: Describe data (statistics)
data_processor.describe_data()

# Step 7: Show min & max values
data_processor.show_min_max()

# Step 8: Show mean, mode, and median of 'Grades' (change if your column name differs)
data_processor.show_mean_mode_median(column="Grades")

# Step 9: Check null values
data_processor.check_nulls()

# Step 10: Handle missing values (choose 'drop' or 'fill')
data_processor.handle_missing_values(strategy="fill")

# Step 11: Show first 5 rows
data_processor.basic_data_load(numbers=5, mode="head")

# Step 12: Show last 10 rows
data_processor.basic_data_load(numbers=10, mode="tail")

# Step 13: Show random 7 rows
data_processor.basic_data_load(numbers=7, mode="sample")


‚úÖ Data loaded successfully from: C:\Users\Administrator\Desktop\AI(LAB)\Task 10\data.csv
‚úÖ Columns renamed successfully.
üìã Columns in dataset:
 ['StudentId', 'StudyHoursPerDay', 'ExtracurricularHoursPerDay', 'SleepHoursPerDay', 'SocialHoursPerDay', 'PhysicalActivityHoursPerDay', 'StressLevel', 'Gender', 'Grades']
üìä Dataset shape: (149, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   StudentId                    149 non-null    int64  
 1   StudyHoursPerDay             149 non-null    float64
 2   ExtracurricularHoursPerDay   149 non-null    float64
 3   SleepHoursPerDay             149 non-null    float64
 4   SocialHoursPerDay            149 non-null    float64
 5   PhysicalActivityHoursPerDay  149 non-null    float64
 6   StressLevel                  149 non-null    object 
 7   Gender            