Lab 2: Data Preprocessing and Cleaning
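This script demonstrates common data preprocessing and cleaning techniques: missing-value imputation, IQR-based outlier removal, categorical encoding, feature scaling, and train/test splitting.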

In [None]:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


In [None]:
def create_sample_data():
    """Build the small demo frame used by the rest of the lab.

    The ten-row frame has NaN gaps in 'Age' and 'Salary' and one
    implausible 'Age' value (150) for the outlier step to find.

    Returns:
        A DataFrame with columns 'Age', 'Salary', 'Category' and 'Score'.
    """
    # Seeds NumPy's global RNG; the values below are fixed literals, so the
    # seed only affects random draws made after this call.
    np.random.seed(42)

    missing = np.nan
    ages = [25, 30, missing, 35, 28, 32, missing, 40, 22, 150]
    salaries = [
        50000, 60000, 55000, missing, 58000,
        62000, 54000, 70000, missing, 65000,
    ]
    categories = list("ABACBACBAC")
    scores = [85, 90, 78, 92, 88, 75, 95, 82, 89, 91]

    return pd.DataFrame({
        'Age': ages,
        'Salary': salaries,
        'Category': categories,
        'Score': scores,
    })


In [None]:
def handle_missing_values(df):
    """Demonstrate three ways to fill missing values in 'Age' and 'Salary'.

    Prints the input frame, its per-column missing counts, and the result of
    each strategy (mean fill, median fill, SimpleImputer with the mean
    strategy). Each strategy works on its own copy, so ``df`` is not modified.

    Args:
        df: DataFrame with numeric 'Age' and 'Salary' columns that may
            contain NaN.

    Returns:
        The copy of ``df`` whose 'Age' and 'Salary' columns were filled by
        ``SimpleImputer(strategy='mean')``.
    """
    print("=" * 50)
    print("Handling Missing Values")
    print("=" * 50)

    print("\nOriginal Data with Missing Values:")
    print(df)
    print(f"\nMissing values per column:\n{df.isnull().sum()}")

    numeric_cols = ['Age', 'Salary']

    # Method 1: Fill with mean
    # Assign the filled column back rather than calling
    # fillna(inplace=True) on a column selection: that chained in-place form
    # is deprecated in pandas 2.x and does not update the parent frame once
    # Copy-on-Write is enabled.
    df_mean = df.copy()
    for col in numeric_cols:
        df_mean[col] = df_mean[col].fillna(df_mean[col].mean())
    print("\nAfter filling with mean:")
    print(df_mean)

    # Method 2: Fill with median
    df_median = df.copy()
    for col in numeric_cols:
        df_median[col] = df_median[col].fillna(df_median[col].median())
    print("\nAfter filling with median:")
    print(df_median)

    # Method 3: Using SimpleImputer
    df_imputer = df.copy()
    imputer = SimpleImputer(strategy='mean')
    df_imputer[numeric_cols] = imputer.fit_transform(df_imputer[numeric_cols])
    print("\nAfter using SimpleImputer:")
    print(df_imputer)

    return df_imputer


In [None]:
def handle_outliers(df, column='Age', iqr_multiplier=1.5):
    """Detect and remove outliers in one numeric column using the IQR rule.

    A row is kept when its value lies inside the inclusive range
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]``. Rows whose
    value is NaN fail both comparisons, so they are dropped as well even
    though they do not appear in the printed outlier listing.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the numeric column to screen. Defaults to 'Age'.
        iqr_multiplier: Fence width in IQR units. Defaults to 1.5.

    Returns:
        A new DataFrame holding only the rows inside the fences, with the
        original index labels preserved. ``df`` itself is not modified.
    """
    print("\n" + "=" * 50)
    print("Handling Outliers")
    print("=" * 50)

    values = df[column]

    # Detect outliers using IQR
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - iqr_multiplier * IQR
    upper_bound = Q3 + iqr_multiplier * IQR

    print(f"\n{column} statistics:")
    print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
    print(f"Lower bound: {lower_bound}, Upper bound: {upper_bound}")

    # Identify outliers
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    print(f"\nOutliers detected:\n{outliers}")

    # Remove outliers; the explicit copy hands callers an independent frame
    # they can assign into without touching (or warning about) ``df``.
    df_clean = df[(values >= lower_bound) & (values <= upper_bound)].copy()
    print("\nData after removing outliers:")
    print(df_clean)

    return df_clean


In [None]:
def encode_categorical_variables(df):
    """Show label encoding and one-hot encoding of the 'Category' column.

    Prints the category counts, the label-encoded pairing, and a one-hot
    encoded view of the frame. The one-hot frame is only displayed; the
    label-encoded frame is what gets returned.

    Args:
        df: DataFrame with a 'Category' column.

    Returns:
        A copy of ``df`` with an added 'Category_Encoded' column produced by
        ``LabelEncoder``.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("Encoding Categorical Variables")
    print(divider)

    category_counts = df['Category'].value_counts()
    print(f"\nOriginal Category column:\n{category_counts}")

    # Label encoding works on a copy so the caller's frame is left alone.
    encoded_frame = df.copy()
    codes = LabelEncoder().fit_transform(encoded_frame['Category'])
    encoded_frame['Category_Encoded'] = codes
    print("\nAfter Label Encoding:")
    print(encoded_frame[['Category', 'Category_Encoded']])

    # One-hot encoding is built from the untouched input frame.
    one_hot_frame = pd.get_dummies(df, columns=['Category'], prefix='Category')
    print("\nAfter One-Hot Encoding:")
    print(one_hot_frame)

    return encoded_frame


In [None]:
def normalize_scale_data(df):
    """Show standardization and min-max scaling of the numeric columns.

    Prints summary statistics for 'Age', 'Salary' and 'Score' before
    scaling, after ``StandardScaler`` and after ``MinMaxScaler``. Each
    scaler is fitted on the input frame's own values.

    Args:
        df: DataFrame containing 'Age', 'Salary' and 'Score' columns.

    Returns:
        A copy of ``df`` whose three numeric columns were replaced by their
        ``StandardScaler`` output. The min-max scaled copy is only displayed.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("Normalization and Scaling")
    print(divider)

    numeric_cols = ['Age', 'Salary', 'Score']
    features = df[numeric_cols]

    print("\nOriginal data statistics:")
    print(features.describe())

    # Standardization (Z-score normalization)
    standardized = df.copy()
    standardized[numeric_cols] = StandardScaler().fit_transform(features)
    print("\nAfter Standardization (mean=0, std=1):")
    print(standardized[numeric_cols].describe())

    # Min-Max Scaling
    normalized = df.copy()
    normalized[numeric_cols] = MinMaxScaler().fit_transform(features)
    print("\nAfter Min-Max Scaling (range 0-1):")
    print(normalized[numeric_cols].describe())

    return standardized


In [None]:
def split_data(df):
    """Split the frame into training and testing sets and report the sizes.

    Features are the 'Age', 'Salary' and 'Score' columns. The target is the
    'Category_Encoded' column when present; otherwise 'Category' is
    label-encoded on the fly.

    Args:
        df: DataFrame with the three feature columns and either
            'Category_Encoded' or 'Category'.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` from
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("Data Splitting")
    print(divider)

    features = df[['Age', 'Salary', 'Score']]
    if 'Category_Encoded' in df.columns:
        target = df['Category_Encoded']
    else:
        target = LabelEncoder().fit_transform(df['Category'])

    parts = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = parts

    print(f"\nTotal samples: {len(features)}")
    print(f"Training samples: {len(X_train)}")
    print(f"Testing samples: {len(X_test)}")
    print("Train/Test split ratio: 80/20")

    return X_train, X_test, y_train, y_test


In [None]:
def main():
    """Run every preprocessing stage of the lab in order, printing as it goes.

    The stages are: build the sample data, fill missing values, remove
    outliers, encode the category column, show scaling, and split the data.
    Nothing is returned; all results are reported through the stages' own
    printed output.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Lab 2: Data Preprocessing and Cleaning")
    print(banner)

    raw = create_sample_data()
    imputed = handle_missing_values(raw)
    trimmed = handle_outliers(imputed)
    encoded = encode_categorical_variables(trimmed)

    # The scaling stage only displays its results here; the split below is
    # performed on the encoded (unscaled) frame.
    normalize_scale_data(encoded)
    split_data(encoded)

    print("\n" + banner)
    print("Lab 2 Complete!")
    print(banner)


In [None]:
# Run the full lab walkthrough only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
