In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Build a small demo table mixing numeric, categorical and datetime columns,
# each seeded with a few gaps so the imputation steps have something to fill.
data = dict(
    A=[10, 20, 30, np.nan, 50, 60, 70, np.nan, 90, 100],  # numeric, 2 missing
    B=[5, 15, np.nan, 35, 45, 55, np.nan, 75, 85, 95],  # numeric, 2 missing
    C=['Red', 'Blue', 'Green', np.nan, 'Red', 'Green', 'Blue', 'Blue', np.nan, 'Red'],  # categorical, 2 missing
    D=[100, 200, 300, 400, 500, np.nan, 700, 800, 900, 1000],  # numeric, 1 missing
    E=pd.date_range(start='1/1/2024', periods=10, freq='D'),  # one timestamp per day
)
df = pd.DataFrame(data)

# Blank out two timestamps so the datetime column has gaps too (stored as NaT)
df.loc[[3, 7], 'E'] = pd.NaT

print("Original Dataset:", df, sep="\n")

# Handling Missing Values
# Each column is filled with a statistic suited to its type. The statistic is
# computed from the non-missing entries of that same column.
df['A'] = df['A'].fillna(df['A'].mean())  # Mean for continuous data
df['B'] = df['B'].fillna(df['B'].median())  # Median (less sensitive to extreme values than the mean)
# mode() returns every tied value in sorted order; [0] takes the first of them
df['C'] = df['C'].fillna(df['C'].mode()[0])  # Most frequent category
df['D'] = df['D'].fillna(df['D'].median())  # Median (less sensitive to extreme values than the mean)
# Series.ffill() replaces the deprecated fillna(method='ffill'); each gap
# takes the previous row's timestamp.
df['E'] = df['E'].ffill()  # Forward fill for time series

# KNN Imputation (reapplying after basic imputation)
# NOTE(review): 'A', 'B' and 'D' were already filled by the mean/median step
# above, so no NaN should be left in these columns for the imputer to
# estimate; this pass is expected to return the values unchanged.
numeric_cols = ['A', 'B', 'D']  # KNN is applied to the numerical columns only
knn_imputer = KNNImputer(n_neighbors=3)  # each gap is estimated from 3 neighbours
df[numeric_cols] = knn_imputer.fit_transform(df[numeric_cols])

print("\nDataset after Handling Missing Values:", df, sep="\n")

# Feature Scaling (numeric columns only; categorical 'C' and datetime 'E' are skipped)
scalers = {
    "Min-Max Scaling": MinMaxScaler(),
    "Standardization": StandardScaler(),
}

# Fit every scaler on the same numeric columns and keep each result as a
# DataFrame keyed by the scaler's display name.
scaled_dfs = {
    label: pd.DataFrame(
        transformer.fit_transform(df[numeric_cols]),
        columns=numeric_cols,
    )
    for label, transformer in scalers.items()
}

# Show the first rows of each scaled table under its name
for label, frame in scaled_dfs.items():
    print(f"\n{label}:")
    print(frame.head())


Original Dataset:
       A     B      C       D          E
0   10.0   5.0    Red   100.0 2024-01-01
1   20.0  15.0   Blue   200.0 2024-01-02
2   30.0   NaN  Green   300.0 2024-01-03
3    NaN  35.0    NaN   400.0        NaT
4   50.0  45.0    Red   500.0 2024-01-05
5   60.0  55.0  Green     NaN 2024-01-06
6   70.0   NaN   Blue   700.0 2024-01-07
7    NaN  75.0   Blue   800.0        NaT
8   90.0  85.0    NaN   900.0 2024-01-09
9  100.0  95.0    Red  1000.0 2024-01-10

Dataset after Handling Missing Values:
        A     B      C       D          E
0   10.00   5.0    Red   100.0 2024-01-01
1   20.00  15.0   Blue   200.0 2024-01-02
2   30.00  50.0  Green   300.0 2024-01-03
3   53.75  35.0   Blue   400.0 2024-01-03
4   50.00  45.0    Red   500.0 2024-01-05
5   60.00  55.0  Green   500.0 2024-01-06
6   70.00  50.0   Blue   700.0 2024-01-07
7   53.75  75.0   Blue   800.0 2024-01-07
8   90.00  85.0   Blue   900.0 2024-01-09
9  100.00  95.0    Red  1000.0 2024-01-10

Min-Max Scaling:
          A

  df['E'] = df['E'].fillna(method='ffill')  # Forward fill for time series
