Import Library

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy import stats

# Import Dataset

In [45]:
datasets = {
    "MBP": pd.read_csv("/content/MBP.csv"),
    "LTP": pd.read_csv("/content/LTP.csv", encoding='latin-1'),
    "HSP": pd.read_csv("/content/HSP.csv"),
    "FDC": pd.read_csv("/content/FDC.csv"),
    "STP": pd.read_csv("/content/STP.csv")
}

# Handle Missing Data

In [46]:
def handle_missing_data(df):
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            if df[col].dtype == 'object':
                df[col].fillna('Missing', inplace=True)
            else:
                df[col].fillna(df[col].median(), inplace=True)
    return df

# Remove Duplicates Data

In [47]:
def remove_duplicates(df):
    return df.drop_duplicates()

# Outliers Engineering

In [48]:
def handle_outliers(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        z_scores = np.abs(stats.zscore(df[col]))
        upper_limit = df[col].quantile(0.95)
        lower_limit = df[col].quantile(0.05)
        df[col] = np.clip(df[col], lower_limit, upper_limit)
    return df

# Feature Scaling

In [49]:
def scale_features(df):
    scaler = MinMaxScaler()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return df

# Categorical Encoding

In [50]:
def encode_categorical(df):
    cat_cols = df.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()
    for col in cat_cols:
        df[col] = encoder.fit_transform(df[col])
    return df

# Apply Preprocessing ke Semua Dataset

In [51]:
for name, df in datasets.items():
    print(f"Processing {name} dataset...")
    df_before = df.describe(include='all')
    df = handle_missing_data(df)
    df = remove_duplicates(df)
    df = handle_outliers(df)
    df = scale_features(df)
    df = encode_categorical(df)
    datasets[name] = df

    output_dir = "/mnt/data"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    df.to_csv(f"{output_dir}/{name}_processed.csv", index=False)
    print(f"{name} dataset processed and saved.")

Processing MBP dataset...
MBP dataset processed and saved.
Processing LTP dataset...
LTP dataset processed and saved.
Processing HSP dataset...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Missing', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.clip(df[col], lower_limit, upper_limit)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

HSP dataset processed and saved.
Processing FDC dataset...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


FDC dataset processed and saved.
Processing STP dataset...
STP dataset processed and saved.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Missing', inplace=True)


# Summary

In [52]:
for name, df in datasets.items():
    print(f"{name} dataset shape: {df.shape}")

MBP dataset shape: (1767, 11)
LTP dataset shape: (1303, 13)
HSP dataset shape: (1460, 81)
FDC dataset shape: (125, 61)
STP dataset shape: (1044, 35)
