In [8]:
import sys
import os
from typing import List

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

sys.path.append("..")
from utils import load_data, StrokeParams, FetalHealthParams

# Seed used as the default ``random_state`` of ``preprocess_data``, which
# forwards it to ``train_test_split`` so the train/test partition is repeatable.
RANDOM_STATE = 666


def preprocess_data(
    data: str,
    target_column: str,
    train_output: str,
    test_output: str,
    test_size: float = 0.2,
    random_state: int = RANDOM_STATE,
) -> None:
    """Split a CSV dataset into standardized train and test CSV files.

    The CSV at ``data`` is loaded, separated into features and target, split
    with ``train_test_split``, and every feature column is standardized with a
    ``StandardScaler`` fitted on the training features only. The target column
    is appended unchanged as the last column of each output file.

    Args:
        data: Path of the input CSV file.
        target_column: Name of the column to treat as the label. It is not
            scaled.
        train_output: Path the training CSV is written to.
        test_output: Path the test CSV is written to.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        None. The results are written to ``train_output`` and ``test_output``;
        missing parent directories are created.

    Note:
        Assumes every non-target column is numeric, since all of them are fed
        to ``StandardScaler`` -- confirm against the input data.
    """
    # Load data
    df = pd.read_csv(data)

    # train test split
    X, y = df.drop(columns=[target_column]), df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # standardize: the scaler is fitted on the training features only and then
    # reused for the test features, so no test statistics enter the fit.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    # concat X and y: the rebuilt feature frames carry a fresh 0..n-1 index, so
    # the targets are re-indexed the same way to line up row by row.
    train = pd.concat([X_train, y_train.reset_index(drop=True)], axis=1)
    test = pd.concat([X_test, y_test.reset_index(drop=True)], axis=1)

    # save output (loop variable is named `frame` so it does not shadow `data`)
    for output, frame in zip([train_output, test_output], [train, test]):
        # Expand "~" up front so the directory created here is the one the CSV
        # is written into; os.makedirs does not expand it by itself.
        output = os.path.expanduser(output)
        directory = os.path.dirname(output)

        if directory:
            # exist_ok avoids the check-then-create race of os.path.exists.
            os.makedirs(directory, exist_ok=True)

        frame.to_csv(output, index=False)


if __name__ == "__main__":
    # One entry per dataset: (input CSV, params class that provides
    # ``target_column``, train output path, test output path).
    jobs = [
        (
            "~/Projects/cs-7641-machine-learning/supervised_learning/data/healthcare-dataset-stroke-data_cleaned.csv",
            StrokeParams,
            "../data/stroke_train.csv",
            "../data/stroke_test.csv",
        ),
        (
            "~/Projects/cs-7641-machine-learning/supervised_learning/data/fetal_health.csv",
            FetalHealthParams,
            "../data/fetal_health_train.csv",
            "../data/fetal_health_test.csv",
        ),
    ]

    # Datasets are processed in order with the default test_size/random_state.
    for source_csv, params, train_csv, test_csv in jobs:
        preprocess_data(
            data=source_csv,
            target_column=params.target_column,
            train_output=train_csv,
            test_output=test_csv,
        )