In [1]:
import polars as pl
import os
from sklearn.model_selection import train_test_split
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [None]:
def split_and_save_data(train_size, path_to_clean_csv, code=''):
    """Split a cleaned CSV into train/test sets and write both to CSV files.

    The target column name is taken from
    ``read_target_var_label_enc_status(code)[0][1]`` (helper defined outside
    this cell). The output files are written to ``intermediate/{code}`` as
    ``XY_train.csv`` and ``XY_test.csv``, each holding the feature columns
    followed by the target column.

    Args:
        train_size: Percentage of rows to put in the training set. Converted
            with ``int()`` and must be strictly between 0 and 100.
        path_to_clean_csv: Path of the CSV file to read with ``pl.read_csv``.
        code: Identifier passed to the target-lookup helper and used as the
            sub-directory name under ``intermediate/``.

    Returns:
        dict: ``{"status": "succ" | "err", "message": str, "data": None}``.
        Failures are reported through this dict rather than raised.
    """
    result = {"status": "err", "message": "", "data": None}

    # Parse defensively: a non-numeric or missing value must come back as an
    # error result like every other failure, not as an uncaught exception.
    try:
        train_pct = int(train_size)
    except (TypeError, ValueError):
        result["message"] = "Error: Training size percentage must be between 0 and 100."
        return result

    if not 0 < train_pct < 100:
        result["message"] = "Error: Training size percentage must be between 0 and 100."
        return result

    try:
        df = pl.read_csv(path_to_clean_csv)
        target_column = read_target_var_label_enc_status(code)[0][1]

        # Keep the output layout: all feature columns first, target last.
        feature_columns = [name for name in df.columns if name != target_column]
        ordered = df.select(feature_columns + [target_column])

        # Split row positions instead of the data itself. The shuffle depends
        # only on the row count and random_state, and the rows can then be
        # picked straight from the polars frame without a pandas round trip.
        row_ids = list(range(ordered.height))
        train_ids, test_ids = train_test_split(
            row_ids, train_size=train_pct / 100, random_state=42
        )

        train_df = ordered[train_ids]
        test_df = ordered[test_ids]

        intermediate_dir = f"intermediate/{code}"
        os.makedirs(intermediate_dir, exist_ok=True)

        # Save to CSV files
        train_df.write_csv(f"{intermediate_dir}/XY_train.csv")
        test_df.write_csv(f"{intermediate_dir}/XY_test.csv")

        result["status"] = "succ"
        result["message"] = "Data has been successfully split and saved to CSV files."
        return result

    except Exception as e:
        result["message"] = f"An error occurred: {e}"
        return result


In [None]:
def random_forest(df_train, df_test, n_estimators, max_depth, min_samples_split, code="", random_state=None):
    """Train a random-forest classifier, score it, and persist the model.

    The first entry returned by ``read_target_var_label_enc_status(code)``
    (helper defined outside this cell) is unpacked as
    ``(enco_status, target)``. When ``enco_status`` is not 0, the target
    values are label-encoded (fitted on the training target) before training.

    The fitted model is written with ``joblib.dump`` to
    ``output/{code}_random_forest.pkl``, or to
    ``output/{code}_enco_random_forest.pkl`` when label encoding was applied.

    Args:
        df_train: Polars DataFrame with the training rows, including the
            target column.
        df_test: Polars DataFrame with the test rows, including the target
            column.
        n_estimators: Number of trees; converted with ``int()``.
        max_depth: Maximum tree depth; converted with ``int()``. Either
            ``None`` or the string ``"None"`` means no limit.
        min_samples_split: Converted with ``int()`` and passed to the
            classifier.
        code: Identifier passed to the target-lookup helper and used as the
            model filename prefix.
        random_state: Optional seed passed to ``RandomForestClassifier``.
            The default ``None`` keeps the model unseeded.

    Returns:
        dict: ``{'train_accuracy': ..., 'test_accuracy': ...}`` on success.
        On any failure both accuracies are ``None`` and an ``'error'`` key
        carries the message.
    """
    try:
        # Parse inside the try block so malformed hyperparameters are reported
        # through the error dict like every other failure.
        n_estimators = int(n_estimators)
        min_samples_split = int(min_samples_split)
        max_depth = None if max_depth in (None, "None") else int(max_depth)

        enco_status, target = read_target_var_label_enc_status(code)[0]

        X_train = df_train.drop([target]).to_numpy()
        X_test = df_test.drop([target]).to_numpy()
        y_train = df_train[target].to_numpy()
        y_test = df_test[target].to_numpy()

        if enco_status == 0:
            model_filename = f"output/{code}_random_forest.pkl"
        else:
            # Encode on the numpy arrays directly; the features are unaffected,
            # so there is no need to rewrite the target column in the frames.
            # NOTE(review): the fitted encoder is not saved with the model, so
            # predictions from the pickled model are encoded labels.
            le = LabelEncoder()
            y_train = le.fit_transform(y_train)
            y_test = le.transform(y_test)
            model_filename = f"output/{code}_enco_random_forest.pkl"

        rf_clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=random_state,
        )
        rf_clf.fit(X_train, y_train)

        train_accuracy = accuracy_score(y_train, rf_clf.predict(X_train))
        test_accuracy = accuracy_score(y_test, rf_clf.predict(X_test))

        os.makedirs('output', exist_ok=True)
        joblib.dump(rf_clf, model_filename)

        return {
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }

    except Exception as e:
        return {
            'train_accuracy': None,
            'test_accuracy': None,
            'error': f"An error occurred: {e}"
        }
