In [6]:
# """
# Main file for the BarModels directory
# """
#---------------------------- Imports basic -------------------------------
import sys
import os
import logging
import time

import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')
startTime = time.time()
root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__name__))))
              # root = '/home/labs/mayalab/barc/MSc_studies/ML_Project'
if root not in sys.path:
    sys.path.append(root)
else:
    pass

if __name__ == "__main__":
    here = '/home/labs/mayalab/barc/MSc_studies/ML_Project/pyScripts/BarModels'
else:
    here = os.path.dirname(os.path.abspath(__name__))

%reload_ext autoreload
%autoreload 2
#---------------------------- Imports for the model -------------------------------
from pyScripts.BarModels.Rendom_forest_BC import Rendom_forest_classification_BC_defultParams
from pyScripts.BarModels.Rendom_forest_BC import Rendom_forest_classification_BC_useingGridSearchCV
from pyScripts.BarModels.Rendom_forest_BC import Rendom_forest_classification_BC_useing_Optuna

#---------------------------- Logger -------------------------------
def initialize_logging():
    log_directory = os.path.join(here, "logs")
    if not os.path.exists(log_directory):
        os.makedirs(log_directory)
    logging.basicConfig(filename=os.path.join(log_directory, f"{time.strftime('%Y-%m-%d_%H-%M-%S')}.log"),
                         level=logging.INFO,
                         format="%(asctime)s - %(levelname)s - %(message)s")

def log_message(message):
    current_time = time.localtime() 
    time_string = time.strftime("%Y-%m-%d %H:%M:%S", current_time)
    logging.info(f"{time_string} - {message}")
    logging.getLogger().handlers[0].flush()

# Initialize logging
initialize_logging()
# Log a message
log_message(f" the logger is working, the time is: {time.time()-startTime}")
log_message(f"starting to load the data, the time is: {time.time()-startTime}")
# ---------------------------- data incoming -------------------------------
def load_data():
    # Load the data
    X_train = pd.read_csv(os.path.join(here, 'X_train_df.csv'))
    y_train = pd.read_csv(os.path.join(here, 'y_train.csv'))
    X_test = pd.read_csv(os.path.join(here, 'X_test_df.csv'))
    y_test = pd.read_csv(os.path.join(here, 'y_test.csv'))

    # Compare the columns of X_train and X_test
    train_cols = set(X_train.columns)
    test_cols = set(X_test.columns)
    extra_cols = train_cols - test_cols

    # Remove any identified extra columns from X_train
    if extra_cols:
        X_train = X_train.drop(columns=list(extra_cols))

    # Convert DataFrame to numpy arrays and flatten
    X_train_np = X_train.values  # Converts DataFrame to numpy array
    y_train_np = y_train.values.ravel()  # Converts DataFrame to numpy array and flattens it
    X_test_np = X_test.values  # Converts DataFrame to numpy array
    y_test_np = y_test.values.ravel()  # Converts DataFrame to numpy array and flattens it

    return X_train_np, y_train_np, X_test_np, y_test_np

X_train_np, y_train, X_test_np, y_test = load_data()
log_message(f"finished loading the data, the time is: {time.time()-startTime}")


In [2]:
# ---------------------------- Rendom_forest -------------------------------
log_message(f"starting to run the Rendom_forest model,the time is: {time.time()-startTime}")
def run_Rendom_forest():
    #create a Rendom_forest_classification_BC object using the default parameters
    rf = Rendom_forest_classification_BC_defultParams(X_train_np, y_train, X_test_np, y_test)
    #build the model
    classifier = rf.build_RandomForestClassifier()[0]
    classifier_fit = rf.build_RandomForestClassifier()[1]
    predictions_On_TrainDS = rf.predict_RandomForestClassifierTrainData(classifier_fit)
    predictions_On_TestDS = rf.predict_RandomForestClassifierTestData(classifier_fit)
    predictions_On_TrainDS_proba = rf.predict_RandomForestClassifierTrainData_proba(classifier_fit)
    predictions_On_TestDS_proba = rf.predict_RandomForestClassifierTestData_proba(classifier_fit)
    
    accuracy, f1_weighted, f1_binary = rf.accuracy_score(predictions_On_TrainDS, y_train)
    print(f"the results on the train data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")
    accuracy, f1_weighted, f1_binary = rf.accuracy_score(predictions_On_TestDS, y_test)
    print(f"the results on the test data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")


    log_loss, roc_auc = rf.accuracy_score_proba(predictions_On_TrainDS_proba, y_train)
    print(f"the results on the train data are: log_loss: {log_loss}, roc_auc: {roc_auc}")
    log_loss, roc_auc = rf.accuracy_score_proba(predictions_On_TestDS_proba, y_test)
    print(f"the results on the test data are: log_loss: {log_loss}, roc_auc: {roc_auc}")

    params = rf.get_params(classifier)

    confu = rf.make_confusion_matrix(predictions_On_TestDS, y_test)

    return classifier_fit, params , confu
log_message(f"finished running the Rendom_forest model,the time is: {time.time()-startTime}")
classifier_fit, params , confu = run_Rendom_forest()


the results on the train data are: accuracy: 0.9957926067465275, f1_weighted: 0.9957926986581138, f1_binary: 0.9957356473808682
the results on the test data are: accuracy: 0.8394422310756973, f1_weighted: 0.8292584113830209, f1_binary: 0.9111225464970963
the results on the train data are: log_loss: 0.06235000835838471, roc_auc: 0.999904797898002
the results on the test data are: log_loss: 1.0901229066331197, roc_auc: 0.5999700147288514


In [3]:
def run_Rendom_forest_with_GridSearchCV():
    #create a Rendom_forest_classification_BC instance using the GridSearchCV
    rf_GS_CV = Rendom_forest_classification_BC_useingGridSearchCV(X_train_np, y_train, X_test_np, y_test)
    
    #build the model using the GridSearchCV
    classifier, classifier_fit, best_rf_classifier = rf_GS_CV.build_RandomForestClassifierWithGridSearchCV()
    
    #update the parameter grid
    updated_param_grid = rf_GS_CV.update_parameter_grid()

    #get the best parameters
    best_params = rf_GS_CV.get_best_params()
    print(f"the best parameters are: {best_params}")

    predictions_On_TrainDS = rf_GS_CV.predict_RandomForestClassifier(y_train)[0]
    predictions_On_TestDS = rf_GS_CV.predict_RandomForestClassifier(y_test)[1]
    predictions_On_TrainDS_proba = rf_GS_CV.predict_RandomForestClassifier(y_train)[2]
    predictions_On_TestDS_proba = rf_GS_CV.predict_RandomForestClassifier(y_test)[3]

    accuracy, f1_weighted, f1_binary = rf_GS_CV.accuracy_score(predictions_On_TrainDS, y_train)[0:3]
    print(f"the results on the train data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")
    log_loss, roc_auc = rf_GS_CV.accuracy_score(predictions_On_TrainDS_proba, y_train)[3:5]
    print(f"the results on the train data are: log_loss: {log_loss}, roc_auc: {roc_auc}")
    
    accuracy, f1_weighted, f1_binary = rf_GS_CV.accuracy_score(predictions_On_TestDS, y_test)[0:3]
    print(f"the results on the test data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")
    log_loss, roc_auc = rf_GS_CV.accuracy_score(predictions_On_TestDS_proba, y_test)[3:5]
    print(f"the results on the test data are: log_loss: {log_loss}, roc_auc: {roc_auc}")
    
    params = rf_GS_CV.get_best_params()
    confu = rf_GS_CV.make_confusion_matrix(predictions_On_TestDS, y_test)

    
    return classifier_fit, best_rf_classifier , updated_param_grid
classifier_fit, best_rf_classifier , updated_param_grid , params , confu = run_Rendom_forest_with_GridSearchCV()


Fitting 3 folds for each of 2880 candidates, totalling 8640 fits


: 

In [3]:
def Runrf_using_Optuna():
    rf_model = Rendom_forest_classification_BC_useing_Optuna(X_train_np, y_train, X_test_np, y_test)
    best_params = rf_model.build_RandomForestClassifierWithOptuna()[2]
    best_classifier = rf_model.build_RandomForestClassifierWithOptuna()[0]
    rf_model.evaluate_model(best_classifier)

    return best_params , best_classifier , rf_model
best_params = Runrf_using_Optuna()

2024-04-19 23:41:21.438092: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[I 2024-04-19 23:41:22,870] A new study created in memory with name: no-name-14865ff7-df9d-4007-a87a-f4f0d8bc1d47
2024-04-19 23:41:22.860838: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46088 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:ce:00.0, compute capability: 8.6
2024-04-19 23:41:22.864237: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 46034 MB memory:  -> device: 1, name: NVIDIA A40, pci bus id: 0000:c9:00.0, compute capability: 8.6
[W 2024-04-19 23:41:22,872] Trial 0 failed w

ValueError: No trials are completed yet.

In [13]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
import tensorflow as tf

def objective(trial, X_train_np, y_train, X_test_np, y_test ):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    criterion = trial.suggest_categorical('criterion', ["squared_error", "absolute_error", "friedman_mse", "poisson"])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        criterion=criterion,
        random_state= 21
    )

    
    with tf.device('/device:GPU:0'):
        model.fit(X_train_np, y_train)
        y_pred = model.predict(X_test_np)

    # metric  to optimize
    score = mean_squared_error(y_test, y_pred)
    
    return score

study = optuna.create_study(direction='minimize', sampler=optuna.samplers.RandomSampler(seed=42))
# study.optimize(objective, n_trials=2)

# Print the best parameters found 
print("Best trial:")
trial = study.best_trial

print("Value: {:.4f}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))



[I 2024-04-19 23:51:09,593] A new study created in memory with name: no-name-2b31723a-447b-4cdf-b18c-b19bc61f129a


Best trial:


ValueError: No trials are completed yet.