In [1]:
# """
# Main file for the BarModels directory
# """
#---------------------------- Imports basic -------------------------------
import sys
import os
import logging
import time

import pandas as pd
import numpy as np


import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')
# Wall-clock reference for the elapsed-time figures written to the log.
startTime = time.time()

# Project root: three directory levels above abspath(__name__). abspath()
# resolves the bare module name against the current working directory, so the
# result is anchored on the directory the kernel/script was started from.
# (e.g. root = '/home/labs/mayalab/barc/MSc_studies/ML_Project')
root = os.path.dirname(
    os.path.dirname(
        os.path.dirname(os.path.abspath(__name__))
    )
)
if root not in sys.path:
    # Expose the project root so the `pyScripts...` imports can be resolved.
    sys.path.append(root)

# Directory used for the CSV inputs and the "logs" folder: a fixed location when
# executed as __main__, otherwise the directory derived from abspath(__name__).
here = (
    '/home/labs/mayalab/barc/MSc_studies/ML_Project/pyScripts/BarModels'
    if __name__ == "__main__"
    else os.path.dirname(os.path.abspath(__name__))
)

%reload_ext autoreload
%autoreload 2
#---------------------------- Imports for the model -------------------------------
from pyScripts.BarModels.Rendom_forest_BC import Rendom_forest_classification_BC_defultParams
from pyScripts.BarModels.Rendom_forest_BC import Rendom_forest_classification_BC_useingGridSearchCV
from pyScripts.BarModels.Rendom_forest_BC import Rendom_forest_classification_BC_useing_Optuna

#---------------------------- Logger -------------------------------
def initialize_logging():
    """Route root-logger output to a new timestamped file under ``<here>/logs``.

    The log directory is created when missing. The file name is the current
    local time formatted as ``YYYY-MM-DD_HH-MM-SS.log``; records are written
    at INFO level and above as ``<asctime> - <levelname> - <message>``.

    Reads the module-level ``here`` directory.

    Note: ``logging.basicConfig`` does nothing when the root logger already
    has handlers, so calling this a second time in the same process keeps the
    first log file.
    """
    log_directory = os.path.join(here, "logs")
    # exist_ok=True replaces the exists()/makedirs() pair, which could raise
    # FileExistsError if the directory appeared between the check and the create.
    os.makedirs(log_directory, exist_ok=True)
    log_file = os.path.join(log_directory, f"{time.strftime('%Y-%m-%d_%H-%M-%S')}.log")
    logging.basicConfig(filename=log_file,
                        level=logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s")

def log_message(message):
    """Log ``message`` at INFO level and flush the root logger's handlers.

    The record text is ``"<YYYY-MM-DD HH:MM:SS> - <message>"`` using the
    current local time (in addition to whatever the handler format adds).

    Args:
        message: Text to record.
    """
    current_time = time.localtime()
    time_string = time.strftime("%Y-%m-%d %H:%M:%S", current_time)
    logging.info(f"{time_string} - {message}")
    # Flush every handler, not just the first one: when the root logger has
    # several handlers, the file handler is not guaranteed to be handlers[0].
    for handler in logging.getLogger().handlers:
        handler.flush()

# Configure logging before any message is written.
initialize_logging()
# Write a first message to confirm the logger works, then mark the start of data
# loading; both messages include the seconds elapsed since startTime.
log_message(f" the logger is working, the time is: {time.time()-startTime}")
log_message(f"starting to load the data, the time is: {time.time()-startTime}")
# ---------------------------- data incoming -------------------------------
def load_data():
    """Read the train/test CSV files from ``here`` and return numpy arrays.

    Files read: ``X_train_df.csv``, ``y_train.csv``, ``X_test_df.csv`` and
    ``y_test.csv``.

    The two feature frames are reduced to the columns they have in common and
    both are put in ``X_train``'s column order before conversion. The arrays
    are positional, so a different column order or a column present in only
    one frame would otherwise misalign features between train and test.

    Returns:
        tuple: ``(X_train_np, y_train_np, X_test_np, y_test_np)`` where the
        ``X`` arrays are 2-D feature matrices with identical column layout and
        the ``y`` arrays are flattened to 1-D.
    """
    # Load the data
    X_train = pd.read_csv(os.path.join(here, 'X_train_df.csv'))
    y_train = pd.read_csv(os.path.join(here, 'y_train.csv'))
    X_test = pd.read_csv(os.path.join(here, 'X_test_df.csv'))
    y_test = pd.read_csv(os.path.join(here, 'y_test.csv'))

    # Keep only the features present in both frames, in X_train's order, and
    # apply that same selection to X_test so column i means the same thing
    # in both matrices.
    test_cols = set(X_test.columns)
    shared_cols = [col for col in X_train.columns if col in test_cols]
    X_train = X_train[shared_cols]
    X_test = X_test[shared_cols]

    # Convert to numpy; label frames are flattened to 1-D vectors.
    X_train_np = X_train.values
    y_train_np = y_train.values.ravel()
    X_test_np = X_test.values
    y_test_np = y_test.values.ravel()

    return X_train_np, y_train_np, X_test_np, y_test_np

# Load the feature matrices and label vectors once at module level; the
# model-running functions further down read these names as globals.
X_train_np, y_train, X_test_np, y_test = load_data()
log_message(f"finished loading the data, the time is: {time.time()-startTime}")


In [2]:
# ---------------------------- Random forest -------------------------------
# Record the start of the default-parameter random-forest cell (seconds since startTime).
log_message(f"starting to run the Rendom_forest model,the time is: {time.time()-startTime}")
def run_Rendom_forest():
    """Build and evaluate the default-parameter random-forest wrapper.

    Uses the module-level ``X_train_np``, ``y_train``, ``X_test_np`` and
    ``y_test``. Prints accuracy / weighted F1 / binary F1 and log-loss /
    ROC-AUC for the train and the test split.

    Returns:
        tuple: ``(classifier_fit, params, confu)`` - the second element
        returned by ``build_RandomForestClassifier()``, the result of
        ``get_params(classifier)``, and the result of
        ``make_confusion_matrix`` on the test predictions.
    """
    # Create the wrapper object that uses the default parameters.
    rf = Rendom_forest_classification_BC_defultParams(X_train_np, y_train, X_test_np, y_test)

    # Build exactly once. Calling build_RandomForestClassifier() separately for
    # [0] and [1] repeated the whole build and paired a classifier from one
    # build with the fit result of another.
    build_result = rf.build_RandomForestClassifier()
    classifier, classifier_fit = build_result[0], build_result[1]

    # Hard predictions and probability outputs for both splits.
    predictions_On_TrainDS = rf.predict_RandomForestClassifierTrainData(classifier_fit)
    predictions_On_TestDS = rf.predict_RandomForestClassifierTestData(classifier_fit)
    predictions_On_TrainDS_proba = rf.predict_RandomForestClassifierTrainData_proba(classifier_fit)
    predictions_On_TestDS_proba = rf.predict_RandomForestClassifierTestData_proba(classifier_fit)

    # Label-based metrics.
    accuracy, f1_weighted, f1_binary = rf.accuracy_score(predictions_On_TrainDS, y_train)
    print(f"the results on the train data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")
    accuracy, f1_weighted, f1_binary = rf.accuracy_score(predictions_On_TestDS, y_test)
    print(f"the results on the test data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")

    # Probability-based metrics.
    log_loss, roc_auc = rf.accuracy_score_proba(predictions_On_TrainDS_proba, y_train)
    print(f"the results on the train data are: log_loss: {log_loss}, roc_auc: {roc_auc}")
    log_loss, roc_auc = rf.accuracy_score_proba(predictions_On_TestDS_proba, y_test)
    print(f"the results on the test data are: log_loss: {log_loss}, roc_auc: {roc_auc}")

    params = rf.get_params(classifier)

    confu = rf.make_confusion_matrix(predictions_On_TestDS, y_test)

    return classifier_fit, params , confu
# Run the model first, then log completion, so the "finished" entry and its
# elapsed time are recorded after the run instead of before it starts.
classifier_fit, params , confu = run_Rendom_forest()
log_message(f"finished running the Rendom_forest model,the time is: {time.time()-startTime}")


the results on the train data are: accuracy: 0.9957926067465275, f1_weighted: 0.9957926986581138, f1_binary: 0.9957356473808682
the results on the test data are: accuracy: 0.8394422310756973, f1_weighted: 0.8292584113830209, f1_binary: 0.9111225464970963
the results on the train data are: log_loss: 0.06235000835838471, roc_auc: 0.999904797898002
the results on the test data are: log_loss: 1.0901229066331197, roc_auc: 0.5999700147288514


In [3]:
def run_Rendom_forest_with_GridSearchCV():
    """Build and evaluate the grid-search random-forest wrapper.

    Uses the module-level ``X_train_np``, ``y_train``, ``X_test_np`` and
    ``y_test``. Prints the best parameters and, for the train and test splits,
    accuracy / weighted F1 / binary F1 and log-loss / ROC-AUC.

    Returns:
        tuple: ``(classifier_fit, best_rf_classifier, updated_param_grid,
        params, confu)``. ``params`` and ``confu`` were previously computed
        but not returned, although the notebook's call site unpacks five
        values.
    """
    # Create the wrapper instance that uses GridSearchCV.
    rf_GS_CV = Rendom_forest_classification_BC_useingGridSearchCV(X_train_np, y_train, X_test_np, y_test)

    # Build the model; the first returned element is not needed here.
    _, classifier_fit, best_rf_classifier = rf_GS_CV.build_RandomForestClassifierWithGridSearchCV()

    # Update the parameter grid.
    updated_param_grid = rf_GS_CV.update_parameter_grid()

    # Get the best parameters.
    best_params = rf_GS_CV.get_best_params()
    print(f"the best parameters are: {best_params}")

    # One call per argument instead of four; the result is indexed as before
    # ([0]/[2] from the y_train call, [1]/[3] from the y_test call).
    # NOTE(review): assumes predict_RandomForestClassifier returns the same
    # values for repeated calls with the same argument - confirm in Rendom_forest_BC.
    train_outputs = rf_GS_CV.predict_RandomForestClassifier(y_train)
    test_outputs = rf_GS_CV.predict_RandomForestClassifier(y_test)
    predictions_On_TrainDS = train_outputs[0]
    predictions_On_TestDS = test_outputs[1]
    predictions_On_TrainDS_proba = train_outputs[2]
    predictions_On_TestDS_proba = test_outputs[3]

    # Elements 0-2 of accuracy_score() are used as the label metrics and
    # elements 3-4 as the probability metrics, exactly as in the original calls.
    accuracy, f1_weighted, f1_binary = rf_GS_CV.accuracy_score(predictions_On_TrainDS, y_train)[0:3]
    print(f"the results on the train data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")
    log_loss, roc_auc = rf_GS_CV.accuracy_score(predictions_On_TrainDS_proba, y_train)[3:5]
    print(f"the results on the train data are: log_loss: {log_loss}, roc_auc: {roc_auc}")

    accuracy, f1_weighted, f1_binary = rf_GS_CV.accuracy_score(predictions_On_TestDS, y_test)[0:3]
    print(f"the results on the test data are: accuracy: {accuracy}, f1_weighted: {f1_weighted}, f1_binary: {f1_binary}")
    log_loss, roc_auc = rf_GS_CV.accuracy_score(predictions_On_TestDS_proba, y_test)[3:5]
    print(f"the results on the test data are: log_loss: {log_loss}, roc_auc: {roc_auc}")

    # Reuse the best parameters fetched above rather than querying them again.
    params = best_params
    confu = rf_GS_CV.make_confusion_matrix(predictions_On_TestDS, y_test)

    return classifier_fit, best_rf_classifier , updated_param_grid , params , confu
# Run the grid-search variant. This unpacks five values, so it only succeeds when
# run_Rendom_forest_with_GridSearchCV() returns a 5-tuple. It also rebinds
# classifier_fit, params and confu, replacing the results of the default-parameter run.
classifier_fit, best_rf_classifier , updated_param_grid , params , confu = run_Rendom_forest_with_GridSearchCV()


Fitting 3 folds for each of 2880 candidates, totalling 8640 fits


: 