# Models
This notebook focuses on the H2O models. It reads preprocessed data from disk and conducts further analysis.

In [None]:
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator, H2ORandomForestEstimator, H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch

# Grid-search script: for each H2O estimator class listed below, run a
# Cartesian grid search over its hyperparameter grid, print the grid sorted by
# MSE, and save every resulting model to disk.

# Initialize H2O
h2o.init()

# Load the data
data = h2o.import_file("your_data.csv")

# Split the data into a training set and a test set
# NOTE(review): no seed is passed to split_frame, so the split presumably
# differs from run to run — confirm whether reproducibility is required.
train, test = data.split_frame(ratios=[0.8])

# Define x (features) and y (target): every column except the target is a feature
x = list(train.columns)
y = "target_column"
x.remove(y)

# Define the list of models and their hyperparameter grids
models_and_hyperparams = [
    {
        'model': H2OGeneralizedLinearEstimator,
        'hyper_params': {
            'alpha': [0.01, 0.1, 0.5, 0.9, 0.99],
            'lambda': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
        }
    },
    {
        'model': H2ORandomForestEstimator,
        'hyper_params': {
            'ntrees': [50, 100, 150],
            'max_depth': [3, 5, 7]
        }
    },
    {
        'model': H2ODeepLearningEstimator,
        'hyper_params': {
            'hidden': [[50, 50], [100, 100], [200, 200]],
            'epochs': [10, 50, 100]
        }
    }
]

# Run a grid search for each model and its hyperparameter grid
for model_info in models_and_hyperparams:
    model = model_info['model']
    hyper_params = model_info['hyper_params']

    # Initialize the grid search (Cartesian: every combination in the grid)
    grid = H2OGridSearch(
        model=model,
        hyper_params=hyper_params,
        search_criteria={"strategy": "Cartesian"}
    )

    # Train multiple models
    # NOTE(review): the held-out `test` frame is passed as the validation
    # frame here — confirm that using it for model selection is intended.
    grid.train(x=x, y=y, training_frame=train, validation_frame=test)

    # Get the grid search results, sorted by MSE in ascending order
    grid_results = grid.get_grid(sort_by="mse", decreasing=False)

    # Print the results
    print(grid_results)

    # Save the models to disk (force=True is passed to h2o.save_model)
    for model_id in grid_results.model_ids:
        model_to_save = h2o.get_model(model_id)
        h2o.save_model(model=model_to_save, path="your_model_directory", force=True)


# Libraries

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import webbrowser
import pandas as pd
import importlib
import subprocess
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, roc_curve, auc,
                             precision_recall_curve, average_precision_score)
import h2o
from h2o.estimators import H2ORandomForestEstimator
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

from h2o.estimators import H2OGeneralizedLinearEstimator

import os
import hashlib

# Functions

In [None]:
def binary_classification_metrics(y_true, y_pred):
    """
    Compute accuracy, the confusion matrix and the classification report, print them, and return them.

    :param y_true: A pandas DataFrame or Series containing the true target values.
    :param y_pred: A pandas DataFrame or Series containing the predicted target values.
    :return: A dictionary with the keys 'accuracy', 'confusion_matrix' and 'classification_report'.
    """
    # Evaluate all three metrics up front, in the order they are reported
    results = {
        'accuracy': accuracy_score(y_true, y_pred),
        'confusion_matrix': confusion_matrix(y_true, y_pred),
        'classification_report': classification_report(y_true, y_pred),
    }

    # Report each metric under its heading
    report_layout = (
        ('Accuracy:', 'accuracy'),
        ('Confusion Matrix:\n', 'confusion_matrix'),
        ('Classification Report:\n', 'classification_report'),
    )
    for heading, key in report_layout:
        print(heading, results[key])

    return results


In [None]:
def evaluate_predictions(y_true,
                         y_pred,
                         positive_class=1,
                         roc_file='roc_plot.html',
                         pr_file='pr_plot.html'):
    """
    Compute AUC and average precision, plot the ROC and Precision-Recall curves with Plotly,
    save both plots as HTML files, and open them in the default web browser.

    :param y_true: A pandas DataFrame or Series containing the true target values.
    :param y_pred: A pandas DataFrame or Series containing the predictions. It is passed to
        scikit-learn as the score argument, so continuous scores or probabilities for the
        positive class give the most informative curves.
    :param positive_class: The label of the positive class (default: 1).
    :param roc_file: The name of the HTML file to save the ROC curve plot (default: 'roc_plot.html').
    :param pr_file: The name of the HTML file to save the Precision-Recall curve plot (default: 'pr_plot.html').
    :return: A dictionary with the keys 'auc' and 'average_precision'.
    """
    # Results are accumulated here and returned to the caller. This must be a
    # local dict: the function previously wrote to an undefined name.
    metrics = {}

    # Calculate ROC curve and AUC
    fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label=positive_class)
    metrics['auc'] = auc(fpr, tpr)

    # Calculate Precision-Recall curve and average precision.
    # pos_label is forwarded to both calls so the curve and the summary score
    # are computed for the same positive class.
    precision, recall, _ = precision_recall_curve(y_true,
                                                  y_pred,
                                                  pos_label=positive_class)
    metrics['average_precision'] = average_precision_score(
        y_true, y_pred, pos_label=positive_class)

    # Create ROC curve plot
    fig_roc = make_subplots(rows=1, cols=1, subplot_titles=('ROC Curve', ))
    fig_roc.add_trace(go.Scatter(x=fpr,
                                 y=tpr,
                                 mode='lines',
                                 name='ROC Curve',
                                 line=dict(color='blue')),
                      row=1,
                      col=1)
    # Diagonal reference line: the ROC curve of a random classifier
    fig_roc.add_trace(go.Scatter(x=[0, 1],
                                 y=[0, 1],
                                 mode='lines',
                                 name='Random',
                                 line=dict(color='black', dash='dash')),
                      row=1,
                      col=1)
    fig_roc.update_layout(title=f'ROC Curve (AUC = {metrics["auc"]:.4f})',
                          xaxis_title='False Positive Rate',
                          yaxis_title='True Positive Rate',
                          showlegend=True,
                          legend=dict(orientation='h',
                                      yanchor='bottom',
                                      xanchor='right',
                                      y=1.02,
                                      x=1))

    # Save the ROC curve plot as an HTML file
    pio.write_html(fig_roc, file=roc_file)

    # Create Precision-Recall curve plot
    fig_pr = make_subplots(rows=1,
                           cols=1,
                           subplot_titles=('Precision-Recall Curve', ))
    fig_pr.add_trace(go.Scatter(x=recall,
                                y=precision,
                                mode='lines',
                                name='Precision-Recall Curve',
                                line=dict(color='blue')),
                     row=1,
                     col=1)
    fig_pr.update_layout(
        title=
        f'Precision-Recall Curve (Avg. Precision = {metrics["average_precision"]:.4f})',
        xaxis_title='Recall',
        yaxis_title='Precision',
        showlegend=True,
        legend=dict(orientation='h',
                    yanchor='bottom',
                    xanchor='right',
                    y=1.02,
                    x=1))

    # Save the Precision-Recall curve plot as an HTML file
    pio.write_html(fig_pr, file=pr_file)

    # Print the summary metrics
    print(f'AUC: {metrics["auc"]:.4f}')
    print(f'Average Precision: {metrics["average_precision"]:.4f}')

    # Open the HTML files in the default web browser
    webbrowser.open(roc_file)
    webbrowser.open(pr_file)

    return metrics

In [None]:
def save_h2o_model_to_disk(model, directory, filename):
    """
    Save a trained H2O model as ``<parent of cwd>/<directory>/<filename>``.

    The target directory is resolved relative to the parent of the current
    working directory and is created if it does not exist.

    Args:
        model (H2OEstimator): The H2O estimator object to save.
        directory (str): The name of the directory to save the model in.
        filename (str): The name of the file to save the model in.

    Returns:
        str: The full path to the saved model, as reported by ``h2o.save_model``.

    Example:
        >>> model = h2o.estimators.random_forest.H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10, seed=42)
        >>> data = h2o.import_file('example.csv')
        >>> predictors = ['a', 'b']
        >>> response = 'c'
        >>> model.train(x=predictors, y=response, training_frame=data)
        >>> directory = 'models'
        >>> filename = 'rf_model'
        >>> saved_model_path = save_h2o_model_to_disk(model, directory, filename)
        >>> loaded_model = h2o.load_model(saved_model_path)
    """
    # Navigate up one level from cwd, then into the target directory
    parent_dir = os.path.dirname(os.getcwd())
    dir_path = os.path.join(parent_dir, directory)

    # Create the directory if it does not exist
    os.makedirs(dir_path, exist_ok=True)

    # h2o.save_model treats `path` as a DIRECTORY and names the file after the
    # model id unless `filename` is given. Passing the joined path as `path`
    # would create a folder called `filename` and return a location that
    # h2o.load_model cannot open, so the file name is passed separately and
    # the path H2O actually wrote is returned.
    return h2o.save_model(model, path=dir_path, filename=filename)


In [None]:
def load_h2o_model_from_disk(directory, filename):
    """
    Load an H2O model stored at ``<parent of cwd>/<directory>/<filename>``.

    Args:
        directory (str): The name of the directory where the model is saved.
        filename (str): The name of the file where the model is saved.

    Returns:
        H2OEstimator: The loaded H2O estimator object.

    Example:
        >>> directory = 'models'
        >>> filename = 'rf_model'
        >>> loaded_model = load_h2o_model_from_disk(directory, filename)
        >>> predictions = loaded_model.predict(data)
    """
    # The model location is resolved relative to the parent of the current
    # working directory: <parent>/<directory>/<filename>
    base_dir = os.path.dirname(os.getcwd())
    model_path = os.path.join(base_dir, directory, filename)

    return h2o.load_model(model_path)


# Parameters 2: Parameters for model training

## Model directory

In [None]:
# Directory name for models.
# NOTE(review): presumably passed as `directory` to save_h2o_model_to_disk /
# load_h2o_model_from_disk, which resolve it relative to the parent of the
# current working directory — confirm against the callers.
model_directory = "model"