## H2O Model Trainer
(30x rule - leader model metrics)

In this notebook, once a dataset is uploaded, the 30x rule (num_rows = num_cols * 30) is applied to cap its size, which helps avoid overfitting and very large training sets. H2OAutoML is then trained with only three algorithms: GBM, GLM, and XGBoost. Once training is done, the leader model's metrics are extracted; if they fall below a fixed threshold, a second H2OAutoML run is trained using only the DeepLearning algorithm. The better of the two leaders is saved as the model to use.

In [1]:
!pip install fastapi nest-asyncio pyngrok uvicorn h2o

Collecting fastapi
  Downloading fastapi-0.111.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m617.1 kB/s[0m eta [36m0:00:00[0m
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Collecting uvicorn
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h2o
  Downloading h2o-3.46.0.4.tar.gz (265.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.3/265.3 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting starlette<0.38.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi-cli>=0.0.2 (from fastapi)
  Downloading fasta

In [2]:
import os
from getpass import getpass

from pyngrok import ngrok

# The ngrok authtoken is a secret: read it from the NGROK_AUTHTOKEN environment
# variable (or prompt for it interactively) instead of committing it to the notebook.
# NOTE(review): the token that used to be hardcoded in this cell has been exposed
# and should be revoked in the ngrok dashboard.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [5]:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import numpy as np
from io import StringIO
import nest_asyncio
from pyngrok import ngrok
import uvicorn

In [6]:
# Connect to an H2O instance; when none is running at the default local
# address, a local H2O server is started (see the cell output below).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.23" 2024-04-16; OpenJDK Runtime Environment (build 11.0.23+9-post-Ubuntu-1ubuntu122.04.1); OpenJDK 64-Bit Server VM (build 11.0.23+9-post-Ubuntu-1ubuntu122.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpkvj41ga9
  JVM stdout: /tmp/tmpkvj41ga9/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpkvj41ga9/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,9 days
H2O_cluster_name:,H2O_from_python_unknownUser_yv40m6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [7]:
# Application object that the route decorators in the following cells attach to.
app = FastAPI()

# CORS policy: every origin, method and header is accepted, with credentials enabled.
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive for a server exposed through a public tunnel — confirm that the
# front end really needs credentialed cross-origin requests.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [8]:
@app.get("/")
async def root():
    """Return a constant greeting string for GET /."""
    return "Hello World!"

In [9]:
def _30x_rule(df):
    num_rows, num_cols = df.shape
    max_rows = 30 * num_cols
    df_non_null = df.dropna()

    if len(df_non_null) > max_rows:
        df_non_null = df_non_null.sample(n=max_rows, random_state=42)
        print("num rows extracted (non-null): ", max_rows)
    else:
        print("num rows extracted (null): ",len(df_non_null))

    print(df_non_null)

    return df_non_null

In [10]:
# Leader-quality thresholds below which a DeepLearning AutoML run is also tried.
_MIN_LEADER_AUC = 0.9
_MIN_LEADER_R2 = 0.81  # r2 < 0.81 corresponds to |r| < 0.9


def _finite_or_none(value):
    """Return ``value`` as a finite float, or None if it is missing, NaN or infinite."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if np.isfinite(number) else None


def _metric_json(model):
    """Return the model's metric dict, preferring cross-validation over training metrics."""
    performance = model.model_performance(xval=True)
    if performance is None:
        # No cross-validation metrics are stored for this model.
        performance = model.model_performance()
    return performance._metric_json


@app.post('/train')
async def train_model(file: UploadFile = File(...)):
    """Train an H2O AutoML model on an uploaded CSV and save the best leader.

    The last CSV column is the target and all other columns are predictors.
    The data is first reduced with ``_30x_rule``. The target is treated as a
    classification target when it is non-numeric or has fewer than 10 distinct
    values, otherwise as a regression target.

    AutoML is run with GLM, GBM and XGBoost. If the leader's AUC
    (classification) or r2 (regression) is below the threshold, a second
    AutoML run restricted to DeepLearning is trained and its leader replaces
    the first one when it scores strictly higher.

    Returns:
        JSON with ``modelpath`` (where the model was saved) and ``model_details``
        (id, algorithm, path, category and metrics). Metrics that are missing
        or not finite are reported as null. A 400 response with an ``error``
        message is returned when the upload cannot be used for training.
    """
    try:
        df = pd.read_csv(StringIO((await file.read()).decode('utf-8')))
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        return JSONResponse(status_code=400, content={'error': f'Could not parse CSV: {exc}'})

    df = _30x_rule(df)
    if df.empty or df.shape[1] < 2:
        return JSONResponse(
            status_code=400,
            content={'error': 'Need at least one predictor column, a target column '
                              'and one row without missing values.'},
        )

    h2o_df = h2o.H2OFrame(df)
    y = h2o_df.columns[-1]
    x = [column for column in h2o_df.columns if column != y]

    target_unique_values = h2o_df[y].unique().nrow
    target_is_numeric = h2o_df[y].isnumeric()[0]
    if not target_is_numeric or target_unique_values < 10:
        prob_type = "classification"
        h2o_df[y] = h2o_df[y].asfactor()
    else:
        prob_type = "regression"

    if prob_type == "classification":
        metric_key, threshold = 'AUC', _MIN_LEADER_AUC
    else:
        metric_key, threshold = 'r2', _MIN_LEADER_R2

    include_algos = ["GLM", "GBM", "XGBoost"]
    aml = H2OAutoML(max_models=10, seed=1, include_algos=include_algos)
    aml.train(x=x, y=y, training_frame=h2o_df)
    print(aml.leader)

    model = aml.leader
    model_metrics = _metric_json(model)
    score = _finite_or_none(model_metrics.get(metric_key))

    # A leader whose score is unavailable cannot be compared against the
    # threshold, so the DeepLearning run is only attempted for a known low score.
    if score is not None and score < threshold:
        print("DeepLearning included")
        aml2 = H2OAutoML(max_models=3, seed=1, include_algos=["DeepLearning"])
        aml2.train(x=x, y=y, training_frame=h2o_df)
        print(aml2.leader)
        challenger = aml2.leader
        if challenger is not None:
            challenger_metrics = _metric_json(challenger)
            challenger_score = _finite_or_none(challenger_metrics.get(metric_key))
            if challenger_score is not None and challenger_score > score:
                model, model_metrics = challenger, challenger_metrics

    model_path = h2o.save_model(model=model, path="./models", force=True)

    # (response key, metric key) pairs, in the order they appear in the response.
    if prob_type == "classification":
        reported = [('AUC', 'AUC'), ('logloss', 'logloss'), ('MSE', 'MSE')]
    else:
        reported = [('MSE', 'MSE'), ('RMSE', 'RMSE'), ('R2', 'r2')]

    model_details = {
        'model_id': model.model_id,
        'model_type': model.algo,
        'model_path': model_path,
        'model_category': model_metrics['model_category'],
    }
    for response_key, key in reported:
        # JSONResponse rejects NaN/inf, so non-finite metrics are sent as null.
        model_details[response_key] = _finite_or_none(model_metrics.get(key))

    return JSONResponse(content={'modelpath': model_path, 'model_details': model_details})

In [11]:
# Directory that /train saves models into; /predict only loads models from here.
_MODELS_DIR = "./models"


@app.post('/predict')
async def predict_model(modelpath: str = Form(...), file: UploadFile = File(...)):
    """Score an uploaded CSV with a previously saved H2O model.

    Args:
        modelpath: Path of a saved model, as returned by /train. It is supplied
            by the client, so it is only accepted when it resolves to an
            existing file inside the models directory.
        file: CSV file containing the rows to score.

    Returns:
        JSON list with one record per input row, holding the model's prediction
        columns. Missing prediction values are sent as null. A 400 response with
        an ``error`` message is returned for an unacceptable ``modelpath`` or an
        unreadable CSV.
    """
    import os

    models_root = os.path.realpath(_MODELS_DIR)
    requested = os.path.realpath(modelpath)
    try:
        inside_models_dir = os.path.commonpath([models_root, requested]) == models_root
    except ValueError:
        # Raised when the two paths cannot be compared (e.g. different drives).
        inside_models_dir = False
    if not inside_models_dir or not os.path.isfile(requested):
        return JSONResponse(
            status_code=400,
            content={'error': 'modelpath must point to a model saved by /train'},
        )

    try:
        input_df = pd.read_csv(StringIO((await file.read()).decode('utf-8')))
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        return JSONResponse(status_code=400, content={'error': f'Could not parse CSV: {exc}'})

    h2o_input_df = h2o.H2OFrame(input_df)
    model = h2o.load_model(requested)
    predictions_df = model.predict(h2o_input_df).as_data_frame()

    # JSONResponse rejects NaN, so missing predictions are converted to None (null).
    records = (
        predictions_df.astype(object)
        .where(predictions_df.notna(), None)
        .to_dict(orient="records")
    )
    return JSONResponse(content=records)

In [None]:
# Single source of truth for the port shared by the tunnel and the server.
PORT = 8000

ngrok_tunnel = ngrok.connect(PORT)
print('Public URL:', ngrok_tunnel.public_url)
# Patch asyncio so uvicorn can run inside the notebook's already-running event loop.
nest_asyncio.apply()
try:
    uvicorn.run(app, port=PORT)
finally:
    # Close the tunnel when the server stops (e.g. the cell is interrupted), so
    # re-running this cell does not accumulate open ngrok tunnels.
    ngrok.disconnect(ngrok_tunnel.public_url)

INFO:     Started server process [606]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://9769-104-154-114-175.ngrok-free.app
num rows extracted (non-null):  330
          age       sex       bmi        bp        s1        s2        s3  \
287  0.045341 -0.044642 -0.006206 -0.015999  0.125019  0.125198  0.019187   
211  0.092564 -0.044642  0.036907  0.021872 -0.024960 -0.016658  0.000779   
72   0.063504  0.050680 -0.004050 -0.012556  0.103003  0.048790  0.056003   
321  0.096197 -0.044642  0.051996  0.079265  0.054845  0.036577 -0.076536   
73   0.012648  0.050680 -0.020218 -0.002228  0.038334  0.053174 -0.006584   
..        ...       ...       ...       ...       ...       ...       ...   
267  0.059871 -0.044642 -0.000817 -0.084856  0.075484  0.079478  0.004460   
200  0.056239 -0.044642 -0.057941 -0.007977  0.052093  0.049103  0.056003   
394  0.034443 -0.044642  0.018584  0.056301  0.012191 -0.054549 -0.069172   
27  -0.023677 -0.044642  0.059541 -0.040099 -0.042848 -0.043589  0.011824   
383  0.005383  0.050680 -0.028840 -0.009113 -0.031840 -0.0288