## H2O Model Trainer
(30x rule - categorical variables encoding - correlation matrix)

In this notebook, once a dataset is uploaded, the 30x rule (at most num_rows = num_cols * 30) is applied to it to avoid overfitting and overly large training sets. Categorical variables are then converted to numeric values and the correlation matrix is computed. If the maximum value in the correlation matrix is below a fixed threshold, Deep Learning is included alongside the GLM, GBM and XGBoost algorithms.

In [None]:
!pip install fastapi nest-asyncio pyngrok uvicorn h2o



In [None]:
# Register the ngrok auth token with the local ngrok configuration.
# The value below is a placeholder: substitute your own token before running,
# and never commit a real token to version control.
!ngrok authtoken 'YOUR_NGROK_AUTH_TOKEN'

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import numpy as np
from io import StringIO
import nest_asyncio
from pyngrok import ngrok
import uvicorn
import logging
import os

In [None]:
# Connect to the H2O cluster used for training and prediction.
# NOTE(review): per the H2O docs, init() starts a local instance when none is
# already running at the default address — confirm for your deployment.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,19 mins 32 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,6 days
H2O_cluster_name:,H2O_from_python_unknownUser_h8wxq7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.040 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [None]:
# Initialize FastAPI app
app = FastAPI()

# Open CORS policy so a browser front end served from any origin can call the API.
# Credentials are disabled on purpose: a wildcard origin must not be combined
# with credentialed requests (FastAPI requires explicit origins, methods and
# headers when allow_credentials=True), and none of the routes visible in this
# notebook read cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [None]:
@app.get("/")
async def root():
    """Landing route: answer ``GET /`` with a fixed greeting string."""
    greeting = "Hello World!"
    return greeting

In [None]:
def determine_correlation(df, threshold=0.85):
    """Tell whether all feature pairs are correlated below ``threshold``.

    Only numeric (and boolean) columns take part in the correlation matrix, so
    a stray text column no longer makes ``DataFrame.corr`` raise. The largest
    absolute pairwise correlation is taken from the strict upper triangle of
    the matrix, i.e. each pair is counted once and the diagonal is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns to inspect.
    threshold : float, default 0.85
        Upper bound (exclusive) on the maximum absolute correlation.

    Returns
    -------
    bool
        ``True`` when the maximum absolute pairwise correlation is strictly
        below ``threshold``. When no pair can be evaluated (fewer than two
        numeric columns, or only constant columns) the maximum is treated as
        ``0.0``, so the result is ``True`` for any positive threshold.
    """
    numeric = df.select_dtypes(include=["number", "bool"])

    if numeric.shape[1] < 2:
        # No pair of columns exists, hence nothing can be highly correlated.
        max_corr = 0.0
    else:
        corr_matrix = numeric.corr().abs()
        # Keep the strict upper triangle only: drops the diagonal (always 1)
        # and the mirrored duplicates below it.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        max_corr = upper.max().max()
        if pd.isna(max_corr):
            # Every pairwise correlation was undefined (e.g. constant columns);
            # without this, NaN < threshold would silently evaluate to False.
            max_corr = 0.0

    print("max_corr : ", max_corr)

    return bool(max_corr < threshold)

In [None]:
def encode_ordered_categories(df):
    """Return a copy of ``df`` with object-dtype columns turned into numbers.

    For each object-dtype column:
    * if every distinct value is a string made only of digits, the column is
      parsed with ``pd.to_numeric`` (unparseable entries become NaN);
    * otherwise each distinct value is replaced by its 1-based rank in order
      of first appearance.

    Columns of any other dtype are copied unchanged, and the input frame is
    not modified.
    """
    encoded = df.copy()
    object_columns = [name for name in df.columns if pd.api.types.is_object_dtype(df[name])]

    for name in object_columns:
        series = df[name]
        levels = series.unique()
        digits_only = all(isinstance(level, str) and level.isdigit() for level in levels)

        if digits_only:
            encoded[name] = pd.to_numeric(series, errors='coerce')
            continue

        # Codes start at 1 and follow the order in which values first occur.
        codes = dict(zip(levels, range(1, len(levels) + 1)))
        encoded[name] = series.map(codes)

    return encoded

In [None]:
def _30x_rule(df):
    """Apply the 30x rule: keep at most ``30 * number_of_columns`` complete rows.

    Rows containing any null value are dropped first. If more rows remain than
    the cap allows, a reproducible random sample (``random_state=42``) of
    exactly the cap size is drawn; otherwise all remaining rows are kept.
    The row count and the resulting frame are printed before it is returned.
    """
    row_cap = 30 * df.shape[1]

    # Only fully populated rows are eligible for training.
    complete_rows = df.dropna()

    if len(complete_rows) <= row_cap:
        print("num rows extracted (null): ",len(complete_rows))
    else:
        complete_rows = complete_rows.sample(n=row_cap, random_state=42)
        print("num rows extracted (non-null): ", row_cap)

    print(complete_rows)

    return complete_rows

In [None]:
def _json_safe_metric(metrics, key):
    """Fetch ``metrics[key]`` in a form ``JSONResponse`` can serialise.

    Missing keys and non-finite floats (NaN / inf) are returned as ``None``
    instead of raising ``KeyError`` or breaking JSON encoding.
    """
    value = metrics.get(key)
    if isinstance(value, (float, np.floating)):
        return float(value) if np.isfinite(value) else None
    return value


@app.post('/train')
async def train_model(file: UploadFile = File(...)):
    """Train an H2O AutoML model on an uploaded CSV and report the leader.

    Steps:
    1. Parse the upload as UTF-8 CSV and reduce it with ``_30x_rule``.
    2. Use the last column as the target and all other columns as features.
    3. Treat the task as classification when the target is non-numeric or has
       fewer than 10 distinct values; otherwise as regression.
    4. Add DeepLearning to GLM/GBM/XGBoost when ``determine_correlation``
       reports that the encoded features are not highly correlated.
    5. Run AutoML (``max_models=10``, ``seed=1``), save the leader under
       ``./models`` and return its path plus a few training metrics.

    Returns a ``JSONResponse`` with keys ``modelpath`` and ``model_details``.
    A 400 response with an ``error`` key is returned when the reduced data has
    no rows or fewer than two columns. Metrics that H2O does not report for
    the trained model, or that are NaN/inf, are returned as ``null``.

    NOTE(review): H2O parsing and training are blocking calls inside an
    ``async`` handler, so the server cannot serve other requests meanwhile.
    """
    csv_data = StringIO((await file.read()).decode('utf-8'))
    df = _30x_rule(pd.read_csv(csv_data))

    # Nothing to train on if every row had a null or there is no feature column.
    if df.empty or df.shape[1] < 2:
        return JSONResponse(
            status_code=400,
            content={'error': 'Training data needs at least one row without nulls and at least two columns.'},
        )

    h2o_df = h2o.H2OFrame(df)
    columns = list(h2o_df.columns)
    y = columns[-1]  # the target is, by convention, the last column
    x = columns[:-1]

    prob_type = "regression"
    target_is_numeric = h2o_df[y].isnumeric()[0]
    target_unique_values = h2o_df[y].unique().nrow
    if not target_is_numeric or target_unique_values < 10:
        prob_type = "classification"
        # AutoML only performs classification when the target is a factor.
        h2o_df[y] = h2o_df[y].asfactor()

    include_algos = ["GLM", "GBM", "XGBoost"]  # same for both problem types

    # The numeric encoding is used for the correlation check only; the model
    # itself is trained on the original H2O frame.
    df_encoded = encode_ordered_categories(df[x])
    include_deep_learning = determine_correlation(df_encoded[x])

    if include_deep_learning:
        include_algos.append("DeepLearning")
        print("Deep learning included")
    else:
        print("Deep learning excluded")

    aml = H2OAutoML(max_models=10, seed=1, include_algos=include_algos)
    aml.train(x=x, y=y, training_frame=h2o_df)
    print(aml.leaderboard)

    model = aml.leader
    model_path = h2o.save_model(model=model, path="./models", force=True)

    model_metrics = model.model_performance()._metric_json

    model_details = {
        'model_id': model.model_id,
        'model_type': model.algo,
        'model_path': model_path,
        'model_category': _json_safe_metric(model_metrics, 'model_category'),
    }
    if prob_type == "classification":
        metric_keys = ('AUC', 'logloss', 'MSE')
    else:  # regression
        metric_keys = ('MSE', 'RMSE')
    for key in metric_keys:
        model_details[key] = _json_safe_metric(model_metrics, key)

    return JSONResponse(content={'modelpath': model_path, 'model_details': model_details})

In [None]:
@app.post('/predict')
async def predict_model(modelpath: str = Form(...), file: UploadFile = File(...)):
    """Score an uploaded CSV with a previously saved H2O model.

    Parameters
    ----------
    modelpath : str (form field)
        Path of a model file saved under ``./models`` (the value returned as
        ``modelpath`` by the training route).
    file : UploadFile
        UTF-8 encoded CSV holding the rows to score.

    Returns
    -------
    JSONResponse
        One record per input row, as produced by ``model.predict``. Missing
        prediction values are emitted as ``null``. A 400 response is returned
        when ``modelpath`` resolves outside ``./models`` and a 404 when the
        file does not exist.
    """
    # modelpath comes from the client: resolve symlinks and ".." segments and
    # refuse anything outside the directory models are saved to, so the route
    # cannot be used to make H2O read arbitrary files on the server.
    models_root = os.path.realpath("./models")
    requested_path = os.path.realpath(modelpath)
    if not requested_path.startswith(models_root + os.sep):
        return JSONResponse(
            status_code=400,
            content={'error': 'modelpath must point to a model saved under ./models'},
        )
    if not os.path.isfile(requested_path):
        return JSONResponse(status_code=404, content={'error': 'model file not found'})

    csv_data = StringIO((await file.read()).decode('utf-8'))
    input_df = pd.read_csv(csv_data)
    h2o_input_df = h2o.H2OFrame(input_df)

    model = h2o.load_model(requested_path)

    predictions = model.predict(h2o_input_df)
    predictions_df = predictions.as_data_frame()

    # JSONResponse rejects NaN, so missing values are converted to None first.
    json_ready = predictions_df.astype(object).where(predictions_df.notna(), None)

    return JSONResponse(content=json_ready.to_dict(orient="records"))

In [None]:
# set up ngrok and run the app
# Open an ngrok tunnel to local port 8000 and show the public URL clients should call.
ngrok_tunnel = ngrok.connect(8000)
print('Public URL:', ngrok_tunnel.public_url)
# NOTE(review): nest_asyncio is presumably applied so uvicorn can run inside the
# notebook's already-running event loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()
# Blocks this cell while serving; the port must match the one tunnelled above.
uvicorn.run(app, port=8000)

Public URL: https://3270-34-106-63-144.ngrok-free.app


INFO:     Started server process [5438]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


num rows extracted (non-null):  540
          id  age  gender driving_experience    education         income  \
3265  384446    2       1             20-29y   university    upper class   
603   456321    2       0               0-9y         none        poverty   
9998  903459    1       0             10-19y  high school        poverty   
9984  443302    1       0             10-19y  high school   middle class   
4695  371790    1       0             10-19y  high school   middle class   
...      ...  ...     ...                ...          ...            ...   
6161  823356    0       1               0-9y   university  working class   
3929   86349    0       0               0-9y         none   middle class   
9416  518999    3       1               30y+   university   middle class   
4268  432147    1       0             10-19y  high school   middle class   
7426  171829    1       1             10-19y         none        poverty   

      credit_score  vehicle_ownership vehicle_year 

ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 399, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
    return await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 164, in __call__
    await self.app(scope, receive, _send)
  File "/usr/local/lib/python3.10/dis

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
INFO:     197.25.187.211:0 - "POST /predict HTTP/1.1" 200 OK





num rows extracted (non-null):  60
         x           y
158  100.0   96.623279
500   97.0   94.296334
397   12.0   14.558961
155   86.0   86.821321
322   91.0   94.367790
212  100.0  100.015803
235   65.0   58.372660
290   25.0   25.041692
301   34.0   36.596231
357   22.0   21.107946
673   61.0   60.022749
329   22.0   18.562414
199   51.0   47.577536
78     5.0    4.928546
599    2.0   -0.121861
570   32.0   35.003411
447   25.0   25.537528
507    0.0    1.406254
627   35.0   32.649976
604   43.0   41.022518
361   99.0  100.095370
339   17.0   13.249916
669   24.0   19.426375
291    1.0    3.778210
285    5.0    1.628826
332   74.0   73.596434
478   29.0   26.655663
54    14.0   17.706776
249   35.0   36.780350
224   78.0   79.105063
133   68.0   64.057102
641   14.0   10.582140
136   79.0   81.222592
109   56.0   57.872192
181   52.0   52.953055
433   76.0   73.138500
555   43.0   44.707864
483   23.0   21.024694
517   91.0   91.170129
132   27.0   21.531400
176    2.0   -1.053276


