## H2O Model Trainer
(max rows limit/threshold - categorical variables encoding - correlation matrix)

In this notebook, once a dataset is uploaded, rows containing missing values are dropped and the dataset is capped at a maximum number of rows (`limit_rows`) to avoid overfitting or very large datasets. Categorical variables are then converted to numeric codes (`encode_ordered_categories`) and the correlation matrix of the features is computed. If the largest absolute pairwise correlation is below a fixed threshold (`determine_correlation`), DeepLearning is included alongside the GLM, GBM and XGBoost algorithms.

In [None]:
!pip install fastapi nest-asyncio pyngrok uvicorn h2o

Collecting fastapi
  Downloading fastapi-0.111.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Collecting uvicorn
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h2o
  Downloading h2o-3.46.0.4.tar.gz (265.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.3/265.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting starlette<0.38.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi-cli>=0.0.2 (from fastapi)
  Downloading fastapi

In [None]:
# Saves the ngrok auth token to the local ngrok configuration file.
# 'YOUR_NGROK_AUTH_TOKEN' is a placeholder: substitute your own token before
# running, and never commit or share a notebook that contains a real one.
!ngrok authtoken 'YOUR_NGROK_AUTH_TOKEN'

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import numpy as np
from io import StringIO
import nest_asyncio
from pyngrok import ngrok
import uvicorn
import logging
import os

In [None]:
# Connect to an H2O instance on localhost, starting a local server if none is
# running. Must run before any H2OFrame / AutoML call below.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.23" 2024-04-16; OpenJDK Runtime Environment (build 11.0.23+9-post-Ubuntu-1ubuntu122.04.1); OpenJDK 64-Bit Server VM (build 11.0.23+9-post-Ubuntu-1ubuntu122.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpbj02f94k
  JVM stdout: /tmp/tmpbj02f94k/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpbj02f94k/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_unknownUser_s7ib6t
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [None]:
# Initialize FastAPI app
app = FastAPI()

# Fully permissive CORS: any origin, HTTP method and request header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True lets any
# website make credentialed cross-origin calls to this API. That is acceptable
# only for a throwaway demo; restrict allow_origins to the real front-end
# origin(s) before exposing this more widely — confirm what clients require.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [None]:
@app.get("/")
async def root():
    """Handle GET / by returning a fixed greeting string."""
    greeting = "Hello World!"
    return greeting

In [None]:
def determine_correlation(df, threshold=0.85):
    """Report whether the features are only weakly correlated with each other.

    The absolute pairwise correlation matrix of the numeric (and boolean)
    columns is computed, and its largest off-diagonal entry is compared with
    ``threshold``. The maximum is also printed.

    Args:
        df: pandas DataFrame of features. Columns that are neither numeric nor
            boolean are ignored, because ``DataFrame.corr`` cannot handle them.
        threshold: correlation level at or above which the features are
            considered highly correlated.

    Returns:
        bool: True when the largest absolute pairwise correlation is strictly
        below ``threshold``. Also True when no pairwise correlation can be
        computed (fewer than two usable columns, or only constant columns),
        since there is then no evidence of correlated features.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_df.corr().abs()

    # Keep only the strict upper triangle so each pair is counted once and the
    # diagonal (always 1.0) is excluded.
    pair_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    max_corr = corr_matrix.where(pair_mask).max().max()

    # NaN means "no pair to compare"; treat it as zero correlation instead of
    # letting the NaN comparison silently evaluate to False.
    if pd.isna(max_corr):
        max_corr = 0.0
    print("max_corr : ", max_corr)

    return bool(max_corr < threshold)

In [None]:
def encode_ordered_categories(df):
    """Return a copy of ``df`` in which object-dtype columns are made numeric.

    For each object-dtype column:
      * if every distinct value is a string of digits, the column is parsed
        with ``pd.to_numeric`` (unparseable entries become NaN);
      * otherwise each distinct value is replaced by an integer code starting
        at 1, assigned in order of first appearance in the column. The codes
        therefore reflect appearance order, not any semantic ordering.

    Columns of any other dtype are copied unchanged, and the input frame is
    not modified.
    """
    encoded = df.copy()
    object_cols = [name for name in df.columns
                   if pd.api.types.is_object_dtype(df[name])]

    for name in object_cols:
        levels = df[name].unique()
        digits_only = all(
            isinstance(level, str) and level.isdigit() for level in levels
        )

        if digits_only:
            encoded[name] = pd.to_numeric(df[name], errors='coerce')
            continue

        # First distinct value -> 1, second -> 2, and so on.
        codes = dict(zip(levels, range(1, len(levels) + 1)))
        encoded[name] = df[name].map(codes)

    return encoded

In [None]:
def limit_rows(df, max_rows=500):
    """Drop incomplete rows and cap the frame at ``max_rows`` rows.

    Args:
        df: pandas DataFrame to reduce.
        max_rows: maximum number of rows to keep.

    Returns:
        A DataFrame holding only rows without missing values. If more than
        ``max_rows`` such rows exist, a random sample of exactly ``max_rows``
        rows is returned; the fixed ``random_state`` makes it reproducible.
        The input frame is not modified.
    """
    df_non_null = df.dropna()

    if len(df_non_null) > max_rows:
        df_non_null = df_non_null.sample(n=max_rows, random_state=42)
        print("num rows extracted (non-null): ", max_rows)
    else:
        print("num rows: ", len(df_non_null))

    # The frame itself is deliberately not printed: this runs inside a request
    # handler, and dumping uploaded records into the server log leaks user data.
    return df_non_null

In [None]:
@app.post('/train')
async def train_model(file: UploadFile = File(...)):
    """Train an H2O AutoML model on an uploaded CSV and describe the leader.

    The upload is decoded as UTF-8 CSV and reduced with ``limit_rows``. The
    last column is the target and every other column is a predictor. The
    target is treated as categorical (classification) when it is non-numeric
    or has fewer than 10 distinct values; otherwise the task is regression.

    GLM, GBM and XGBoost are always candidates. DeepLearning is added when
    ``determine_correlation`` reports that the encoded predictors are not
    highly correlated. The encoded frame is used only for that check; AutoML
    trains on the unencoded H2O frame.

    Args:
        file: uploaded CSV file.

    Returns:
        JSONResponse with ``modelpath`` (where the leader model was saved) and
        ``model_details`` (id, algorithm, path, category and metrics). A 400
        response with an ``error`` message is returned when the CSV has no
        complete rows or fewer than two columns.
    """
    import math  # local import: only needed to sanitise metric values below

    csv_data = StringIO((await file.read()).decode('utf-8'))
    df = limit_rows(pd.read_csv(csv_data))

    # At least one usable row and one feature besides the target are required;
    # fail with a clear client error instead of an opaque H2O exception.
    if df.empty or df.shape[1] < 2:
        return JSONResponse(
            status_code=400,
            content={'error': 'CSV must contain at least one row without missing '
                              'values and at least two columns (features and target).'},
        )

    h2o_df = h2o.H2OFrame(df)
    x = h2o_df.columns
    y = x[-1]  # the last column is the target
    x.remove(y)

    prob_type = "regression"
    # for classification: non-numeric target, or numeric with few distinct values
    target_is_numeric = h2o_df[y].isnumeric()[0]
    target_unique_values = h2o_df[y].unique().nrow
    if (not target_is_numeric) or target_unique_values < 10:
        prob_type = "classification"
        h2o_df[y] = h2o_df[y].asfactor()

    include_algos = ["GLM", "GBM", "XGBoost"]

    # Numeric encoding is needed only to compute the correlation matrix.
    df_encoded = encode_ordered_categories(df[x])
    include_deep_learning = determine_correlation(df_encoded[x])

    if include_deep_learning:
        include_algos.append("DeepLearning")
        print("Deep learning included")
    else:
        print("Deep learning excluded")

    aml = H2OAutoML(max_models=10, seed=1, include_algos=include_algos)
    aml.train(x=x, y=y, training_frame=h2o_df)
    print(aml.leaderboard)

    model = aml.leader
    model_path = h2o.save_model(model=model, path="./models", force=True)

    # NOTE(review): no frame is passed to model_performance(), so these are
    # presumably training metrics — confirm this is what clients expect.
    model_metrics = model.model_performance()._metric_json

    model_details = {
        'model_id': model.model_id,
        'model_type': model.algo,
        'model_path': model_path,
        'model_category': model_metrics['model_category'],
    }

    if prob_type == "classification":
        metric_names = ('AUC', 'logloss', 'MSE')
    else:  # regression
        metric_names = ('MSE', 'RMSE')

    for metric_name in metric_names:
        # A missing metric becomes null rather than raising KeyError.
        value = model_metrics.get(metric_name)
        # JSONResponse cannot serialise NaN or infinity; report those as null.
        if isinstance(value, float) and not math.isfinite(value):
            value = None
        model_details[metric_name] = value

    return JSONResponse(content={'modelpath': model_path, 'model_details': model_details})

In [None]:
@app.post('/predict')
async def predict_model(modelpath: str = Form(...), file: UploadFile = File(...)):
    """Score an uploaded CSV with a previously saved H2O model.

    Args:
        modelpath: path of the saved model to load, sent as a form field.
        file: uploaded CSV, decoded as UTF-8, holding the rows to score.

    Returns:
        JSONResponse whose body is a list with one record (dict) per row of the
        prediction frame.
    """
    payload = await file.read()
    features_pd = pd.read_csv(StringIO(payload.decode('utf-8')))
    features_h2o = h2o.H2OFrame(features_pd)

    # NOTE(review): modelpath comes straight from the client and is passed to
    # h2o.load_model unchecked; consider restricting it to the directory the
    # training endpoint saves into before exposing this publicly.
    scorer = h2o.load_model(modelpath)

    scored = scorer.predict(features_h2o).as_data_frame()
    return JSONResponse(content=scored.to_dict(orient="records"))

In [None]:
# Set up ngrok and run the app
SERVER_PORT = 8000  # single source of truth for the tunnel and for uvicorn

# Patch asyncio so uvicorn can run inside the notebook's already-running loop.
nest_asyncio.apply()

ngrok_tunnel = ngrok.connect(SERVER_PORT)
print('Public URL:', ngrok_tunnel.public_url)
try:
    # Blocks until the server is stopped (e.g. by interrupting the kernel).
    uvicorn.run(app, port=SERVER_PORT)
finally:
    # Close the tunnel on the way out so the public URL does not stay exposed
    # and re-running this cell does not pile up stale tunnels.
    ngrok.disconnect(ngrok_tunnel.public_url)

Public URL: https://482c-35-197-51-101.ngrok-free.app


INFO:     Started server process [489]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


num rows extracted (non-null):  500
          id  age  gender driving_experience    education        income  \
3265  384446    2       1             20-29y   university   upper class   
603   456321    2       0               0-9y         none       poverty   
9998  903459    1       0             10-19y  high school       poverty   
9984  443302    1       0             10-19y  high school  middle class   
4695  371790    1       0             10-19y  high school  middle class   
...      ...  ...     ...                ...          ...           ...   
9465  487682    2       1             20-29y   university   upper class   
8814  783337    0       0               0-9y  high school  middle class   
493   881409    3       1               30y+   university   upper class   
8794  798069    2       1               0-9y   university   upper class   
3598  955335    0       1               0-9y         none       poverty   

      credit_score  vehicle_ownership vehicle_year  married  ch


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [489]


KeyboardInterrupt: 