<a href="https://colab.research.google.com/github/alikaiser12/AI/blob/main/n8n_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the following cells can read and
# write files under /content/drive/MyDrive/...
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# ================================
# One-Cell Colab Script (ERP/Alloys)
# ================================
# 1) Upload your CSV to Colab working dir as 'merged_alloy_features.csv'
#    or mount Drive and point DATA_CSV to its path.
# 2) Run this single cell.
# 3) Optional: set USE_API=True + NGROK_AUTH_TOKEN to expose an HTTP endpoint.
# --------------------------------

# (a) Imports & Config
import sys, os, math, json, textwrap
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

# Location of the merged alloy feature table. This is a Google Drive path, so
# Drive has to be mounted before this cell runs.
DATA_CSV = '/content/drive/MyDrive/n8n/merged_alloy_features.csv'

# Regression target, plus the identifier/label columns that must never be fed
# to the model as inputs.
TARGET = 'YS (MPa)'
non_feature_cols = ['ID', 'Alloy_x', 'Alloy_y', TARGET]

# (b) Load data
df = pd.read_csv(DATA_CSV)

# Fail early, listing the available columns, when the target is absent.
if TARGET not in df.columns:
    raise ValueError(f"Couldn't find target column '{TARGET}'. Found columns: {list(df.columns)}")

# Rows without a measured target are useless for training and evaluation.
# The original row labels are kept (the index is not reset).
df = df.dropna(subset=[TARGET])

# Candidate inputs are every remaining column; only the numeric ones are
# handed to the model (num_features drives the ColumnTransformer below).
X_full = df.drop(columns=non_feature_cols, errors='ignore')
num_features = list(X_full.select_dtypes(include=[np.number]).columns)
if not num_features:
    raise ValueError("No numeric features found to train on. Please ensure your CSV has numeric columns besides the target.")

y_full = df[TARGET].astype(float)

# (c) 90/10 split. df.index is split alongside X and y so the exact train/test
# rows can be written back out later.
(X_train, X_test,
 y_train, y_test,
 train_idx, test_idx) = train_test_split(
    X_full, y_full, df.index,
    test_size=0.10, random_state=42, shuffle=True,
)

# (d) Model: median-impute the numeric columns, then fit a random forest.
# Any column not listed in num_features is dropped by the transformer.
median_imputer = SimpleImputer(strategy='median')
preprocess = ColumnTransformer(
    transformers=[('num', median_imputer, num_features)],
    remainder='drop',
)
model = RandomForestRegressor(n_estimators=600, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[('prep', preprocess), ('rf', model)])

# (e) Train on the 90% split
pipe.fit(X_train, y_train)

# (f) Evaluate on the held-out 10%
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is derived from the MSE by hand so older scikit-learn releases work too.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"[Test Metrics]  MAE = {mae:.2f} MPa | RMSE = {rmse:.2f} MPa | R^2 = {r2:.3f}")

# (g) Persist the two splits (full original rows, selected by index label) and
# the fitted pipeline. The CSVs go to Drive; the model file uses a relative
# path, i.e. it is written to the current working directory.
train_csv_path = '/content/drive/MyDrive/n8n/merged_alloy_features_train_90.csv'
test_csv_path  = '/content/drive/MyDrive/n8n/merged_alloy_features_test_10.csv'
for split_path, split_labels in ((train_csv_path, train_idx), (test_csv_path, test_idx)):
    df.loc[split_labels].to_csv(split_path, index=False)
joblib.dump(pipe, 'ys_rf_model.joblib')
print(f"Saved: {train_csv_path}, {test_csv_path}, ys_rf_model.joblib")

# (h) Helper: recommend compositions for target yield strength
def recommend_compositions_for_yield(target_ys, top_k=5):
    """
    Return the top_k dataset rows whose MODEL-PREDICTED yield strength is
    closest to target_ys.

    This is a practical inverse mapping: every row of the design set is scored
    by the fitted pipeline and the rows are ranked by |prediction - target|.

    Relies on the module-level ``pipe``, ``df``, ``X_full``, ``num_features``
    and ``TARGET`` defined earlier in this cell.

    Parameters
    ----------
    target_ys : float-like
        Desired yield strength, in the same unit as the target column.
    top_k : int, default 5
        Number of candidates to return; must be >= 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``idx``, ``Alloy``, ``Alloy_alt``, ``YS_actual``, ``YS_pred``
        and ``abs_diff_to_target``, sorted by increasing distance to the
        target and indexed by the original ``df`` row labels.

    Raises
    ------
    ValueError
        If ``top_k`` is negative (``DataFrame.head`` would otherwise silently
        return "all but the last k" rows).
    """
    top_k = int(top_k)
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    target_ys = float(target_ys)

    def _label_values(column):
        # Positional copy of a label column, or blanks when it is absent.
        # Plain arrays are used on purpose: a placeholder Series would carry a
        # fresh 0..n-1 index that no longer matches df.index once rows have
        # been dropped, and the DataFrame constructor would then try to align
        # on the union of both indexes.
        if column in df.columns:
            return df[column].to_numpy()
        return np.full(len(df), '', dtype=object)

    preds_all = pipe.predict(X_full[num_features])
    rec_df = pd.DataFrame(
        {
            'idx': df.index.to_numpy(),
            'Alloy': _label_values('Alloy_x'),
            'Alloy_alt': _label_values('Alloy_y'),
            'YS_actual': df[TARGET].to_numpy(),
            'YS_pred': preds_all,
        },
        index=df.index,
    )
    rec_df['abs_diff_to_target'] = np.abs(rec_df['YS_pred'] - target_ys)
    cols_to_show = ['idx', 'Alloy', 'Alloy_alt', 'YS_actual', 'YS_pred', 'abs_diff_to_target']
    # A stable sort keeps tied candidates in dataset order, so repeated calls
    # return the same ranking.
    ranked = rec_df.sort_values('abs_diff_to_target', kind='stable')
    return ranked.head(top_k)[cols_to_show]

# Example call (prints 5 nearest candidates for 900 MPa target)
print("\nExample recommendations for target 900 MPa:")
print(recommend_compositions_for_yield(900.0, top_k=5).to_string(index=False))

# (i) Optional: expose an API for n8n via FastAPI + ngrok
USE_API = False            # <- set True to run an HTTP API
NGROK_AUTH_TOKEN = ""      # <- paste your NGROK token if you have one (https://dashboard.ngrok.com/)

if USE_API:
    # Install libs only if using API
    import subprocess, sys
    subprocess.run([sys.executable, "-m", "pip", "install", "fastapi", "uvicorn", "pyngrok", "-q"], check=True)

    from fastapi import FastAPI
    from pydantic import BaseModel
    from pyngrok import ngrok
    import uvicorn
    import threading, time

    class Query(BaseModel):
        """Request body for POST /recommend."""
        target_ys: float  # desired yield strength
        top_k: int = 5    # number of candidates to return

    app = FastAPI()

    @app.get("/")
    def root():
        """Health check that also describes how to call the API."""
        return {
            "status": "ok",
            "message": "POST /recommend with {'target_ys': float, 'top_k': int}"
        }

    @app.post("/recommend")
    def recommend(q: Query):
        """Return the top_k candidates closest to q.target_ys as JSON records."""
        result = recommend_compositions_for_yield(q.target_ys, q.top_k).to_dict(orient='records')
        return {"target_ys": q.target_ys, "recommendations": result}

    # Start ngrok tunnel (the auth token is optional)
    if NGROK_AUTH_TOKEN:
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)
    public_url = ngrok.connect(8000, "http")
    print("Public URL for API:", public_url)

    # Run Uvicorn in a worker thread. A notebook kernel (Colab/Jupyter) already
    # runs an asyncio event loop on the main thread, so calling uvicorn.run()
    # there fails with "asyncio.run() cannot be called from a running event
    # loop". A separate thread has no running loop, so the server starts.
    server = uvicorn.Server(uvicorn.Config(app, host="0.0.0.0", port=8000))
    server_thread = threading.Thread(target=server.run, name="uvicorn-server", daemon=True)
    server_thread.start()

    # Keep the cell blocking, as before, until the user interrupts it; then ask
    # the server to shut down so port 8000 is released for the next run.
    try:
        server_thread.join()
    except KeyboardInterrupt:
        server.should_exit = True
        server_thread.join()

[Test Metrics]  MAE = 208.31 MPa | RMSE = 257.12 MPa | R^2 = 0.347
Saved: /content/drive/MyDrive/n8n/merged_alloy_features_train_90.csv, /content/drive/MyDrive/n8n/merged_alloy_features_test_10.csv, ys_rf_model.joblib

Example recommendations for target 900 MPa:
 idx                  Alloy           Alloy_alt  YS_actual    YS_pred  abs_diff_to_target
 139 Al1.125CuFe0.75NiTi1.1 AlCrFeNiMo0.5Ti0.75      618.8 902.411000            2.411000
 133             CrCuFeMoNi    Cu30Mn30Ni30Sn11      630.0 897.305000            2.695000
   1           CoFeNiSi0.25        Al0.75CoFeNi      794.0 939.460833           39.460833
  31     AlCo1.5CrFeMo0.5Ni     CoCrCuFeNiTi0.5      700.0 951.374333           51.374333
  61       Al0.75CoCrFeMnNi           AlMoTaTiV      735.0 954.726500           54.726500


In [None]:
# ================================
# One-Cell "Prompt Bot" for YS -> Composition Recommendations
# ================================
# What it does:
# - (Optional) Mount Google Drive
# - Load your dataset (expects 'YS (MPa)' target)
# - Train a RandomForestRegressor (with numeric feature preprocessing)
# - Provide a prompt-driven chat loop: ask "What is the chemical composition for 150 YS?"
# - Returns top-K candidate alloys/compositions whose predicted YS is closest to your target.
#
# Notes:
# - Mapping from a single property (YS) to a *unique* composition is under-determined.
#   This bot returns the best-matching candidates from your dataset according to the model.
# - You can change TOP_K, model hyperparams, or add filtering rules (e.g., only certain families).
# ================================

# ---------- User Configuration ----------
# USE_DRIVE: when True, this cell mounts Google Drive itself (see the
# "(Optional) Mount Google Drive" step below) before reading DATA_CSV.
USE_DRIVE = False  # Set True if your CSV is in Google Drive
# NOTE(review): the default DATA_CSV below is a Google Drive path, so with
# USE_DRIVE = False it only resolves if Drive was already mounted (e.g. by an
# earlier cell). Otherwise set USE_DRIVE = True, or point DATA_CSV at a file
# uploaded through Colab's Files pane.
DATA_CSV  = "/content/drive/MyDrive/n8n/merged_alloy_features.csv"  # Path of the CSV to load.
TOP_K     = 5      # How many candidate compositions to return per question
RANDOM_STATE = 42  # Seed shared by the train/test split and the random forest

# ---------- Imports ----------
import re, textwrap, sys, math, json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------- (Optional) Mount Google Drive ----------
# Only needed when DATA_CSV lives on Drive and Drive is not mounted yet.
if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')

# ---------- Load Data ----------
df = pd.read_csv(DATA_CSV)

# The target column is mandatory; report what was found if it is missing.
TARGET = "YS (MPa)"
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Found columns: {list(df.columns)}")

# Rows without a target value cannot be used. The original row labels are
# kept (the index is not reset).
df = df.dropna(subset=[TARGET])

# Human-readable label columns, in order of preference, used only for output.
alloy_cols = [
    label
    for label in ("Alloy_x", "Alloy_y", "Composition", "Alloy", "Formula")
    if label in df.columns
]
if not alloy_cols:
    # Not fatal: candidates are then reported by row index only.
    print("Warning: No obvious composition/alloy label column (e.g., 'Alloy_x') found. "
          "Results will include row indices only.")

# Feature matrix: numeric columns only, minus the target, the ID and the labels.
non_feature_cols = {TARGET, "ID", *alloy_cols}
excluded = [col for col in df.columns if col in non_feature_cols]
X_full = df.drop(columns=excluded).select_dtypes(include=[np.number])
num_features = list(X_full.columns)
if not num_features:
    raise ValueError("No numeric features available for training after excluding target/alloy columns.")

y_full = df[TARGET].astype(float)

# ---------- Train/Test split (90/10) ----------
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=0.10,
    random_state=RANDOM_STATE,
    shuffle=True,
)

# ---------- Build model pipeline ----------
# Median imputation fills missing numeric values; the random forest is the
# baseline regressor. Columns outside num_features are dropped.
median_imputer = SimpleImputer(strategy="median")
preprocess = ColumnTransformer(
    transformers=[("num", median_imputer, num_features)],
    remainder="drop",
)
model = RandomForestRegressor(n_estimators=600, random_state=RANDOM_STATE, n_jobs=-1)
pipe = Pipeline(steps=[("prep", preprocess), ("rf", model)])

# ---------- Train ----------
pipe.fit(X_train, y_train)

# ---------- Evaluate on the held-out 10% ----------
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is derived from the MSE by hand so older scikit-learn releases work too.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"[Model Performance on 10% Test]  MAE={mae:.2f} MPa | RMSE={rmse:.2f} MPa | R²={r2:.3f}")

# ---------- Helper: Recommend compositions for a target YS ----------
def recommend_compositions_for_yield(target_ys, top_k=TOP_K):
    """
    Return the top_k dataset rows whose MODEL-PREDICTED yield strength is
    closest to target_ys.

    This is a practical inverse mapping: every row already present in the
    data is scored by the fitted pipeline and ranked by |prediction - target|.

    Relies on the module-level ``pipe``, ``df``, ``X_full``, ``y_full``,
    ``num_features`` and ``alloy_cols`` defined earlier in this cell.

    Parameters
    ----------
    target_ys : float-like
        Desired yield strength, in the same unit as the target column.
    top_k : int, default TOP_K
        Number of candidates to return; must be >= 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``idx``, the available alloy label columns, ``YS_actual``,
        ``YS_pred`` and ``abs_diff_to_target``; sorted by increasing distance
        to the target, with a fresh 0..k-1 index.

    Raises
    ------
    ValueError
        If ``top_k`` is negative (``DataFrame.head`` would otherwise silently
        return "all but the last k" rows).
    """
    top_k = int(top_k)
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    target_ys = float(target_ys)

    preds_all = pipe.predict(X_full[num_features])

    # Every column is added as a plain array, i.e. positionally. Assigning the
    # df[c] Series directly would align on index labels, and df keeps its
    # original (gappy) labels after rows with a missing target were dropped,
    # so names would be attached to the wrong rows or turn into NaN.
    columns = {"idx": df.index.to_numpy()}
    for c in alloy_cols:
        columns[c] = df[c].to_numpy()
    columns["YS_actual"] = y_full.to_numpy()
    columns["YS_pred"] = preds_all
    columns["abs_diff_to_target"] = np.abs(preds_all - target_ys)
    out = pd.DataFrame(columns)

    # A stable sort keeps tied candidates in dataset order, so repeated calls
    # return the same ranking.
    ranked = out.sort_values("abs_diff_to_target", kind="stable")
    return ranked.head(top_k).reset_index(drop=True)

# ---------- Helper: Parse a natural-language prompt ----------
# First number in a prompt. Two details beyond a plain "digits" pattern:
#  - a sign only counts when it is not glued to a preceding letter/digit, so
#    the hyphen in "YS-150" is read as a separator rather than a minus sign;
#  - comma-grouped thousands ("1,200" / "1,200.5") are read as one number.
_YS_NUMBER_RE = re.compile(
    r"(?:(?<![A-Za-z0-9])[-+])?"
    r"(?:\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d*\.?\d+)"
)


def parse_target_ys_from_text(text):
    """
    Extract the *first* number found in the text as target YS (MPa).

    Accepts forms like '150', '150 MPa', 'YS=150', 'YS-150', '.5' and
    comma-grouped thousands such as '1,200 MPa'. Digits embedded in other
    tokens still count: for 'Al0.5CoCr 900' the result is 0.5, because it is
    the first number in the text.

    Parameters
    ----------
    text : str
        Free-form user prompt.

    Returns
    -------
    float
        The parsed target value.

    Raises
    ------
    ValueError
        If the text contains no number.
    """
    match = _YS_NUMBER_RE.search(text)
    if not match:
        raise ValueError("No numeric value found in the prompt.")
    # Thousands separators are not understood by float(); strip them first.
    return float(match.group(0).replace(",", ""))

# ---------- Helper: Compose a friendly answer string ----------
def format_recommendation_answer(target_ys, rec_df):
    """
    Render the recommendation table as a human-readable, multi-line answer.

    The first line states the target. It is followed either by
    "No candidates found." (empty table) or by one bullet per candidate and a
    closing note about the limits of this inverse mapping.

    Each candidate is labelled with its 'Alloy_x' value when that is present
    and non-blank, otherwise with the first non-blank value among the
    module-level ``alloy_cols``, otherwise with "Row #<idx>".
    """
    header = f"Target yield strength: {target_ys:.2f} MPa"
    if rec_df.empty:
        return "\n".join([header, "No candidates found."])

    def _has_text(value):
        # Usable label: not NaN/None and not just whitespace.
        return pd.notna(value) and str(value).strip()

    def _display_name(record):
        # Preferred label first, then the generic label columns, then the index.
        if "Alloy_x" in rec_df.columns and _has_text(record.get("Alloy_x", "")):
            return str(record["Alloy_x"])
        for col in alloy_cols:
            candidate = record.get(col, "")
            if _has_text(candidate):
                return str(candidate)
        return f"Row #{int(record['idx'])}"

    answer = [
        header,
        f"Top {len(rec_df)} candidate composition(s) (closest predicted YS):",
    ]
    for _, record in rec_df.iterrows():
        label = _display_name(record)
        actual = record.get("YS_actual", np.nan)
        predicted = record.get("YS_pred", np.nan)
        delta = record.get("abs_diff_to_target", np.nan)
        answer.append(
            f"  • {label}: predicted {predicted:.2f} MPa "
            f"(actual {actual:.2f} MPa), Δ={delta:.2f} MPa"
        )

    # The leading newline leaves a blank line between the bullets and the note.
    answer.append("\nNote: inverse design from a single property is under-determined; "
                  "this ranks *existing* dataset candidates by predicted closeness.")
    return "\n".join(answer)

# ---------- Chat loop (the "bot") ----------
# Type questions like:
#   "What is the chemical composition for 150 YS?"
#   "composition for 900"
#   "YS = 1200 MPa"
# Type 'quit' or 'exit' to stop.
print("\n--- YS→Composition Bot ---")
print("Ask me: e.g., 'What is the chemical composition for 150 YS?'\n(Type 'quit' to exit)\n")

# Read-answer loop: one prompt per iteration until the user quits or the
# input stream ends.
while True:
    try:
        user_text = input("You: ").strip()
    except EOFError:
        # No more input available; leave without a farewell.
        break

    if user_text.lower() in {"quit", "exit", "q"}:
        print("Bot: Goodbye!")
        break

    # Blank lines are ignored and simply prompt again.
    if user_text:
        try:
            target = parse_target_ys_from_text(user_text)
            rec = recommend_compositions_for_yield(target, top_k=TOP_K)
            answer = format_recommendation_answer(target, rec)
            print("Bot:\n" + answer + "\n")
        except Exception as err:
            # Any failure (no number in the prompt, model error, ...) is
            # reported to the user and the conversation continues.
            print(f"Bot: Sorry, I couldn't process that. ({err})\n")

[Model Performance on 10% Test]  MAE=208.31 MPa | RMSE=257.12 MPa | R²=0.347

--- YS→Composition Bot ---
Ask me: e.g., 'What is the chemical composition for 150 YS?'
(Type 'quit' to exit)

Bot:
Target yield strength: 150.00 MPa
Top 5 candidate composition(s) (closest predicted YS):
  • Al0.56CoCrFeMnNi: predicted 732.87 MPa (actual 500.00 MPa), Δ=582.87 MPa
  • CoCuFeMnNiSn0.2: predicted 733.78 MPa (actual 515.00 MPa), Δ=583.78 MPa
  • Al0.8CoCrCuFe: predicted 748.85 MPa (actual 540.00 MPa), Δ=598.85 MPa
  • CrCuFeMoNi: predicted 897.30 MPa (actual 630.00 MPa), Δ=747.30 MPa
  • Al1.125CuFe0.75NiTi1.1: predicted 902.41 MPa (actual 618.80 MPa), Δ=752.41 MPa

Note: inverse design from a single property is under-determined; this ranks *existing* dataset candidates by predicted closeness.

