type:Markdown
# Supply Chain Disruption Early Warning System
This notebook will fetch GDELT data, build features, train a model and log results to DagsHub and Neon.

In [4]:
print("Notebook successfully created — ready for pipeline code!")



Notebook successfully created — ready for pipeline code!


In [5]:
! pip install pandas==2.2.3 sqlalchemy==2.0.35 psycopg2-binary==2.9.9 scikit-learn==1.7.2 mlflow==3.5.1 requests==2.32.3


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas==2.2.3
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting sqlalchemy==2.0.35
  Downloading SQLAlchemy-2.0.35-cp312-cp312-win_amd64.whl.metadata (9.9 kB)
Collecting psycopg2-binary==2.9.9
  Downloading psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl.metadata (4.6 kB)
Collecting scikit-learn==1.7.2
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting mlflow==3.5.1
  Downloading mlflow-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.5.1 (from mlflow==3.5.1)
  Downloading mlflow_skinny-3.5.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.5.1 (from mlflow==3.5.1)
  Downloading mlflow_tracing-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow==3.5.1)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting Flask<4 (from mlflow==3.5.1)
  Downloading flask

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.38.0 requires protobuf<6,>=3.20, but you have protobuf 6.33.0 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\SEC\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [24]:
import os
import pandas as pd
import numpy as np
import requests
from sqlalchemy import create_engine
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import average_precision_score, brier_score_loss
import mlflow

# ---- configuration ----
# Replace with your own Neon + DagsHub values
DATABASE_URL = "postgresql+psycopg2://neondb_owner:npg_k5JYXPoFzM1N@ep-wandering-salad-a1c7264o-pooler.ap-southeast-1.aws.neon.tech/neondb?sslmode=require"
MLFLOW_URI   = "https://dagshub.com/Yuvakrishna0/Supplychainprediction.mlflow/"              # from DagsHub

# local MLflow tracking (fallback)
os.environ["MLFLOW_TRACKING_URI"] = MLFLOW_URI


In [14]:
import pandas as pd, numpy as np
from datetime import date, timedelta

print("Generating local synthetic GDELT-like dataset...")

# make 30 days of events
dates = pd.date_range(date.today() - timedelta(days=30), periods=30)
countries = ["USA", "IND", "CHN", "DEU", "BRA", "GBR", "JPN", "AUS"]

gdelt = pd.DataFrame({
    "SQLDATE": [d.strftime("%Y%m%d") for d in dates],
    "AvgTone": np.random.normal(0, 3, len(dates)),       # sentiment tone
    "ActionGeo_CountryCode": np.random.choice(countries, len(dates))
})

gdelt["event_time"] = pd.to_datetime(gdelt["SQLDATE"], format="%Y%m%d")
print("✅ Synthetic data created:", gdelt.shape)
display(gdelt.head())


Generating local synthetic GDELT-like dataset...
✅ Synthetic data created: (30, 4)


Unnamed: 0,SQLDATE,AvgTone,ActionGeo_CountryCode,event_time
0,20251007,-2.751545,IND,2025-10-07
1,20251008,2.025211,CHN,2025-10-08
2,20251009,-0.341316,CHN,2025-10-09
3,20251010,1.932428,JPN,2025-10-10
4,20251011,-0.800994,CHN,2025-10-11


In [15]:
# aggregate tone by date
gdelt["ds"] = gdelt["event_time"].dt.date
features = (
    gdelt.groupby("ds", as_index=False)
         .agg(gdelt_tone_mean=("AvgTone","mean"),
              gdelt_tone_std=("AvgTone","std"))
)

# simulate a simple 'congestion' proxy
np.random.seed(42)
features["port_congestion_z"] = np.random.normal(0, 1, len(features))

# label: random high congestion as disruptions
threshold = 1.2
features["label_bottleneck_next_14d"] = (features["port_congestion_z"] > threshold).astype(int)

features.tail()


Unnamed: 0,ds,gdelt_tone_mean,gdelt_tone_std,port_congestion_z,label_bottleneck_next_14d
25,2025-11-01,-0.088389,,0.110923,0
26,2025-11-02,1.116156,,-1.150994,0
27,2025-11-03,4.328073,,0.375698,0
28,2025-11-04,-4.814605,,-0.600639,0
29,2025-11-05,2.998178,,-0.291694,0


In [21]:
mlflow.set_tracking_uri(MLFLOW_URI)
import mlflow
import os

# Your DagsHub credentials (replace with your actual username + token)
DAGSHUB_USERNAME = "Yuvakrishna0"  # <-- your DagsHub username
DAGSHUB_TOKEN = "b6507130bc00540377098c15b7a707b463896e35"  # from DagsHub settings

os.environ["MLFLOW_TRACKING_USERNAME"] = DAGSHUB_USERNAME
os.environ["MLFLOW_TRACKING_PASSWORD"] = DAGSHUB_TOKEN

# Now connect to your DagsHub MLflow server
mlflow.set_tracking_uri(f"https://dagshub.com/{DAGSHUB_USERNAME}/Supplychainprediction.mlflow")
mlflow.set_experiment("Supplychainprediction")

print("✅ Connected to DagsHub MLflow successfully.")


2025/11/06 19:14:29 INFO mlflow.tracking.fluent: Experiment with name 'Supplychainprediction' does not exist. Creating a new experiment.


✅ Connected to DagsHub MLflow successfully.


In [25]:
engine = create_engine(DATABASE_URL)
features.to_sql("features_supply_risk_daily", engine, if_exists="replace", index=False)
print("Uploaded features to Neon DB successfully ✅")


Uploaded features to Neon DB successfully ✅


In [27]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import average_precision_score, brier_score_loss
import numpy as np

X = features[["gdelt_tone_mean","gdelt_tone_std","port_congestion_z"]].values
y = features["label_bottleneck_next_14d"].values

tscv = TimeSeriesSplit(n_splits=3)
pr_aucs, briers = [], []

for tr, te in tscv.split(X):
    model = HistGradientBoostingClassifier(learning_rate=0.06)
    model.fit(X[tr], y[tr])
    p = model.predict_proba(X[te])[:,1]
    pr_aucs.append(average_precision_score(y[te], p))
    briers.append(brier_score_loss(y[te], p))

print("✅ Model trained successfully.")
print("Mean PR-AUC:", np.mean(pr_aucs))
print("Mean Brier score:", np.mean(briers))


✅ Model trained successfully.
Mean PR-AUC: 0.047619047619047616
Mean Brier score: 0.06305460190213918




In [28]:
p = model.predict_proba(X)[:,1]
out = pd.DataFrame({
    "ds": features["ds"],
    "risk_p": p,
    "expected_wait": np.clip(p*30, 4, 60)
})
out.to_sql("scores_daily", engine, if_exists="replace", index=False)
print("✅ Stored scores for dashboard!")


✅ Stored scores for dashboard!


In [29]:
pd.read_sql("SELECT * FROM scores_daily LIMIT 5;", engine)


Unnamed: 0,ds,risk_p,expected_wait
0,2025-10-07,0.130435,4.0
1,2025-10-08,0.130435,4.0
2,2025-10-09,0.130435,4.0
3,2025-10-10,0.130435,4.0
4,2025-10-11,0.130435,4.0


In [30]:
MLFLOW_URI="https://dagshub.com/Yuvakrishna0/Supplychainprediction.mlflow/"

In [32]:
from sqlalchemy import create_engine, text

DATABASE_URL = "postgresql+psycopg2://neondb_owner:npg_k5JYXPoFzM1N@ep-wandering-salad-a1c7264o-pooler.ap-southeast-1.aws.neon.tech/neondb?sslmode=require"

engine = create_engine(DATABASE_URL)

with engine.connect() as conn:
    result = conn.execute(text("SELECT NOW();"))
    print(result.fetchone())


(datetime.datetime(2025, 11, 8, 12, 28, 18, 95017, tzinfo=datetime.timezone.utc),)


In [33]:
out.to_sql("scores_daily", engine, if_exists="replace", index=False)
print("✅ Stored scores for dashboard!")


✅ Stored scores for dashboard!


In [None]:
out.to_sql("scores_daily", engine, if_exists="replace", index=False)
print("✅ Stored scores for dashboard!")


In [1]:
from sqlalchemy import create_engine, text

DATABASE_URL = "postgresql+psycopg2://neondb_owner:npg_k5JYXPoFzM1N@ep-wandering-salad-a1c7264o-pooler.ap-southeast-1.aws.neon.tech/neondb?sslmode=require"
engine = create_engine(DATABASE_URL)

with engine.connect() as conn:
    result = conn.execute(text("SELECT table_name FROM information_schema.tables WHERE table_schema='public';"))
    print([r[0] for r in result])


['features_supply_risk_daily', 'scores_daily']


In [2]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime, timedelta
import numpy as np

# Connect to your Neon database
DATABASE_URL = "postgresql+psycopg2://neondb_owner:npg_k5JYXPoFzM1N@ep-wandering-salad-a1c7264o-pooler.ap-southeast-1.aws.neon.tech/neondb?sslmode=require"
engine = create_engine(DATABASE_URL)

# --- Generate realistic sample data for last 14 days ---
today = datetime.utcnow().date()
countries = ["USA", "IND", "CHN", "DEU", "BRA", "GBR", "AUS", "ZAF", "CAN"]
rows = []

for c in countries:
    for d in range(14):
        rows.append({
            "ActionGeo_CountryCode": c,
            "event_time": today - timedelta(days=d),
            "AvgTone": np.random.normal(0, 3),  # geopolitical sentiment
        })

df = pd.DataFrame(rows)

# --- Write it to Neon ---
df.to_sql("raw_events", engine, if_exists="replace", index=False)
print(f"✅ raw_events table created with {len(df)} rows")


  today = datetime.utcnow().date()


✅ raw_events table created with 126 rows


In [3]:
from sqlalchemy import text

with engine.connect() as conn:
    tables = conn.execute(text("SELECT table_name FROM information_schema.tables WHERE table_schema='public';"))
    print([r[0] for r in tables])


['features_supply_risk_daily', 'scores_daily', 'raw_events']
