# Universal Supply Chain ML Notebook

*Generated: 2025-11-12T12:12:29.324189 UTC*

This notebook automatically ingests **any CSV dataset**, suggests a sensible target variable, preprocesses the data, trains a Random Forest model (regressor or classifier, depending on the target), evaluates it, and saves the model. Designed for Colab / Jupyter.

**How to use:** Upload a CSV via the file uploader cell, or mount Google Drive and set `DATA_PATH`.


In [1]:
# Install common packages (uncomment if needed in Colab)
# !pip install -q scikit-learn pandas matplotlib seaborn xgboost tensorflow joblib

import os, sys, math, random, warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Silence every Python warning for the rest of the session. Note that this also
# hides deprecation and data-quality warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Confirmation that the import cell ran to completion.
print('Imports done.')

Imports done.


In [None]:
# ====== Dataset upload / path ======
# DATA_PATH is resolved in this order:
#   1. a file uploaded through the Colab widget (only when google.colab is importable),
#   2. a DATA_PATH value that already exists in the kernel, so that a manual
#      assignment survives re-running this cell (as the message below instructs),
#   3. the alphabetically first CSV found in the working directory.
_preset_path = globals().get('DATA_PATH')
DATA_PATH = None  # always bound, even if the upload is cancelled

# Option A: Use Colab file upload (works in Colab)
try:
    from google.colab import files
    uploaded = files.upload()
    if uploaded:
        DATA_PATH = next(iter(uploaded))
        print('Uploaded:', DATA_PATH)
except ImportError:
    pass  # not running in Colab; fall through to the local options
except Exception as err:
    # Best effort: report the problem and keep going with the local options.
    print('Colab upload failed:', err)

# Option B: If running locally, set DATA_PATH to your CSV
if DATA_PATH is None and _preset_path is not None:
    DATA_PATH = _preset_path
    print('Using previously set DATA_PATH:', DATA_PATH)

if DATA_PATH is None:
    # Try CSV files in the working dir; sorted so the pick is deterministic
    # (os.listdir returns entries in arbitrary order).
    candidates = sorted(p for p in os.listdir('.') if p.lower().endswith('.csv'))
    if candidates:
        print('Found CSVs in working dir:', candidates)
        DATA_PATH = candidates[0]
    else:
        print('No uploaded CSV detected. Please set DATA_PATH = "your_file.csv" and run this cell again.')

DATA_PATH

In [None]:
# ====== Load dataset ======
# Stop early with a readable message when no CSV path has been resolved.
if DATA_PATH is None:
    raise AssertionError('Set DATA_PATH to your CSV file path.')

# Read the whole CSV into memory and preview the first rows.
df = pd.read_csv(DATA_PATH)
print('Loaded dataset with shape:', df.shape)
df.head()

In [None]:
# ====== Inspect and suggest target column ======
# To force a specific target, set TARGET_OVERRIDE to a column name and re-run this
# cell. Leaving it as None keeps the automatic suggestion computed below.
TARGET_OVERRIDE = None

print('Columns:')
for i, c in enumerate(df.columns, start=1):
    print(i, c)

# Heuristics to guess target columns (numeric columns commonly used)
num_cols = df.select_dtypes(include=['number']).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
print('\nNumeric columns candidates (for regression targets):', num_cols)
print('Categorical columns candidates (for classification targets):', cat_cols)

# Common target name hints (matched as case-insensitive substrings of the column name)
hints = ['sales','demand','quantity','units','price','cost','lead_time','delay','on_time','profit','revenue','consumption']
possible_targets = [c for c in df.columns if any(h in c.lower() for h in hints)]
print('\nAuto-detected target candidates by name:', possible_targets)

TARGET = None
if TARGET_OVERRIDE is not None:
    # An explicit choice wins, but it must name a real column.
    if TARGET_OVERRIDE not in df.columns:
        raise KeyError(f'TARGET_OVERRIDE {TARGET_OVERRIDE!r} is not a column of df.')
    TARGET = TARGET_OVERRIDE
else:
    # 1) Prefer the first name-hinted column that is numeric.
    TARGET = next((t for t in possible_targets if t in num_cols), None)

    # 2) Otherwise pick the numeric column with the highest variance, skipping
    #    id-like columns: integer columns whose values are all distinct (row ids,
    #    order numbers, ...) usually have the largest variance but are not targets.
    if TARGET is None and num_cols:
        id_like = [
            c for c in num_cols
            if pd.api.types.is_integer_dtype(df[c]) and df[c].nunique() == len(df)
        ]
        pool = [c for c in num_cols if c not in id_like] or num_cols
        # Variance is NaN for all-missing or single-row columns; those cannot be ranked.
        variances = df[pool].var().dropna()
        if not variances.empty:
            TARGET = variances.idxmax()

print('Suggested TARGET:', TARGET)

In [None]:
# ====== Basic cleaning & preprocessing ======
# The target column is exempt from every filter below; dropping it would make the
# "Prepare X and y" cell fail with a KeyError on df[TARGET].
_protected = {TARGET} if 'TARGET' in globals() and TARGET is not None else set()

# 1) Drop columns that have too many missing values
THRESH_MISSING_RATIO = 0.6
missing_ratio = df.isna().mean()
drop_cols = [
    c for c in missing_ratio[missing_ratio > THRESH_MISSING_RATIO].index
    if c not in _protected
]
print('Dropping high-missing columns:', drop_cols)
df = df.drop(columns=drop_cols)

# 2) Parse dates if any: an object column counts as a date column when more than
#    DATE_PARSE_MIN_RATIO of its rows parse successfully.
DATE_PARSE_MIN_RATIO = 0.3
parsed_date_cols = []
for c in list(df.columns):
    if df[c].dtype != 'object' or c in _protected:
        continue  # the target is skipped so no feature is derived from it
    try:
        parsed = pd.to_datetime(df[c], errors='coerce')
    except Exception as err:
        print('Skipping date parsing for', c, '-', err)
        continue
    is_datetime = pd.api.types.is_datetime64_any_dtype(parsed)
    if is_datetime and parsed.notna().sum() > len(df) * DATE_PARSE_MIN_RATIO:
        df[c+'_dt'] = parsed
        parsed_date_cols.append(c+'_dt')
        print('Parsed date column:', c, '->', c+'_dt')

# 3) Add date features from the columns parsed above. Only those are used (not
#    every column whose name ends in '_dt') because the .dt accessor fails on a
#    non-datetime column that merely carries that suffix.
for c in parsed_date_cols:
    df[c+'_year'] = df[c].dt.year
    df[c+'_month'] = df[c].dt.month
    df[c+'_day'] = df[c].dt.day
    df[c+'_weekday'] = df[c].dt.weekday

# 4) Remove obvious ID columns (all unique values). Float columns are kept: a
#    continuous measurement is normally unique per row without being an identifier.
n_rows = len(df)
id_cols = [
    c for c in df.columns
    if c not in _protected
    and not pd.api.types.is_float_dtype(df[c])
    and df[c].nunique() == n_rows
]
for c in id_cols:
    print('Dropping unique-id column:', c)
df = df.drop(columns=id_cols)

print('Post-clean shape:', df.shape)
df.head()

In [None]:
# ====== Prepare X and y ======
assert 'TARGET' in globals() and TARGET is not None, 'Set TARGET variable first.'
assert TARGET in df.columns, f'TARGET {TARGET!r} is not a column of df.'

# Rows without a target value cannot be used for supervised training, and the
# estimators reject NaN in y, so they are removed from both X and y.
has_target = df[TARGET].notna()
if not has_target.all():
    print('Dropping rows with missing target:', int((~has_target).sum()))
y = df.loc[has_target, TARGET]
X = df.loc[has_target].drop(columns=[TARGET])

# Keep numeric and categorical separately
num_features = X.select_dtypes(include=['number']).columns.tolist()
cat_features = X.select_dtypes(include=['object','category']).columns.tolist()
print('Numeric features:', num_features[:10])
print('Categorical features:', cat_features[:10])

# Simple frequency filter for high-cardinality categoricals (one-hot encoding
# them would create one column per distinct value)
HIGH_CARD_THRESH = 100
cat_features = [c for c in cat_features if X[c].nunique() <= HIGH_CARD_THRESH]
print(f'Filtered categoricals (<={HIGH_CARD_THRESH} unique):', cat_features)

# Fill small missing values for numeric and categorical
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Pipelines
num_pipeline = Pipeline([('imputer', num_imputer), ('scaler', StandardScaler())])
# Categoricals must be turned into numbers before they reach the forest, which
# cannot fit on string columns. handle_unknown='ignore' encodes categories that
# appear only at prediction time as all zeros instead of raising.
cat_pipeline = Pipeline([
    ('imputer', cat_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns in neither list (datetimes, high-cardinality text, ...) are dropped.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
], remainder='drop')

print('Preprocessor ready.')

In [None]:
# ====== Train/Test split ======
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

# Fit preprocessor on train only, so no statistics leak from the test rows
preprocessor.fit(X_train)
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)

# Decide model type: regression vs classification.
# Non-numeric targets are classification. Boolean and two-valued integer targets
# (0/1 flags) are treated as classification too: a regressor would report
# MSE/R2 on what is really a class label.
_target_is_flag = pd.api.types.is_bool_dtype(y) or (
    pd.api.types.is_integer_dtype(y) and y.nunique() <= 2
)
is_regression = pd.api.types.is_numeric_dtype(y_train) and not _target_is_flag
print('Problem type inferred:', 'Regression' if is_regression else 'Classification')

# Same hyper-parameters for both problem types; only the estimator class differs.
forest_cls = RandomForestRegressor if is_regression else RandomForestClassifier
model = forest_cls(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train_trans, y_train)
preds = model.predict(X_test_trans)

if is_regression:
    print('MSE:', mean_squared_error(y_test, preds))
    print('R2:', r2_score(y_test, preds))
else:
    print('Accuracy:', accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))

# Save pipeline (preprocessor + model). The feature column list and target name
# are stored alongside so the artifact can be used from a fresh kernel, without
# relying on the X DataFrame still being in memory.
artifact_path = 'universal_supply_chain_model.joblib'
joblib.dump(
    {
        'preprocessor': preprocessor,
        'model': model,
        'feature_columns': list(X.columns),
        'target': y.name,
    },
    artifact_path,
)
print('Saved model pipeline to', artifact_path)

In [None]:
# ====== Quick evaluation plots ======
# Classification: confusion matrix of the test predictions.
# Regression: true-vs-predicted scatter followed by a residual histogram.
# Any failure while plotting is reported as a message instead of being raised.
try:
    if not is_regression:
        # Confusion matrix
        from sklearn.metrics import ConfusionMatrixDisplay
        ConfusionMatrixDisplay.from_predictions(y_test, preds)
        plt.show()
    else:
        # Points near the diagonal are well-predicted rows.
        plt.figure(figsize=(6,5))
        plt.scatter(y_test, preds, alpha=0.5)
        plt.gca().set(xlabel='True', ylabel='Predicted', title='True vs Predicted')
        plt.grid(True)
        plt.show()

        # Distribution of the prediction errors (true minus predicted).
        residuals = y_test - preds
        plt.figure(figsize=(6,4))
        sns.histplot(residuals, kde=True)
        plt.title('Residuals')
        plt.show()
except Exception as e:
    print('Plotting failed:', e)


In [None]:
# ====== Predict on new rows (example) ======
# Provide a small sample or load another CSV to predict on
PRED_PATH = None  # set to 'new_data.csv' to load

# Date parts the cleaning cell derives from every parsed '<col>_dt' column.
DATE_PARTS = ('year', 'month', 'day', 'weekday')


def add_expected_date_features(frame, expected_cols):
    """Return a copy of ``frame`` with the engineered date columns it is missing.

    The cleaning cell adds ``<col>_dt`` (parsed datetime) and
    ``<col>_dt_year`` / ``_month`` / ``_day`` / ``_weekday`` to the training
    frame. A raw CSV loaded for prediction has none of them, so each expected
    column of that form is rebuilt here from ``<col>_dt`` or, failing that,
    from the source column ``<col>``.

    Parameters
    ----------
    frame : pd.DataFrame
        Newly loaded rows; not modified.
    expected_cols : iterable of str
        Column names the model was trained with.

    Returns
    -------
    pd.DataFrame
        Copy of ``frame`` plus every derivable column. Expected columns that
        cannot be derived are left absent for the caller to report.
    """
    out = frame.copy()
    for col in expected_cols:
        if col in out.columns:
            continue
        stem, _, part = col.rpartition('_')
        if part in DATE_PARTS and stem.endswith('_dt'):
            dt_col, source = stem, stem[:-3]
        elif col.endswith('_dt'):
            dt_col, source, part = col, col[:-3], None
        else:
            continue  # not an engineered date column
        if dt_col in out.columns:
            parsed = pd.to_datetime(out[dt_col], errors='coerce')
        elif source in out.columns:
            parsed = pd.to_datetime(out[source], errors='coerce')
        else:
            continue  # nothing to derive it from
        out[col] = parsed if part is None else getattr(parsed.dt, part)
    return out


if PRED_PATH:
    new_df = pd.read_csv(PRED_PATH)
    # joblib.load unpickles arbitrary objects: only load artifacts you created yourself.
    pp = joblib.load(artifact_path)

    # Prefer the column list stored in the artifact; artifacts saved without it
    # fall back to the training frame still held in this kernel.
    expected_cols = pp.get('feature_columns')
    if expected_cols is None:
        expected_cols = X.columns
    expected_cols = list(expected_cols)

    new_df = add_expected_date_features(new_df, expected_cols)
    missing_cols = [c for c in expected_cols if c not in new_df.columns]
    if missing_cols:
        raise KeyError(f'{PRED_PATH} is missing columns required by the model: {missing_cols}')

    X_new = new_df[expected_cols]  # ensure same columns, same order
    X_new_trans = pp['preprocessor'].transform(X_new)
    new_preds = pp['model'].predict(X_new_trans)
    print('Predictions for new data:', new_preds[:10])
else:
    print('No PRED_PATH provided. To predict, set PRED_PATH to a new CSV file path and re-run.')


In [None]:
# ====== (Colab only) Download the trained model
# The import and the download are handled separately so that a failed download
# inside Colab is reported as such rather than as "running locally".
try:
    from google.colab import files
except ImportError:
    print('If running locally, the artifact is saved at', artifact_path)
else:
    try:
        files.download(artifact_path)
    except Exception as err:
        # Best effort: the file is still on disk even if the browser download failed.
        print('Download failed:', err)
        print('The artifact is saved at', artifact_path)
