In [4]:
import pandas as pd

In [26]:
# Load the daily corn prices. The 'date' column is read as text and parsed with
# an explicit day/month/year format after stripping surrounding whitespace.
daily_corn = pd.read_csv("corn_prices_daily.csv", dtype={"date": "string"})
daily_corn["date"] = pd.to_datetime(daily_corn["date"].str.strip(), format="%d/%m/%Y")
# Stable sort: rows sharing a date keep their file order, so a later
# "keep the last record per date" de-duplication stays deterministic
# (the default quicksort gives no ordering guarantee among ties).
daily_corn = daily_corn.sort_values("date", kind="mergesort").reset_index(drop=True)
# Load the weekly table (file name suggests SPI-6 values and probabilities).
spi_and_prob = pd.read_csv("spi6_and_probabilities.csv", parse_dates=["date"])


In [27]:
# =========================
# Merge the two datasets on the dates of the weekly drought probabilities, then check for missing
# values. If a value is missing, fill it from the last available date; the furthest back allowed is
# 4 days (data is never available on Sunday, so the earliest allowed date is Wednesday). Show in a table how many times the fallback is needed.
# =========================
# spi_and_prob (a.k.a. weekly_probs_h1) : DataFrame with Sunday 'date' + state prob columns (e.g., p_state0_h1, ...)
# daily_corn                            : DataFrame with business-day 'date' + price columns (e.g., 'close', ...)
# If the dates are in the index instead of a column, the helper below lifts them into a column.

def ensure_date_col(df: pd.DataFrame, date_col: str = "date") -> pd.DataFrame:
    """Return a copy of ``df`` that has ``date_col`` as a datetime column.

    If ``date_col`` is not among the columns but the index is a
    ``DatetimeIndex``, the index is lifted into a column called ``date_col``
    whatever the index was named before. The input frame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the dates either in a column or in a DatetimeIndex.
    date_col : str, default "date"
        Name of the date column in the result.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` in which ``date_col`` has been passed through
        ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If ``date_col`` is not a column and the index is not a DatetimeIndex.
    """
    out = df.copy()
    if date_col not in out.columns:
        if not isinstance(out.index, pd.DatetimeIndex):
            raise KeyError(
                f"'{date_col}' is not a column and the index is not a DatetimeIndex"
            )
        # Name the index first: reset_index() emits the index under its own
        # name ("index" only when unnamed), so renaming "index" afterwards
        # would miss any DatetimeIndex that already carries another name.
        out = out.rename_axis(date_col).reset_index()
    out[date_col] = pd.to_datetime(out[date_col])
    return out

# Normalize inputs: both frames get a datetime 'date' column and are ordered by it.
wp = ensure_date_col(spi_and_prob, "date").sort_values("date")
# Stable sort, so that among rows sharing a date the original row order is
# preserved and keep="last" below really keeps the last record as loaded
# (the default quicksort may reorder ties).
dc = ensure_date_col(daily_corn, "date").sort_values("date", kind="mergesort")

# (Optional) deduplicate daily prices per date, keep the last record if duplicates exist
dc = dc.drop_duplicates(subset=["date"], keep="last")

# Identify price columns (everything except 'date')
price_cols = [c for c in dc.columns if c != "date"]
if not price_cols:
    raise ValueError("No price columns found in daily_corn (expected at least one non-'date' column).")

# Working copy of the prices: 'date' first, then every price column.
use_dc = dc[["date"] + price_cols].copy()

# =================================
# Dataset A: exact Friday (Sun-2d)
# =================================
# Each weekly row looks up the price dated exactly two days before its own date.
A = wp.assign(date_for_price2=wp["date"] - pd.Timedelta(days=2))

# Re-key the prices on the shifted date so a plain left join lines them up.
use_dc2 = use_dc.rename(columns={"date": "date_for_price2"})
A = A.merge(use_dc2, on="date_for_price2", how="left")
A = A.sort_values("date").reset_index(drop=True)

# IMPORTANT: this flag is taken BEFORE any fallback fill.
# A row counts as "has price" when at least one price column is non-missing.
A["fri_has_price"] = ~A[price_cols].isna().all(axis=1)

# Persist A: only the shifted join key is dropped; the fri_has_price flag
# stays in the saved file alongside the date, the weekly columns and the prices.
A_out = A.drop(columns="date_for_price2")
A_out.to_csv("joined_sunday_minus2_exact.csv", index=False)

# ======================================================
# Dataset B: Friday or last available (≤ Sunday - 4 days)
# ======================================================
# Relies on A, price_cols, wp and use_dc from the steps above.

# Carry each price row's own date along, so the as-of join can report which
# day was actually matched.
use_dc_asof = use_dc.assign(price_date=use_dc["date"])

# Backward as-of join: latest price dated at or before the weekly date,
# accepted only when it is at most 4 days older.
asof_tmp = pd.merge_asof(
    wp[["date"]].sort_values("date"),
    use_dc_asof.sort_values("date"),
    on="date",
    direction="backward",
    tolerance=pd.Timedelta(days=4),
)

# Suffix the as-of columns so they can sit next to the exact-Friday ones.
asof_cols = [f"{c}__asof" for c in price_cols]
asof_fill = asof_tmp.rename(
    columns={**dict(zip(price_cols, asof_cols)), "price_date": "price_date__asof"}
)

B = A.merge(asof_fill[["date", "price_date__asof", *asof_cols]], on="date", how="left")

# Keep the Friday value where it exists; otherwise take the as-of value
# (which is NaN when nothing lies within the 4-day window).
friday_ok = B["fri_has_price"]
for c, c_asof in zip(price_cols, asof_cols):
    B[c] = B[c].mask(~friday_ok, B[c_asof])

# Date of the price that ended up in the row: the Friday key, or the as-of date.
B["price_date_used"] = B["date_for_price2"].where(friday_ok, B["price_date__asof"])

# Whole days between the weekly date and the price date (NaN when no price was found).
B["fallback_days"] = (B["date"] - B["price_date_used"]).dt.days

# Strip the helper columns, restore date order and save.
drop_cols = asof_cols + ["price_date__asof", "date_for_price2", "fri_has_price"]
B_out = B.drop(columns=drop_cols).sort_values("date").reset_index(drop=True)
B_out.to_csv("joined_sunday_minus2_or_last_max4d.csv", index=False)

# Sanity check: the tolerance above should leave no row further than 4 days back.
n_beyond_window = int((B_out["fallback_days"] > 4).sum())
print("Rows with fallback_days > 4:", n_beyond_window)
print("Saved:", "joined_sunday_minus2_or_last_max4d.csv")

# ==========================
# NaN summary & comparisons
# ==========================
def nan_summary(df: pd.DataFrame, cols: list[str] | None = None) -> pd.DataFrame:
    if cols is None:
        cols = df.columns.tolist()
    nn = df[cols].isna().sum()
    pp = (df[cols].isna().mean() * 100).round(2)
    return pd.DataFrame({"n_nan": nn, "pct_nan": pp}).sort_values("n_nan", ascending=False)

# Print the missing-value report for both joined datasets, each followed by a blank line.
for heading, frame in (
    ("NaN summary (ALL columns) — A (exact Fri):", A_out),
    ("NaN summary (ALL columns) — B (fallback):", B_out),
):
    print(heading)
    print(nan_summary(frame), end="\n\n")

# ======================
# fallback analysis table
# ======================
def fallback_counts_table(df: pd.DataFrame, col: str = "fallback_days",
                          values=(2, 3, 4)) -> pd.DataFrame:
    """Tabulate how often each value in ``values`` occurs in ``df[col]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the column to tabulate.
    col : str, default "fallback_days"
        Column to count; entries that cannot be read as numbers become NaN.
    values : sequence, default (2, 3, 4)
        Values to report, in this order. Values that never occur get count 0.

    Returns
    -------
    pd.DataFrame
        Indexed by ``values`` with ``count`` and ``pct``. ``pct`` is the count
        as a percentage of ALL non-missing entries of the column (not only
        those listed in ``values``), rounded to two decimals.
    """
    days = pd.to_numeric(df[col], errors="coerce")  # non-numeric entries -> NaN
    n_valid = days.notna().sum()
    wanted = days[days.isin(values)]
    tally = wanted.value_counts().reindex(values, fill_value=0).rename("count")
    return tally.to_frame().assign(
        pct=lambda t: (t["count"] / n_valid * 100).round(2)
    )

# Fallback table for dataset B: how often the price is 2, 3 or 4 days older than the weekly date.
table = fallback_counts_table(B_out, col="fallback_days", values=(2, 3, 4))
print(table)

Rows with fallback_days > 4: 0
Saved: joined_sunday_minus2_or_last_max4d.csv
NaN summary (ALL columns) — A (exact Fri):
               n_nan  pct_nan
Last Price        84     4.79
spi_6              0     0.00
date               0     0.00
p_state0_h1        0     0.00
p_state1_h1        0     0.00
p_state2_h1        0     0.00
fri_has_price      0     0.00

NaN summary (ALL columns) — B (fallback):
                 n_nan  pct_nan
price_date_used      2     0.11
fallback_days        2     0.11
Last Price           2     0.11
date                 0     0.00
p_state1_h1          0     0.00
p_state0_h1          0     0.00
spi_6                0     0.00
p_state2_h1          0     0.00

               count    pct
fallback_days              
2               1668  95.31
3                 62   3.54
4                 20   1.14


In [28]:
# Simple return of the weekly price, previous row -> current row (t-1 -> t).
# The guard makes the cell safe to re-run: without it every execution would
# recompute the column and drop one more leading row.
if "ret_1" not in B_out.columns:
    # Missing prices are forward-filled explicitly before differencing. This
    # gives the same numbers as pct_change()'s implicit fill_method='pad'
    # default, which pandas has deprecated; note it means a week without a
    # price gets a return of 0. The 'Last Price' column itself is left as is.
    B_out = (
        B_out.assign(ret_1=B_out["Last Price"].ffill().pct_change(fill_method=None))
        .iloc[1:]   # the first row has no previous price, hence no return
        .copy()     # own the data so later column assignments do not hit a slice
    )
B_out

  B_out["ret_1"] = B_out["Last Price"].pct_change()   # past return t-1 -> t


Unnamed: 0,date,spi_6,p_state0_h1,p_state1_h1,p_state2_h1,Last Price,price_date_used,fallback_days,ret_1
1,1992-01-12,-0.676211,0.188472,0.778493,0.033035,2.43,1992-01-10,2.0,0.016736
2,1992-01-19,-0.408163,0.042357,0.918067,0.039576,2.48,1992-01-17,2.0,0.020576
3,1992-01-26,-0.149762,0.031910,0.927547,0.040544,2.55,1992-01-24,2.0,0.028226
4,1992-02-02,-0.012210,0.031731,0.926709,0.041559,2.57,1992-01-31,2.0,0.007843
5,1992-02-09,-0.154111,0.031869,0.927580,0.040551,2.57,1992-02-07,2.0,0.000000
...,...,...,...,...,...,...,...,...,...
1747,2025-06-29,-0.317970,0.047144,0.919293,0.033563,4.00,2025-06-27,2.0,-0.014778
1748,2025-07-06,-0.346593,0.053727,0.912975,0.033298,4.10,2025-07-03,3.0,0.025000
1749,2025-07-13,-0.113659,0.039652,0.926234,0.034114,3.87,2025-07-11,2.0,-0.056098
1750,2025-07-20,0.110532,0.035815,0.928811,0.035374,3.95,2025-07-18,2.0,0.020672


In [None]:

# Stationarity check on the daily 'Last Price' series; prints the first two
# entries of the result (test statistic and p-value).
# NOTE(review): `adfuller` and `stats` are not imported in any cell visible
# above — presumably statsmodels.tsa.stattools.adfuller and scipy.stats;
# confirm they are imported before this cell runs on a fresh kernel.
# NOTE(review): the series is passed as-is; confirm it has no missing values.
result_adf = adfuller(daily_corn['Last Price'])
print('ADF Statistic:', result_adf[0])
print('p-value:', result_adf[1])

# Jarque-Bera normality check on the same series; prints statistic and p-value.
result_jb = stats.jarque_bera(daily_corn['Last Price'])
print('JB Statistic:', result_jb[0])
print('p-value:', result_jb[1])

# This notebook builds different models for predicting corn prices

The models are always a simple logistic regression and an LSTM; only the inputs change:
1.   Only price data
2.   Price and precipitation (prcp) data
3.   Price and SPI data
4.   Price, SPI and HMM data



## 1. Only price data

### 1.1 Logistic regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# --- Synthetic example data --------------------------------------------
# Fixed seed so the random draw (and everything printed below) is repeatable.
np.random.seed(42)
n = 200

X = np.random.randn(n, 2)                  # two standard-normal features per sample
y = (X[:, 0] + X[:, 1] > 0).astype(int)    # label 1 when the two features sum to > 0

# --- Train/test split: 30% of the samples are held out -----------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# --- Fit the logistic regression on the training part ------------------
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# --- Evaluate on the held-out part -------------------------------------
y_pred = logreg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
