In [None]:
!git clone https://github.com/broccubali/DLG-Assignments.git

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import torch
import networkx as nx

In [None]:
train_data = pd.read_csv('/kaggle/input/trainingdata/train_stock_data.csv')
train_data['Date'] = pd.to_datetime(train_data['Date'], format='%Y-%m-%d')
train_data.sort_values(['Ticker', 'Date'], inplace=True)

In [None]:
validation_data = pd.read_csv('/kaggle/input/validationdata/validation_stock_data.csv')
with open('/kaggle/input/hypergraph/hyperedges.json', 'r') as f:
    hyperedges = json.load(f)
with open('/kaggle/input/blindtesting/blind_test_cases.json', 'r') as f:
    test_cases = json.load(f)

In [None]:
print(f"Training data shape: {train_data.shape}")
print(f"Validation data shape: {validation_data.shape}")
print(f"Number of hyperedges: {len(hyperedges)}")
print(f"Number of test cases: {len(test_cases)}")
print("Training data columns:")
print(train_data.columns)

In [None]:
print("Hyperedges structure:")
print(hyperedges)

print(type(hyperedges))

print("Available tickers in training data:", train_data['Ticker'].unique())

print("Test case example:")
test_cases[0]

In [None]:
l = []
tickers = train_data['Ticker'].unique()
for ticker in tickers:
    data = train_data[train_data["Ticker"] == ticker]
    valid_cols = []
    for i in data.columns:
        if data[i].isna().sum() == 0:
            valid_cols.append(i)
    a = data[valid_cols].values
    for i in a:
        l.append(i)
train_df = pd.DataFrame(l, columns=["Date", "Ticker", "Open", "High", "Low", "Close", "Volume"])
train_df

In [None]:
ticker_to_sector = {ticker: sector for sector, tickers in hyperedges.items() for ticker in tickers}

In [None]:
train_df["Sector"] = train_df["Ticker"].map(ticker_to_sector)

# Sort by Ticker and Date to compute features
train_df = train_df.sort_values(["Ticker", "Date"])

# Create daily return
train_df["Return_1d"] = train_df.groupby("Ticker")["Open"].pct_change()

# Create Moving Averages (5-day, 10-day)
train_df["MA_5"] = train_df.groupby("Ticker")["Open"].transform(lambda x: x.rolling(5).mean())
train_df["MA_10"] = train_df.groupby("Ticker")["Open"].transform(lambda x: x.rolling(10).mean())

# Create Volatility (rolling std)
train_df["Volatility_5"] = train_df.groupby("Ticker")["Open"].transform(lambda x: x.rolling(5).std())

In [None]:
train_df.head(20)

In [None]:
sector_return = train_df.groupby(["Date", "Sector"])["Return_1d"].mean().reset_index()
sector_return.rename(columns={"Return_1d": "Sector_Avg_Return"}, inplace=True)

train_df = pd.merge(train_df, sector_return, on=["Date", "Sector"], how="left")

In [None]:
train_df["Target"] = train_df.groupby("Ticker")["Return_1d"].shift(-1)
train_df["Target"] = (train_df["Target"] > 0).astype(int)

In [None]:
features = ["Return_1d", "MA_5", "MA_10", "Volatility_5", "Sector_Avg_Return"]
train_df_model = train_df.dropna(subset=features + ["Target"])

X = train_df_model[features]
y = train_df_model["Target"]

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X, y)

In [None]:
train_df

In [None]:
# Step 1: Add sector to validation data
validation_data["Sector"] = validation_data["Ticker"].map(ticker_to_sector)

# Step 2: Sort for rolling features
validation_data = validation_data.sort_values(["Ticker", "Date"])

# Step 3: Compute features
validation_data["Return_1d"] = validation_data.groupby("Ticker")["Open"].pct_change()
validation_data["MA_5"] = validation_data.groupby("Ticker")["Open"].transform(lambda x: x.rolling(5).mean())
validation_data["MA_10"] = validation_data.groupby("Ticker")["Open"].transform(lambda x: x.rolling(10).mean())
validation_data["Volatility_5"] = validation_data.groupby("Ticker")["Open"].transform(lambda x: x.rolling(5).std())

# Step 4: Combine train + val for sector return calculation
combined = pd.concat([train_df, validation_data], axis=0)
sector_return = combined.groupby(["Date", "Sector"])["Return_1d"].mean().reset_index()
sector_return.rename(columns={"Return_1d": "Sector_Avg_Return"}, inplace=True)

# Step 5: Merge sector return into validation set
validation_data = pd.merge(validation_data, sector_return, on=["Date", "Sector"], how="left")

# Step 6: Set target (e.g., next day's Close price)
validation_data["Target"] = validation_data.groupby("Ticker")["Close"].shift(-1)

# Step 7: Drop NaNs
val_df_model = validation_data.dropna(subset=features + ["Target"])

In [None]:
validation_data

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train on train_data (you already processed `df`)
train_df_model = train_df.dropna(subset=features + ["Close"])  # or Target if predicting next-day

X_train = train_df_model[features]
y_train = train_df_model["Close"]  # or "Target"

X_val = val_df_model[features]
y_val = val_df_model["Target"]

regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

# Predict on validation
y_pred = regressor.predict(X_val)

# Evaluate
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

rmse = mean_squared_error(y_val, y_pred, squared=False)
mape = mean_absolute_percentage_error(y_val, y_pred)

print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAPE: {mape:.4f}")

In [None]:
# Convert test_cases into a DataFrame
test_df = pd.DataFrame(test_cases)
test_df["Date"] = pd.to_datetime(test_df["date"])
test_df["Ticker"] = test_df["ticker"]
test_df = test_df[["Date", "Ticker"]]

# Add sector info
test_df["Sector"] = test_df["Ticker"].map(ticker_to_sector)

# Combine train + validation to get all available history
full_history = pd.concat([train_data, validation_data])
full_history["Date"] = pd.to_datetime(full_history["Date"])
full_history = full_history.sort_values(["Ticker", "Date"])

# Add sector
full_history["Sector"] = full_history["Ticker"].map(ticker_to_sector)

# Feature engineering on full history
full_history["Return_1d"] = full_history.groupby("Ticker")["Open"].pct_change()
full_history["MA_5"] = full_history.groupby("Ticker")["Open"].transform(lambda x: x.rolling(5).mean())
full_history["MA_10"] = full_history.groupby("Ticker")["Open"].transform(lambda x: x.rolling(10).mean())
full_history["Volatility_5"] = full_history.groupby("Ticker")["Open"].transform(lambda x: x.rolling(5).std())

# Sector average returns
sector_return = full_history.groupby(["Date", "Sector"])["Return_1d"].mean().reset_index()
sector_return.rename(columns={"Return_1d": "Sector_Avg_Return"}, inplace=True)
full_history = pd.merge(full_history, sector_return, on=["Date", "Sector"], how="left")

In [None]:
test_df.rename(columns={
    'date': 'Date',
    'ticker': 'Ticker',
    'sector': 'Sector'
}, inplace=True)

In [None]:
# Ensure consistent column names
train_df.columns = train_df.columns.str.lower()
test_df.columns = test_df.columns.str.lower()
features = [f.lower() for f in features]

# Define function to get the most recent features from train_df
def get_latest_features(row):
    df = train_df[(train_df['ticker'] == row['ticker']) & (train_df['date'] < row['date'])]
    if df.empty:
        return pd.Series({f: np.nan for f in features})
    latest = df.iloc[-1]
    return latest.reindex(features)

# Apply the function to each row in test_df
test_features = test_df.apply(get_latest_features, axis=1)

# Concatenate the extracted features with the original test_df
test_ready = pd.concat([test_df, test_features], axis=1)

# Drop rows where we couldn't find historical features
test_ready = test_ready.dropna(subset=features)

In [32]:
# Define features in lowercase
features = ["return_1d", "ma_5", "ma_10", "volatility_5", "sector_avg_return"]

# Function to get latest available features from full history before the test date
def get_latest_features(row):
    df = full_history[(full_history["Ticker"] == row["Ticker"]) & (full_history["date"] < row["date"])]
    if df.empty:
        return pd.Series({f: np.nan for f in features})
    latest = df.iloc[-1]
    return latest[features]

# Apply feature extraction to test_df
test_features = test_df.apply(get_latest_features, axis=1)

# Combine features with original test set
test_ready = pd.concat([test_df, test_features], axis=1)

# Drop rows where any feature is missing
test_ready = test_ready.dropna(subset=features)

# --- Predict using the regressor ---

# If regressor was trained on capitalized feature names, fix this:
X_test = test_ready[features].copy()
X_test.columns = [f.title().replace("_", "_") if "_" in f else f.capitalize() for f in features]  # Match training time casing

# Predict
test_ready["predicted_close"] = regressor.predict(X_test)

KeyError: 'Ticker'

In [None]:
print("Test columns:", test_df.columns.tolist())