In [None]:
# Mount Google Drive at /content/drive; every path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder on Drive that the cells below read their inputs from and write their outputs to.
drive_folder="/content/drive/MyDrive/take3"

In [None]:
import pandas as pd
import yfinance as yf
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from tqdm import tqdm
import os
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
# ---- Build (or load from cache) the per-ticker train/val/test price datasets ----
df = pd.read_csv('/content/drive/MyDrive/take3/ind_nifty500list_filtered_final.csv')
for sub_dir in ("", "/train", "/test", "/val"):
    os.makedirs(f"{drive_folder}/stock_data{sub_dir}", exist_ok=True)

valid_tickers_file = f"{drive_folder}/valid_tickers.csv"
pkl_file_path = f"{drive_folder}/stock_data/nifty500_stock_data.pkl"

nifty500_stock_data = {}
valid_tickers = []
invalid_tickers = []

# yfinance expects NSE symbols with the ".NS" suffix.
df_tickers = df['Symbol'] + '.NS'

if os.path.exists(pkl_file_path):
    print("Loading existing stock data from Google Drive...")
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote itself.
    with open(pkl_file_path, 'rb') as file:
        nifty500_stock_data = pickle.load(file)
    print("Stock data loaded successfully.")
    # Reuse the ticker list written by the run that produced the cache. Nothing is
    # written in this branch, so a cached run can no longer overwrite
    # valid_tickers.csv with an empty list.
    if os.path.exists(valid_tickers_file):
        valid_tickers = pd.read_csv(valid_tickers_file)['Symbol'].tolist()
else:
    print("No data found so Fetching new stock data...")
    ticker_data_lengths = {}

    for ticker in tqdm(df_tickers[:500], desc="Downloading "):
        try:
            stock = yf.Ticker(ticker)
            company_history = stock.history(start="2022-01-10", end="2025-01-11")

            if company_history.empty:
                invalid_tickers.append(ticker)
                print(f"No data available for {ticker}.")
                continue

            # NOTE(review): a ticker is recorded as valid as soon as it returns any
            # data, even if it is rejected below for having too few rows. It can
            # therefore appear in both lists; confirm this is what downstream
            # cells (which size matrices from valid_tickers.csv) expect.
            valid_tickers.append(ticker.replace('.NS', ''))
            if hasattr(company_history.index, "tz_localize"):
                company_history.index = company_history.index.tz_localize(None)

            # Reset index and keep only the date and closing price columns
            company_history = company_history.reset_index().drop(columns=['Dividends', 'Stock Splits'], errors='ignore')
            company_history = company_history.drop(columns=['Open', 'High', 'Low', 'Volume'], errors='ignore')
            ticker_data_lengths[ticker] = len(company_history)
            print(f"📊 {ticker} has {len(company_history)} days of data.")

            # Label = 1 when the next close is higher than today's close.
            # The final row has no next close (shift(-1) is NaN, the comparison is
            # False), so its label is always 0 and carries no information. The
            # former dropna(subset=['Label']) was a no-op after astype(int) and
            # has been removed; the row count is unchanged.
            company_history['Label'] = (company_history['Close'].shift(-1) > company_history['Close']).astype(int)

            # Feature Engineering: daily simple return; the first row's NaN becomes 0.
            company_history['Return'] = company_history['Close'].pct_change()
            company_history = company_history.fillna(0)

            # Normalize features
            price_features = ['Close', 'Return']

            # NOTE(review): the scaler is fitted on the full history before the
            # split, so validation/test statistics leak into the training inputs.
            scaler_price = StandardScaler()
            company_history['nor_Close'] = scaler_price.fit_transform(company_history[['Close']])

            # Split the most recent 742 rows chronologically: 400 train, 160 val, 182 test.
            total_required = 400 + 160 + 182  # 742 days
            if len(company_history) < total_required:
                print(f"❌ Not enough data for {ticker}. Only {len(company_history)} days available.")
                invalid_tickers.append(ticker)
                continue

            company_history = company_history.iloc[-total_required:]
            train_data = company_history.iloc[:400]
            val_data = company_history.iloc[400:560]
            test_data = company_history.iloc[560:]

            # Save in dictionary
            nifty500_stock_data[ticker] = {
                'train': train_data,
                'validation': val_data,
                'test': test_data
            }

            # Save individual stock CSVs as <SYMBOL>_<split>.csv
            symbol = ticker.replace('.NS', '')
            train_data.to_csv(f"{drive_folder}/stock_data/train/{symbol}_train.csv", index=False)
            val_data.to_csv(f"{drive_folder}/stock_data/val/{symbol}_val.csv", index=False)
            test_data.to_csv(f"{drive_folder}/stock_data/test/{symbol}_test.csv", index=False)
        except Exception as e:
            invalid_tickers.append(ticker)
            print(f"Error fetching data for {ticker}: {e}")

    # Persist results only after a fresh download.
    # Save valid tickers list
    valid_tickers_df = pd.DataFrame(valid_tickers, columns=['Symbol'])
    valid_tickers_df.to_csv(valid_tickers_file, index=False)
    # Save stock data dictionary
    with open(pkl_file_path, 'wb') as file:
        pickle.dump(nifty500_stock_data, file)
    print("\n✅ Stock data processing complete and saved to Google Drive.")

# Summary
print(f"✅ Valid tickers: {len(valid_tickers)}")
print(f"❌ Invalid tickers: {len(invalid_tickers)}")
print(f"❌ Invalid tickers list: {invalid_tickers}")

In [None]:
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of consecutive normalized closes fed to the LSTM per sample.
SEQ_LENGTH = 30

# Folders holding the per-ticker <SYMBOL>_<split>.csv files.
drive_folder = "/content/drive/MyDrive/take3"
train_folder = f"{drive_folder}/stock_data/train"
val_folder = f"{drive_folder}/stock_data/val"
test_folder = f"{drive_folder}/stock_data/test"

# Pickle files the per-split LSTM embeddings are written to.
train_embeddings_path = f"{drive_folder}/LSTM_embeddings/lstm_train_embeddings.pkl"
val_embeddings_path = f"{drive_folder}/LSTM_embeddings/lstm_val_embeddings.pkl"
test_embeddings_path = f"{drive_folder}/LSTM_embeddings/lstm_test_embeddings.pkl"

# Load Stock Data
def load_stock_data(folder_path):
    """Read every ``.csv`` file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping each file name with ``.csv`` removed (for example
        ``"ABC_train"``) to the DataFrame loaded from that file. Files with
        other extensions are ignored.
    """
    csv_names = (name for name in os.listdir(folder_path) if name.endswith('.csv'))
    return {
        name.replace('.csv', ''): pd.read_csv(os.path.join(folder_path, name))
        for name in csv_names
    }

# One dict per split, keyed by "<SYMBOL>_<split>" (the CSV file name without ".csv").
train_data = load_stock_data(train_folder)
val_data = load_stock_data(val_folder)
test_data = load_stock_data(test_folder)

class StockDataset(Dataset):
    """Sliding-window next-value dataset over each stock's ``nor_Close`` column.

    Every sample pairs ``SEQ_LENGTH`` consecutive normalized closes with the
    value that immediately follows them. Windows never span two stocks.
    """

    def __init__(self, stock_data):
        """Build all windows up front.

        Args:
            stock_data: dict mapping a ticker key to a DataFrame that has a
                ``nor_Close`` column.
        """
        self.data = []
        for frame in stock_data.values():
            series = frame['nor_Close'].values
            # A series no longer than SEQ_LENGTH yields an empty range, hence no samples.
            self.data.extend(
                (series[start:start + SEQ_LENGTH], series[start + SEQ_LENGTH])
                for start in range(len(series) - SEQ_LENGTH)
            )

    def __len__(self):
        """Return the total number of windows across all stocks."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(window, target)`` as float32 tensors; ``window`` has ``SEQ_LENGTH`` elements."""
        window, target = self.data[idx]
        window_tensor = torch.tensor(window, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.float32)
        return window_tensor, target_tensor

# Create dataset and dataloaders: training batches are shuffled, validation
# batches keep their order.
train_dataset = StockDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataset = StockDataset(val_data)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define LSTM Model
class StockLSTM(nn.Module):
    """Stacked LSTM with a linear head that predicts one value per sequence.

    The last layer's final hidden state serves both as the input to the
    prediction head and as the sequence embedding.
    """

    def __init__(self, input_size=1, hidden_size=64, num_layers=2):
        """Create the LSTM (batch-first) and the ``hidden_size -> 1`` linear head."""
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the LSTM over ``x`` of shape [batch, seq_len, input_size].

        Returns:
            tuple ``(stock_price, h_n)`` where ``stock_price`` is [batch, 1] and
            ``h_n`` is the final hidden state of every layer,
            [num_layers, batch, hidden_size].
        """
        _, (h_n, _) = self.lstm(x)
        # The prediction is computed from the top layer's final hidden state.
        return self.fc(h_n[-1]), h_n

    @torch.no_grad()
    def get_embedding(self, x):
        """Return the top layer's final hidden state, [batch, hidden_size], without tracking gradients."""
        final_hidden = self.lstm(x)[1][0]
        return final_hidden[-1]

# Fresh LSTM with default sizes, placed on the selected device.
model = StockLSTM().to(device)

# Training Setup: mean-squared error on the next normalized close, Adam optimizer.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

from tqdm import tqdm  # Import tqdm
EPOCHS = 10
best_val_loss = float('inf')  # Track best validation loss
patience = 3  # Number of epochs to wait before stopping
no_improve_epochs = 0  # Counter for early stopping

# Use tqdm for tracking epochs
epoch_progress = tqdm(range(EPOCHS), desc="Training Progress", unit="epoch")

for epoch in epoch_progress:
    model.train()
    total_loss = 0

    for x_seq, y_seq in train_dataloader:
        # Add the feature axis expected by the LSTM: [batch, seq] -> [batch, seq, 1].
        x_seq, y_seq = x_seq.unsqueeze(-1).to(device), y_seq.to(device)
        optimizer.zero_grad()
        output, _= model(x_seq)
        # output is [batch, 1]; reshape the target to match before computing the loss.
        loss = criterion(output, y_seq.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = total_loss / len(train_dataloader)

    # ---------------- Validate the Model ----------------
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x_seq, y_seq in val_dataloader:
            x_seq, y_seq = x_seq.unsqueeze(-1).to(device), y_seq.to(device)
            output,_ = model(x_seq)
            loss = criterion(output, y_seq.unsqueeze(1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)

    # Update tqdm description for epoch progress
    epoch_progress.set_postfix(train_loss=avg_train_loss, val_loss=avg_val_loss)

    print(f"Epoch [{epoch+1}/{EPOCHS}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Check for best model and early stopping: the checkpoint on Drive always
    # holds the weights of the epoch with the lowest validation loss so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve_epochs = 0  # Reset counter
        torch.save(model.state_dict(), f"{drive_folder}/best_lstm_model.pth")
        print("✅ Best model saved!")
    else:
        no_improve_epochs += 1

    if no_improve_epochs >= patience:
        print("⏹️ Early stopping triggered!")
        break  # Stop training if validation loss doesn't improve

print("🎯 Training completed! Loading the best model...")

# Load Best Model Before Generating Embeddings: rebuild the architecture and
# restore the best checkpoint rather than using the last epoch's weights.
best_model = StockLSTM().to(device)

best_model.load_state_dict(torch.load(f"{drive_folder}/best_lstm_model.pth"))
best_model.eval()
def generate_embeddings(stock_data, model, save_path, shift_last_day=False):
    """Compute one LSTM embedding per stock and pickle the result.

    Args:
        stock_data: dict mapping a ticker key to a DataFrame with a
            ``nor_Close`` column.
        model: object exposing ``get_embedding(tensor)``; it is called with a
            [1, SEQ_LENGTH, 1] float32 tensor on ``device``.
        save_path: destination pickle file. Its parent directory is created
            when missing.
        shift_last_day: when False the last ``SEQ_LENGTH`` values are used;
            when True the window ends one row earlier, so the final row is
            left out of the input.

    Stocks with too few rows for the requested window are skipped. The pickle
    holds a dict mapping the ticker key to a NumPy embedding vector.
    """
    lstm_embeddings = {}

    with tqdm(stock_data.items(), total=len(stock_data), desc="Generating Embeddings") as t:
        for ticker, data in t:
            # Get normalized closing prices
            prices = data['nor_Close'].values

            if shift_last_day:
                # Window covers rows [-(SEQ_LENGTH + 1), -1): the final row is excluded.
                if len(prices) < (SEQ_LENGTH + 1):
                    continue  # skip stocks with too few days
                seq = prices[-(SEQ_LENGTH + 1):-1]
            else:
                if len(prices) < SEQ_LENGTH:
                    continue
                seq = prices[-SEQ_LENGTH:]

            # Convert to a [1, SEQ_LENGTH, 1] tensor (batch of one, one feature)
            stock_tensor = torch.tensor(seq, dtype=torch.float32).unsqueeze(0).unsqueeze(-1).to(device)

            # Drop only the batch axis so the embedding stays 1-D whatever its width.
            lstm_embedding = model.get_embedding(stock_tensor).squeeze(0).cpu().numpy()

            # Save embedding
            lstm_embeddings[ticker] = lstm_embedding

    # The target folder is not created anywhere else, so make sure it exists;
    # otherwise open() fails with FileNotFoundError on a fresh Drive folder.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Save embeddings to file
    with open(save_path, 'wb') as file:
        pickle.dump(lstm_embeddings, file)
    print(f"✅ LSTM embeddings saved to {save_path}")
# Train and validation embeddings use the last SEQ_LENGTH rows of each split;
# the test embeddings use a window that stops one row before the end.
print("\n🔹 Generating LSTM embeddings for Train set...")
generate_embeddings(train_data, best_model, train_embeddings_path)
print("\n🔹 Generating LSTM embeddings for Validation set...")
generate_embeddings(val_data, best_model, val_embeddings_path)
print("\n🔹 Generating LSTM embeddings for Test set (shifted)...")
generate_embeddings(test_data, best_model, test_embeddings_path, shift_last_day=True)

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ========== SET SEED ==========
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also switches cuDNN to deterministic mode and disables its benchmark
    auto-tuner.

    Args:
        seed: integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fix all random generators before any model or loader is created.
set_seed(1)

# ========== CONFIG ==========
# Inputs: LSTM embedding pickles and the per-split CSVs (for Return/Label).
# Output: the trained MLP checkpoint at model_path.
drive_path = "/content/drive/MyDrive/take3"
embedding_dir = os.path.join(drive_path, "LSTM_embeddings")
label_dir = os.path.join(drive_path, "stock_data")
model_path = os.path.join(drive_path, "mlp_regressor_best_model.pth")

embedding_paths = {
    "train": os.path.join(embedding_dir, "lstm_train_embeddings.pkl"),
    "val": os.path.join(embedding_dir, "lstm_val_embeddings.pkl"),
    "test": os.path.join(embedding_dir, "lstm_test_embeddings.pkl"),
}

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========== DATASET ==========
class EmbeddingRegressionDataset(Dataset):
    """One sample per stock: its LSTM embedding plus a return target and an up/down label.

    The target and the label are read from the second-to-last row (index -2)
    of the stock's ``<label_dir>/<split>/<SYMBOL>_<split>.csv`` file.
    """

    def __init__(self, embeddings_dict, split):
        """Pair every embedding with its target.

        Args:
            embeddings_dict: dict mapping a stock key (optionally carrying the
                ``_<split>`` text, which is stripped to find the CSV) to an
                embedding vector.
            split: split name, used both as sub-folder and as file suffix.

        Stocks whose CSV cannot be read or lacks the required data are
        reported with a message and skipped.
        """
        self.samples = []
        for stock, emb in embeddings_dict.items():
            symbol = stock.replace(f"_{split}", "")
            csv_path = os.path.join(label_dir, split, f"{symbol}_{split}.csv")
            try:
                frame = pd.read_csv(csv_path)
                ret = frame["Return"].values[-2]
                label = frame["Label"].values[-2]
                features = torch.tensor(emb, dtype=torch.float32)
                self.samples.append((features, ret, label, stock))
            except Exception as e:
                print(f"Skipping {stock} due to missing data or error: {e}")

    def __len__(self):
        """Return the number of stocks that were loaded successfully."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(embedding, return_target, label, stock_key)``; the target is a float32 tensor."""
        features, ret, label, stock = self.samples[idx]
        target = torch.tensor(ret, dtype=torch.float32)
        return features, target, label, stock

# ========== MODEL ==========
class MLPRegressor(nn.Module):
    """Small feed-forward regressor: input_dim -> 64 -> 32 -> 1 with ReLU and dropout."""

    def __init__(self, input_dim=64):
        """Build the network.

        Args:
            input_dim: width of the input embedding.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Return one prediction per row of ``x``.

        Only the trailing size-1 output axis is removed, so a batch of one
        still yields a tensor of shape [1]. A bare ``squeeze()`` would also
        drop the batch axis and return a 0-d tensor, which breaks code that
        iterates over the predictions.
        """
        return self.net(x).squeeze(-1)

# ========== LOAD EMBEDDINGS ==========
def load_embeddings(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use this on files this
    notebook produced itself.
    """
    with open(path, "rb") as handle:
        embeddings = pickle.load(handle)
    return embeddings

# ========== TRAIN MODEL ==========
def train_regressor(train_loader, val_loader):
    """Train an ``MLPRegressor`` for 50 epochs and keep the best-validation weights.

    Args:
        train_loader: yields ``(x, y, label, stock)`` batches for training.
        val_loader: yields batches of the same form for validation.

    Returns:
        The model with the weights of the epoch that reached the lowest mean
        validation loss. The same weights are saved to ``model_path``.
    """
    model = MLPRegressor().to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_model = None
    best_val_loss = float("inf")

    for epoch in range(50):
        model.train()
        total_loss = 0
        for x, y, _, _ in train_loader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            loss = criterion(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y, _, _ in val_loader:
                x, y = x.to(device), y.to(device)
                pred = model(x)
                loss = criterion(pred, y)
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.6f}, Val Loss = {avg_val_loss:.6f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() returns references to the live parameters, so the
            # tensors must be copied. Without the copy the "best" snapshot
            # keeps changing and ends up equal to the last epoch's weights.
            best_model = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    if best_model is None:
        # No epoch produced a finite, improving validation loss (e.g. NaN
        # losses); fall back to the final weights instead of saving None.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    # Save best model
    torch.save(best_model, model_path)
    print(f"✅ Best model saved at: {model_path}")
    model.load_state_dict(best_model)
    return model

# ========== EVALUATE TOP-K ==========
def evaluate_topk(model, test_loader, k_values=(5, 10, 15, 20)):
    """Rank stocks by predicted return and report top-k metrics.

    Args:
        model: regressor called as ``model(x)`` on each batch.
        test_loader: yields ``(x, y, label, stock)`` batches, where ``y`` is
            the true return, ``label`` the up/down label and ``stock`` the
            stock keys.
        k_values: cut-offs to evaluate.

    Returns:
        dict mapping each ``k`` to a dict with these keys:
        - ``accuracy``: share of the top-k predicted stocks whose label is 1.
        - ``precision``: overlap between predicted and true top-k, divided by k.
        - ``mae``: mean absolute error between true and predicted returns of
          the predicted top-k.
        - ``irr``: sum of true returns minus sum of predicted returns of the
          predicted top-k.
        - ``mrr``: mean reciprocal predicted rank of the true top-k stocks.
        The tables and the summary are printed as well.
    """
    model.eval()
    all_preds, all_returns, all_labels, all_stocks = [], [], [], []

    with torch.no_grad():
        for x, y, label, stock in test_loader:
            x = x.to(device)
            # atleast_1d: a batch of one may come back as a 0-d array, which
            # list.extend() cannot iterate over.
            all_preds.extend(np.atleast_1d(model(x).cpu().numpy()))
            all_returns.extend(np.atleast_1d(y.numpy()))
            # Store plain Python numbers rather than 0-d tensors.
            all_labels.extend(np.asarray(label).reshape(-1).tolist())
            all_stocks.extend(stock)

    all_preds = np.array(all_preds)
    all_returns = np.array(all_returns)
    true_labels = np.array(all_labels)

    # Indices sorted from highest to lowest value.
    pred_rank = np.argsort(-all_preds)
    return_rank = np.argsort(-all_returns)

    # 1-based rank of every stock under each ordering.
    stock_to_pred_rank = {all_stocks[i]: rank + 1 for rank, i in enumerate(pred_rank)}
    stock_to_return_rank = {all_stocks[i]: rank + 1 for rank, i in enumerate(return_rank)}

    print("\n📊 Top-K Evaluation:\n")
    results = {}

    for k in k_values:
        topk_pred_idx = pred_rank[:k]
        topk_true_idx = return_rank[:k]

        topk_pred_stocks = [all_stocks[i] for i in topk_pred_idx]
        topk_true_stocks = [all_stocks[i] for i in topk_true_idx]

        acc = np.mean(true_labels[topk_pred_idx] == 1)
        irr = np.sum(all_returns[topk_pred_idx]) - np.sum(all_preds[topk_pred_idx])
        mae = np.mean(np.abs(all_returns[topk_pred_idx] - all_preds[topk_pred_idx]))
        mrr = np.sum([1 / stock_to_pred_rank[stock] for stock in topk_true_stocks]) / k
        precision = len(set(topk_pred_stocks) & set(topk_true_stocks)) / k

        # Store metrics
        results[k] = {
            "accuracy": acc,
            "precision": precision,
            "mae": mae,
            "irr": irr,
            "mrr": mrr
        }

        # Print top-k stocks table first
        print(f"🔝 Top-{k} Stocks:")
        print("Rank\tStock\t\tLabel\tReturn\tPredicted\tTrue Return Rank")
        for rank, idx in enumerate(topk_pred_idx):
            print(f"{rank+1}\t{all_stocks[idx]}\t{true_labels[idx]}\t{all_returns[idx]:.4f}\t{all_preds[idx]:.4f}\t\t\t{stock_to_return_rank[all_stocks[idx]]}")
        print("-" * 80)

    # Print metrics summary
    print("\n📈 Summary Metrics:")
    for k in k_values:
        m = results[k]
        print(f"Top-{k}: Accuracy={m['accuracy']:.4f}, Precision={m['precision']:.4f}, "
              f"MRR={m['mrr']:.4f}, IRR={m['irr']:.4f}, MAE={m['mae']:.4f}")

    # Previously the metrics were computed and then discarded.
    return results

# ========== MAIN ==========
if __name__ == "__main__":
    # Embeddings produced by the LSTM cell, one dict per split.
    train_emb = load_embeddings(embedding_paths["train"])
    val_emb = load_embeddings(embedding_paths["val"])
    test_emb = load_embeddings(embedding_paths["test"])

    train_set = EmbeddingRegressionDataset(train_emb, "train")
    val_set = EmbeddingRegressionDataset(val_emb, "val")
    test_set = EmbeddingRegressionDataset(test_emb, "test")

    train_loader = DataLoader(train_set, batch_size=64, shuffle=False)
    val_loader = DataLoader(val_set, batch_size=64, shuffle=False)
    test_loader = DataLoader(test_set, batch_size=64, shuffle=False)

    # Reuse a previously trained regressor when its checkpoint exists.
    if os.path.exists(model_path):
        print("📦 Loading saved MLP Regressor...")
        model = MLPRegressor().to(device)
        # map_location lets a checkpoint saved on a GPU runtime load on a
        # CPU-only runtime (and the other way round).
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
    else:
        print("🚀 Training MLP Regressor...")
        model = train_regressor(train_loader, val_loader)

    print("🧪 Evaluating on Test Set...")
    evaluate_topk(model, test_loader)

In [None]:
def export_all_predictions(model, test_loader, save_path):
    """Write every stock's prediction to a CSV sorted by predicted return.

    Args:
        model: regressor called as ``model(x)`` on each batch.
        test_loader: yields ``(x, y, label, stock)`` batches.
        save_path: destination CSV with columns ``Stock``, ``TrueReturn``,
            ``PredictedReturn`` and ``Label``, highest prediction first.
    """
    model.eval()
    all_preds, all_returns, all_labels, all_stocks = [], [], [], []

    with torch.no_grad():
        for x, y, label, stock in test_loader:
            x = x.to(device)
            # atleast_1d: a batch of one may come back as a 0-d array, which
            # list.extend() cannot iterate over.
            all_preds.extend(np.atleast_1d(model(x).cpu().numpy()))
            all_returns.extend(np.atleast_1d(y.numpy()))
            # Plain Python numbers: extending with a tensor would store 0-d
            # tensor objects, which end up in the CSV as "tensor(1)".
            all_labels.extend(np.asarray(label).reshape(-1).tolist())
            all_stocks.extend(stock)

    df = pd.DataFrame({
        "Stock": all_stocks,
        "TrueReturn": all_returns,
        "PredictedReturn": all_preds,
        "Label": all_labels
    })

    # Sort by predicted return (descending)
    df_sorted = df.sort_values(by="PredictedReturn", ascending=False).reset_index(drop=True)

    # Save to CSV
    df_sorted.to_csv(save_path, index=False)
    print(f"✅ Full prediction ranking saved to: {save_path}")

# Run it: relies on `model` and `test_loader` created by the MLP cell above.
csv_output_path = "/content/drive/MyDrive/take3/mlp_all_predictions.csv"
export_all_predictions(model, test_loader, csv_output_path)

In [None]:
import os
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

# Define parameters
SEQ_LENGTH = 6    # length of each subsequence clustered into motifs
WINDOW_SIZE = 30  # number of rows in each sliding window
STEP_SIZE = 6     # rows between consecutive window starts
N_CLUSTERS = 5    # maximum number of motifs (clusters) per window

# Paths: per-split input CSV folders and output folders for motifs and ModIS files.
drive_folder = "/content/drive/MyDrive/take3"
data_folders = {
    "train": f"{drive_folder}/stock_data/train",
    "test": f"{drive_folder}/stock_data/test",
    "val": f"{drive_folder}/stock_data/val"
}
motifs_folders = {
    "train": f"{drive_folder}/motifs/train",
    "test": f"{drive_folder}/motifs/test",
    "val": f"{drive_folder}/motifs/val"
}
modis_folders = {
    "train": f"{drive_folder}/Modis/train",
    "test": f"{drive_folder}/Modis/test",
    "val": f"{drive_folder}/Modis/val"
}

# Ensure the output folders exist
for folder in list(motifs_folders.values()) + list(modis_folders.values()):
    os.makedirs(folder, exist_ok=True)


# Load CSV stock data into dictionary
def load_stock_data(folder_path):
    """Load every ``.csv`` file in ``folder_path`` into a DataFrame, with a progress bar.

    Files that fail to load are reported and skipped.

    Returns:
        dict mapping each file name with ``.csv`` removed to its DataFrame.
    """
    csv_files = list(filter(lambda name: name.endswith('.csv'), os.listdir(folder_path)))
    stock_data = {}
    for file_name in tqdm(csv_files, desc="📥 Loading stock data"):
        csv_path = os.path.join(folder_path, file_name)
        try:
            stock_data[file_name.replace('.csv', '')] = pd.read_csv(csv_path)
        except Exception as e:
            print(f"❌ Failed to load {file_name}: {e}")
    return stock_data


# Compute 1NN using DTW (consistent with medoid selection)
def compute_1nn_distances(motifs_A, motifs_B):
    """For each motif in ``motifs_A`` return its smallest FastDTW distance to any motif in ``motifs_B``.

    Returns:
        1-D NumPy array with one nearest-neighbour distance per motif in
        ``motifs_A``, in the same order.
    """
    nearest = []
    for motif_A in motifs_A:
        dtw_costs = [fastdtw(motif_A, motif_B)[0] for motif_B in motifs_B]
        nearest.append(min(dtw_costs))
    return np.array(nearest)


# Compute frequency-weighted ModIS similarity
def calculate_modis_distance(motifs_i, freqs_i, motifs_j, freqs_j):
    """Frequency-weighted motif similarity between two stocks.

    Nearest-neighbour DTW distances are computed in both directions,
    averaged with the motif frequencies as weights, and mapped through
    ``exp(-x)``. Despite the function name, larger values therefore mean
    more similar motif sets; identical sets give 1.0.

    Args:
        motifs_i, motifs_j: motif arrays of the two stocks.
        freqs_i, freqs_j: frequency of each motif, aligned with the motif arrays.
    """
    nn_costs = np.concatenate((
        compute_1nn_distances(motifs_i, motifs_j),
        compute_1nn_distances(motifs_j, motifs_i),
    ))
    weights = np.concatenate((freqs_i, freqs_j))

    weighted_mean_cost = np.sum(nn_costs * weights) / np.sum(weights)
    return np.exp(-weighted_mean_cost)


# Extract motifs from sliding windows (only if not already present)
def extract_motifs_dynamic(stock_data, window_size, seq_length, step_size, n_clusters, save_folder, split):
    """Extract motifs for every sliding window of every stock and pickle them.

    Each window of ``window_size`` normalized closes is cut into overlapping
    subsequences of ``seq_length``. The subsequences are clustered with
    KMeans, and the medoid of each cluster (the member with the smallest
    summed FastDTW distance to the other members) is kept as a motif,
    together with the cluster size as its frequency.

    Args:
        stock_data: dict mapping a ticker key to a DataFrame with ``nor_Close``.
        window_size: rows per window.
        seq_length: rows per subsequence.
        step_size: rows between consecutive window starts.
        n_clusters: maximum number of clusters per window.
        save_folder: folder receiving ``<ticker>_start<start>_motifs.pkl`` files.
        split: split name, used only in the progress-bar label.

    Windows whose motif file already exists are skipped, so the function can
    be re-run to resume an interrupted extraction.
    """
    for ticker, data in tqdm(stock_data.items(), desc=f"🔍 Checking motifs for {split}"):
        prices = data['nor_Close'].values

        for start in range(0, len(prices) - window_size + 1, step_size):
            motif_file = os.path.join(save_folder, f"{ticker}_start{start}_motifs.pkl")
            if os.path.exists(motif_file):
                continue  # Skip if already exists

            window_data = prices[start:start + window_size]
            subsequences = [window_data[i:i + seq_length] for i in range(len(window_data) - seq_length + 1)]
            subsequences = np.array(subsequences)

            if len(subsequences) < 2:
                continue

            local_clusters = min(n_clusters, len(subsequences))
            kmeans = KMeans(n_clusters=local_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(subsequences)

            # np.unique returns the labels in sorted order with counts aligned
            # by position, so they are paired with zip. Indexing counts by the
            # label value is wrong whenever the label ids are not contiguous.
            unique_labels, counts = np.unique(labels, return_counts=True)
            stock_motifs = []
            motif_frequencies_ordered = []

            for label, count in zip(unique_labels, counts):
                cluster_seqs = subsequences[labels == label]
                medoid = min(cluster_seqs, key=lambda x: np.sum([fastdtw(x, y)[0] for y in cluster_seqs]))
                stock_motifs.append(medoid)
                motif_frequencies_ordered.append(count)

            motifs_dict = {'motifs': np.array(stock_motifs), 'frequencies': np.array(motif_frequencies_ordered)}

            with open(motif_file, 'wb') as f:
                pickle.dump(motifs_dict, f)


# Compute ModIS per window (only missing files)
def compute_modis_for_windows(motifs_folder, modis_folder, split):
    """Compute pairwise ModIS values between all stocks for each window start.

    For every start in ``range(0, 400 - WINDOW_SIZE + 1, STEP_SIZE)`` the
    motif files of that window are loaded, ``calculate_modis_distance`` is
    evaluated for every pair of stocks, and the symmetric result is pickled
    to ``modis_start<start>_<split>.pkl``. Starts whose output file already
    exists, or that have motifs for fewer than two stocks, are skipped.

    Args:
        motifs_folder: folder containing ``*_motifs.pkl`` files.
        modis_folder: folder receiving the per-window ModIS pickles.
        split: split name used in file names and messages.
    """
    motif_files = [name for name in os.listdir(motifs_folder) if name.endswith('_motifs.pkl')]

    print(f"📌 Computing ModIS distances for {split}...")

    for start in tqdm(range(0, 400 - WINDOW_SIZE + 1, STEP_SIZE), desc=f"🧮 ModIS for {split}"):
        modis_file = os.path.join(modis_folder, f"modis_start{start}_{split}.pkl")
        if os.path.exists(modis_file):
            print("Skiiped")
            continue  # Skip if file exists

        # Gather the motifs of every stock for this window start.
        marker = f"_start{start}_"
        suffix = f"_start{start}_motifs.pkl"
        stock_motifs = {}
        for file in motif_files:
            if marker not in file:
                continue
            with open(os.path.join(motifs_folder, file), 'rb') as f:
                stock_motifs[file.replace(suffix, '')] = pickle.load(f)

        if len(stock_motifs) < 2:
            continue

        stock_symbols = list(stock_motifs)
        modis_distances = {}

        # Each unordered pair is evaluated once and stored under both key orders.
        for i, s_i in enumerate(stock_symbols):
            entry_i = stock_motifs[s_i]
            for s_j in stock_symbols[i + 1:]:
                entry_j = stock_motifs[s_j]
                distance = calculate_modis_distance(
                    entry_i['motifs'], entry_i['frequencies'],
                    entry_j['motifs'], entry_j['frequencies'],
                )
                modis_distances[(s_i, s_j)] = distance
                modis_distances[(s_j, s_i)] = distance

        with open(modis_file, 'wb') as f:
            pickle.dump(modis_distances, f)

        print(f"✅ Saved ModIS for {split} start {start}: {len(stock_motifs)} stocks")
# 💻 MAIN EXECUTION
# For each split: load the CSVs, extract any missing motif files, then compute
# any missing ModIS files. Existing outputs are skipped, so the cell can be
# re-run to resume interrupted work.
for split in ['train','test', 'val']:
    print(f"\n🚀 Processing {split.upper()} dataset...")

    stock_data = load_stock_data(data_folders[split])
    print(f"✅ Loaded {len(stock_data)} stocks for {split}.")

    extract_motifs_dynamic(
        stock_data,
        window_size=WINDOW_SIZE,
        seq_length=SEQ_LENGTH,
        step_size=STEP_SIZE,
        n_clusters=N_CLUSTERS,
        save_folder=motifs_folders[split],
        split=split
    )
    print(f"✅ Motif extraction check/complete for {split}.")

    compute_modis_for_windows(motifs_folders[split], modis_folders[split], split)
    print(f"✅ ModIS computation check/complete for {split}.")

print("\n🎉✅ All processing completed successfully.")

In [None]:
import os
import pickle

def check_and_clean_modis_folder(modis_folder, split_name):
    """Delete ModIS pickles that contain no data and print a summary.

    Args:
        modis_folder: folder to scan.
        split_name: only files ending in ``_<split_name>.pkl`` are checked.

    A file counts as empty when the unpickled object is falsy (for example
    an empty dict).
    """
    total = 0
    empty_files = []

    for file in sorted(os.listdir(modis_folder)):
        if not file.endswith(f"_{split_name}.pkl"):
            continue
        total += 1
        file_path = os.path.join(modis_folder, file)
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        # Remove only after the handle is closed: deleting a file that is
        # still open fails on some platforms and file systems.
        if not data:
            os.remove(file_path)  # Delete empty file
            empty_files.append(file)
            print(f"🗑️ Deleted empty file: {file}")

    empty = len(empty_files)
    print(f"\n🔍 {split_name.upper()} ModIS Summary")
    print(f"   • Total checked: {total}")
    print(f"   • Empty + deleted: {empty}")
    if empty:
        print("   • Names of deleted files:")
        for f in empty_files:
            print(f"     - {f}")

# Example usage: clean each split's ModIS folder. Note that this deletes files on Drive.
check_and_clean_modis_folder("/content/drive/MyDrive/take3/Modis/train", "train")
check_and_clean_modis_folder("/content/drive/MyDrive/take3/Modis/test", "test")
check_and_clean_modis_folder("/content/drive/MyDrive/take3/Modis/val", "val")

In [None]:
import pandas as pd

# Excel sheet listing fund holdings; it is read without a header row.
agg_path = "/content/drive/MyDrive/take3/aggregated sheet.xlsx"

# Manually assign column names (the sheet must have exactly three columns)
agg_df = pd.read_excel(agg_path, header=None)
agg_df.columns = ["Fund Name", "Company Name", "Sector"]

print("✅ Columns renamed:")
print(agg_df.head())

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from tqdm import tqdm
import os
import time
from google.colab import drive

# --- Mount Google Drive ---
drive.mount('/content/drive')

# --- File paths ---
TICKER_FILE = "/content/drive/MyDrive/take3/ind_nifty500list_filtered_final.csv"
AGG_FILE = "/content/drive/MyDrive/take3/aggregated sheet.xlsx"
OUTPUT_FILE = "/content/drive/MyDrive/take3/prior_relationship_matrix.npy"


def normalize_company_names(names):
    """Normalise a Series of company names so that different sources can be matched.

    Lower-cases, maps the words "limited"/"ltd" to "ltd", removes every
    character that is neither a word character nor whitespace, and strips
    surrounding whitespace.
    """
    return (
        names
        .str.lower()
        .str.replace(r'\blimited\b|\bltd\b', 'ltd', regex=True)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.strip()
    )


# --- Load ticker and aggregated data ---
tickers_df = pd.read_csv(TICKER_FILE)
tickers = tickers_df["Symbol"].dropna().unique().tolist()

agg_df = pd.read_excel(AGG_FILE, header=None)
agg_df.columns = ["Fund Name", "Company Name", "Sector"]

# --- Normalize company names for matching ---
agg_df['Company Name'] = normalize_company_names(agg_df['Company Name'])

# --- Fetch sector/industry from yfinance with delay and retries ---
stock_info = {}
for ticker in tqdm(tickers, desc="Fetching yfinance info"):
    for attempt in range(3):  # retry up to 3 times
        try:
            yf_ticker = ticker + ".NS"
            data = yf.Ticker(yf_ticker).info
            # `or ""` also covers keys that are present but None, which would
            # otherwise raise on .lower() and burn all three attempts.
            stock_info[ticker] = {
                "name": data.get("longName") or "",
                "sector": data.get("sector") or "",
                "industry": data.get("industry") or "",
            }
            time.sleep(0.5)  # delay to avoid rate limits
            break
        except Exception as e:
            if attempt == 2:
                print(f"⚠️ Failed for {ticker}: {e}")
            time.sleep(2)  # wait longer before retry

# Apply the same normalisation used for the fund sheet. Previously only "."
# and "," were stripped here, so names containing other punctuation
# (e.g. "&", "-", "(") could never match the sheet.
if stock_info:
    raw_names = pd.Series(
        {ticker: info["name"] for ticker, info in stock_info.items()},
        dtype="object",
    )
    for ticker, clean_name in normalize_company_names(raw_names).items():
        stock_info[ticker]["name"] = clean_name

# --- Index the fund sheet once: company name -> set of funds holding it ---
funds_by_company = {}
for company, fund in zip(agg_df['Company Name'], agg_df['Fund Name']):
    if pd.notna(company) and pd.notna(fund):
        funds_by_company.setdefault(company, set()).add(fund)

# Per-ticker lookups hoisted out of the O(N^2) pair loop below.
infos = [stock_info.get(ticker) for ticker in tickers]
funds_per_ticker = [
    funds_by_company.get(info["name"], set()) if info and info["name"] else set()
    for info in infos
]

# --- Build prior relationship matrix ---
N = len(tickers)
G = np.zeros((N, N))

for i in tqdm(range(N), desc="Building matrix"):
    info_i = infos[i]
    if not info_i:
        continue
    for j in range(i + 1, N):
        info_j = infos[j]
        if not info_j:
            continue

        # Mutual fund co-holdings (m)
        m = len(funds_per_ticker[i] & funds_per_ticker[j])

        # Sector/industry overlap (n). Two missing (empty) values must not
        # count as a match.
        same_sector = bool(info_i["sector"]) and info_i["sector"] == info_j["sector"]
        same_industry = bool(info_i["industry"]) and info_i["industry"] == info_j["industry"]
        n = int(same_sector or same_industry)

        # Total prior relationship g = m + n (symmetric, zero diagonal)
        G[i, j] = G[j, i] = m + n

# --- Save matrix to Drive ---
# Rows and columns follow the order of `tickers` (ticker-file order).
np.save(OUTPUT_FILE, G)
print(f"✅ Prior relationship matrix saved to: {OUTPUT_FILE}")

In [None]:
import numpy as np
import pandas as pd

# Paths
TICKER_FILE = "/content/drive/MyDrive/take3/ind_nifty500list_filtered_final.csv"
MATRIX_FILE = "/content/drive/MyDrive/take3/prior_relationship_matrix.npy"
CSV_OUTPUT_FILE = "/content/drive/MyDrive/take3/prior_relationship_matrix.csv"

# Load tickers and matrix. The ticker list is rebuilt exactly as in the cell
# that created the matrix so that the row/column labels line up.
tickers = pd.read_csv(TICKER_FILE)["Symbol"].dropna().unique().tolist()
matrix = np.load(MATRIX_FILE)

# Convert to DataFrame (raises if the matrix size differs from the ticker count)
df = pd.DataFrame(matrix, index=tickers, columns=tickers)

# Save as CSV
df.to_csv(CSV_OUTPUT_FILE)

print(f"✅ Matrix saved as CSV to: {CSV_OUTPUT_FILE}")


In [None]:
import os
import torch
import pickle
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

# ----- CONFIG -----
MODIS_DIR = "/content/drive/MyDrive/take3/Modis"
SAVE_DIR = "/content/drive/MyDrive/take3/DGLSTM_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

SPLITS = ['train', 'val', 'test']
# Largest window start considered per split; STARTS below steps from 0 up to
# this value in increments of STEP_SIZE (e.g. 0, 6, ..., 60 for train).
# NOTE(review): these limits are much smaller than the range of starts the
# ModIS cell produces — confirm the truncation is intended.
STEPS_PER_SPLIT = {'train': 62, 'val': 22, 'test': 26}
STEP_SIZE = 6
STARTS = {
    split: list(range(0, STEPS_PER_SPLIT[split] + 1, STEP_SIZE))
    for split in SPLITS
}
HIDDEN_DIM = 256
EPOCHS = 30
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----- LOAD STOCK ORDER -----
# Row/column index of each stock in the adjacency matrices: alphabetical order
# of the valid tickers, with "WIPRO" forced into the list.
valid_tickers = list(pd.read_csv("/content/drive/MyDrive/take3/valid_tickers.csv")['Symbol'])
if "WIPRO" not in valid_tickers:
    valid_tickers.append("WIPRO")
valid_tickers = sorted(valid_tickers)
stock_index = {ticker: i for i, ticker in enumerate(valid_tickers)}
num_stocks = len(valid_tickers)

# ----- LOAD PRIOR -----
# NOTE(review): the cell that builds prior_relationship_matrix.npy orders its
# rows/columns by the ticker file, whereas stock_index above is sorted —
# confirm that both the ordering and the size (num_stocks) agree.
prior_matrix = np.load("/content/drive/MyDrive/take3/prior_relationship_matrix.npy")
prior_tensor = torch.tensor(prior_matrix, dtype=torch.float32, device=DEVICE)

# ----- Load ModIS Sequences -----
def load_modis_sequence(split):
    """Load the pickled ModIS edge dictionaries of ``split`` as dense adjacency tensors.

    One file is read per start offset in ``STARTS[split]``. Each pickle is
    iterated as a mapping ``(s1, s2) -> val``; the part of each name before the
    first underscore is looked up in ``stock_index`` and pairs with an unknown
    name are ignored. Only the ``[s1, s2]`` cell is written, so the matrix is
    not symmetrised here.

    Args:
        split: Split name used both as sub-directory and file-name suffix.

    Returns:
        A list of float32 tensors of shape ``(num_stocks, num_stocks)``, one per
        start offset, in the order given by ``STARTS[split]``.
    """
    print(f"📂 Loading ModIS sequence for '{split}'...")
    split_dir = os.path.join(MODIS_DIR, split)
    snapshots = []

    for start in tqdm(STARTS[split], desc=f"⏳ {split.upper()}"):
        pkl_path = os.path.join(split_dir, f"modis_start{start}_{split}.pkl")
        # NOTE: pickle.load executes arbitrary code; only read files you produced.
        with open(pkl_path, 'rb') as handle:
            edges = pickle.load(handle)

        dense = np.zeros((num_stocks, num_stocks), dtype=np.float32)
        for (src, dst), weight in edges.items():
            row = stock_index.get(src.split("_")[0])
            col = stock_index.get(dst.split("_")[0])
            if row is None or col is None:
                continue  # at least one endpoint is outside the ticker universe
            dense[row, col] = weight

        snapshots.append(torch.tensor(dense, dtype=torch.float32))

    return snapshots

# ----- Model -----
class DGLSTM(nn.Module):
    """LSTM over a sequence of graphs that are blended with a prior graph.

    Each snapshot ``graphs[t]`` is fused with ``prior`` as
    ``graphs[t] * (1 - decay_t) + decay_t * prior`` with
    ``decay_t = exp(-t / T)``, so the prior weight is 1 at ``t = 0`` and shrinks
    for later snapshots. The fused sequence is fed to an LSTM with the node axis
    as batch axis, and the last LSTM output is projected back to ``input_dim``
    and passed through ReLU.

    Args:
        input_dim: Width of one adjacency row (number of nodes).
        hidden_dim: LSTM hidden size.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.proj = nn.Linear(hidden_dim, input_dim)

    def forward(self, graphs, prior):
        """Return the fused dynamic graph.

        Args:
            graphs: Tensor of shape ``[T, N, N]`` (sequence of adjacency matrices).
            prior: Tensor broadcastable against one ``[N, N]`` snapshot.

        Returns:
            Non-negative tensor of shape ``[N, input_dim]``.

        Raises:
            ValueError: If ``graphs`` is not 3-dimensional or has no snapshots.
        """
        if graphs.dim() != 3:
            raise ValueError(f"graphs must have shape [T, N, N], got {tuple(graphs.shape)}")
        T = graphs.shape[0]
        if T == 0:
            raise ValueError("graphs must contain at least one snapshot")

        # Build the decay on the *input's* device instead of a notebook-level
        # global, so the module works wherever its inputs live.
        decay = torch.exp(
            -torch.tensor([t / T for t in range(T)], dtype=torch.float32, device=graphs.device)
        )
        if graphs.is_floating_point():
            decay = decay.to(graphs.dtype)
        decay = decay.view(T, 1, 1)  # broadcast over the two node axes

        fused_graphs = graphs * (1 - decay) + decay * prior  # [T, N, N]
        lstm_input = fused_graphs.permute(1, 0, 2)  # [N, T, N]
        output, _ = self.lstm(lstm_input)
        final_hidden = output[:, -1, :]  # LSTM output at the last time step
        dynamic_graph = self.proj(final_hidden)
        return F.relu(dynamic_graph)

# ----- Training -----
def train_dglstm(train_seq, val_seq, prior_tensor, num_epochs=30, lr=1e-3):
    """Fit a DGLSTM with full-sequence gradient steps and checkpoint the best epoch.

    Every epoch performs one Adam step on the MSE between the model output for
    the whole training sequence and the last training snapshot, then measures
    the same loss on the validation sequence. The weights are written to
    ``SAVE_DIR/best_dglstm_model.pt`` whenever the validation loss improves.

    Args:
        train_seq: List of ``[N, N]`` tensors used for training.
        val_seq: List of ``[N, N]`` tensors used for validation.
        prior_tensor: Prior relationship tensor passed to the model.
        num_epochs: Number of optimisation steps.
        lr: Adam learning rate.

    Returns:
        Path of the checkpoint file.
    """
    checkpoint_path = os.path.join(SAVE_DIR, "best_dglstm_model.pt")

    net = DGLSTM(input_dim=num_stocks, hidden_dim=HIDDEN_DIM).to(DEVICE)
    adam = torch.optim.Adam(net.parameters(), lr=lr)

    train_graphs = torch.stack(train_seq).to(DEVICE)  # [T_train, N, N]
    val_graphs = torch.stack(val_seq).to(DEVICE)      # [T_val, N, N]
    # The regression target of each sequence is its own final snapshot.
    train_target = train_graphs[-1]
    val_target = val_graphs[-1]

    lowest_val_loss = float('inf')

    print("🧠 Training DGLSTM model...")
    for epoch in tqdm(range(num_epochs), desc="📚 Epochs"):
        # --- one optimisation step on the training sequence ---
        net.train()
        adam.zero_grad()
        train_pred = net(train_graphs, prior_tensor)
        train_loss = F.mse_loss(train_pred, train_target)
        train_loss.backward()
        adam.step()

        # --- validation pass without gradients ---
        net.eval()
        with torch.no_grad():
            val_pred = net(val_graphs, prior_tensor)
            val_loss = F.mse_loss(val_pred, val_target)

        current_val = val_loss.item()
        if current_val < lowest_val_loss:
            lowest_val_loss = current_val
            torch.save(net.state_dict(), checkpoint_path)

        if epoch % 5 != 0:
            continue
        print(f"📝 Epoch {epoch}: Train Loss = {train_loss.item():.4f} | Val Loss = {val_loss.item():.4f}")
        print(f"Train Target stats: min={train_target.min():.4f}, max={train_target.max():.4f}, mean={train_target.mean():.4f}")
        print(f"Predicted Graph stats: min={train_pred.min():.4f}, max={train_pred.max():.4f}, mean={train_pred.mean():.4f}")

    return checkpoint_path

# ----- Pipeline -----
dglstm_graphs = {}

# Load every split once; the sequences are reused for training and for the
# final graph generation instead of re-reading the pickles from Drive.
modis_sequences = {split: load_modis_sequence(split) for split in SPLITS}
train_seq = modis_sequences['train']
val_seq = modis_sequences['val']

# Train on the training sequence only. Training on train + val would make the
# training target (the last snapshot of the sequence) the very same graph as
# the validation target, so the "best" checkpoint could not be selected on
# held-out data.
best_model_path = train_dglstm(train_seq, val_seq, prior_tensor, num_epochs=EPOCHS)

# Rebuild the model and restore the best checkpoint; map_location keeps the
# load working when the checkpoint was written on a different device.
best_model = DGLSTM(input_dim=num_stocks, hidden_dim=HIDDEN_DIM).to(DEVICE)
best_model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
best_model.eval()

# Generate and persist one dynamic graph per split
for split in SPLITS:
    print(f"📈 Generating graph for {split}...")
    graph_tensor = torch.stack(modis_sequences[split]).to(DEVICE)
    with torch.no_grad():
        graph = best_model(graph_tensor, prior_tensor)
    dglstm_graphs[split] = graph.cpu().numpy()
    np.save(os.path.join(SAVE_DIR, f"dglstm_graph_{split}.npy"), dglstm_graphs[split])

print("✅ All dynamic graphs saved.")


In [None]:
import os
import torch
import torch.nn as nn
import pickle
import numpy as np
import pandas as pd
from torch_geometric.nn import GATConv
from tqdm import tqdm
from sklearn.metrics import precision_score

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Paths (all rooted at the project folder on the mounted Drive)
base_path = "/content/drive/MyDrive/take3"
# `{}` is filled with the split name.
embedding_path = os.path.join(base_path, "LSTM_embeddings/lstm_{}_embeddings.pkl")
graph_path = os.path.join(base_path, "DGLSTM_outputs/dglstm_graph_{}.npy")
# `{split}` and `{company}` are filled per stock when the labels are loaded.
label_path = base_path + "/stock_data/{split}/{company}_{split}.csv"

def load_embeddings(split):
    """Return the unpickled LSTM embedding object stored for ``split``.

    The file location is the module-level ``embedding_path`` template with the
    split name filled in.
    """
    pkl_file = embedding_path.format(split)
    # NOTE: pickle.load executes arbitrary code; only read files you produced.
    with open(pkl_file, "rb") as handle:
        embeddings = pickle.load(handle)
    return embeddings

def load_graph(split, stock_list):
    """Load the saved DGLSTM graph of ``split`` as a sparse edge list.

    Only the leading ``len(stock_list) x len(stock_list)`` part of the saved
    matrix is used, and every non-zero cell ``[i, j]`` becomes the directed edge
    ``i -> j``. Edges are emitted in row-major order.

    NOTE(review): this assumes row/column ``i`` of the saved matrix corresponds
    to ``stock_list[i]`` -- confirm that the DGLSTM ticker order matches the
    order of the embedding keys.

    Args:
        split: Split name used to fill the ``graph_path`` template.
        stock_list: Ordered stock identifiers; only its length is used here.

    Returns:
        ``(edge_index, edge_weight)`` where ``edge_index`` is a long tensor of
        shape ``[2, E]`` (also when ``E == 0``) and ``edge_weight`` is a float
        tensor of shape ``[E]``.

    Raises:
        ValueError: If the saved matrix is not 2-D or is smaller than
            ``len(stock_list)`` along either axis.
    """
    adj = np.load(graph_path.format(split))
    n = len(stock_list)
    if adj.ndim != 2 or adj.shape[0] < n or adj.shape[1] < n:
        raise ValueError(
            f"Graph for '{split}' has shape {adj.shape}, which cannot cover {n} stocks"
        )

    block = adj[:n, :n]
    # np.nonzero scans in row-major order, i.e. the same order as a nested
    # `for i ... for j ...` loop, without the Python-level O(N^2) iteration.
    rows, cols = np.nonzero(block)
    edge_index = torch.tensor(np.stack([rows, cols]), dtype=torch.long).contiguous()
    edge_weight = torch.tensor(block[rows, cols], dtype=torch.float)
    return edge_index, edge_weight

def load_labels(split, stock_list):
    """Read the label and return of every stock from its per-split CSV.

    For each stock the ``_{split}`` marker is stripped from the name, the CSV
    given by the ``label_path`` template is read, and the second-to-last row of
    the ``Label`` and ``Return`` columns is kept.

    Args:
        split: Split name.
        stock_list: Stock identifiers, in the order the outputs should follow.

    Returns:
        ``(labels, returns)``: a long tensor and a NumPy array, both aligned
        with ``stock_list``.

    Raises:
        ValueError: If a CSV lacks the ``Label`` or ``Return`` column.
    """
    split_marker = f"_{split}"
    required_columns = {"Label", "Return"}
    labels = []
    returns = []

    for stock in tqdm(stock_list, f"Preparing for {split}"):
        company = stock.replace(split_marker, "")
        csv_path = label_path.format(split=split, company=company)
        frame = pd.read_csv(csv_path)
        if not required_columns.issubset(frame.columns):
            raise ValueError(f"'Label' or 'Return' column not found in {csv_path}")
        # Second-to-last row of each column.
        labels.append(frame["Label"].values[-2])
        returns.append(frame["Return"].values[-2])

    label_tensor = torch.tensor(labels, dtype=torch.long)
    return label_tensor, np.array(returns)

class GATPredictor(nn.Module):
    """Two stacked GAT layers followed by a linear head producing one logit per node.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Per-head output size of each GAT layer.
        heads: Number of attention heads (outputs are concatenated).
        edge_dim: Size of the edge features fed to the attention. ``GATConv``
            only uses ``edge_attr`` when it is built with ``edge_dim``; without
            it the edge weights passed to ``forward`` are silently ignored.
            Pass ``None`` to get that weight-free behaviour back (e.g. to load
            a checkpoint trained without edge features).
    """

    def __init__(self, in_dim=64, hidden_dim=64, heads=4, edge_dim=1):
        super().__init__()
        self.gat1 = GATConv(in_dim, hidden_dim, heads=heads, dropout=0.4, edge_dim=edge_dim)
        self.gat2 = GATConv(hidden_dim * heads, hidden_dim, heads=heads, dropout=0.4, edge_dim=edge_dim)
        self.fc = nn.Linear(hidden_dim * heads, 1)

    def forward(self, x, edge_index, edge_weight):
        """Return ``(logits, enhanced)``.

        Args:
            x: Node feature matrix ``[N, in_dim]``.
            edge_index: Long tensor ``[2, E]``.
            edge_weight: Per-edge weights, ``[E]`` or ``[E, edge_dim]``, or ``None``.

        Returns:
            ``logits`` of shape ``[N]`` and the node embeddings after the second
            GAT layer, shape ``[N, hidden_dim * heads]``.
        """
        edge_attr = edge_weight
        if edge_attr is not None and edge_attr.dim() == 1:
            edge_attr = edge_attr.view(-1, 1)  # scalar weight -> 1-d edge feature

        x = self.gat1(x, edge_index, edge_attr=edge_attr)
        x = self.gat2(x, edge_index, edge_attr=edge_attr)
        enhanced = x.clone()
        # squeeze only the trailing logit axis so a single-node graph still
        # yields shape [1] instead of a 0-dim tensor
        out = self.fc(enhanced).squeeze(-1)
        return out, enhanced

def prepare_data(split):
    """Assemble node features, graph, labels and returns for ``split``.

    Stocks are ordered by sorting the embedding keys; each embedding is scaled
    to unit L2 norm (norms are floored at 1e-8 to avoid dividing by zero).

    Returns:
        ``(x, edge_index, edge_weight, y, returns, stock_list)`` where the
        tensors are on ``device`` and ``returns`` is a NumPy array.
    """
    embeddings = load_embeddings(split)
    stock_list = sorted(embeddings)

    # Row-wise L2 normalisation of the stacked embeddings
    raw_features = np.stack([embeddings[name] for name in stock_list])
    row_norms = np.linalg.norm(raw_features, axis=1, keepdims=True)
    unit_features = raw_features / np.clip(row_norms, a_min=1e-8, a_max=None)
    x = torch.tensor(unit_features, dtype=torch.float).to(device)

    edge_index, edge_weight = load_graph(split, stock_list)
    y, returns = load_labels(split, stock_list)

    edge_index = edge_index.to(device)
    edge_weight = edge_weight.to(device)
    y = y.to(device)
    return x, edge_index, edge_weight, y, returns, stock_list

def train_model(num_epochs=50, patience=10, min_epochs=20):
    """Train the GAT classifier, keep the best epoch, and score the test split.

    The model is trained full-batch with a positively-weighted BCE loss. After
    each epoch the validation loss is computed; the weights of the epoch with
    the lowest validation loss are snapshotted, saved to
    ``base_path/gat_best_model.pth`` and used for the final test pass. The
    enhanced test embeddings are also saved as ``enhanced_test_embeddings.npy``.

    Args:
        num_epochs: Maximum number of training epochs.
        patience: Number of consecutive non-improving epochs tolerated.
        min_epochs: Early stopping is only allowed once the 0-based epoch index
            has reached this value.

    Returns:
        ``(pred_scores, test_labels, test_returns, test_stock_list, enhanced)``,
        all as NumPy arrays except ``test_stock_list``.
    """
    train_x, train_ei, train_ew, train_y, _, _ = prepare_data("train")
    val_x, val_ei, val_ew, val_y, _, _ = prepare_data("val")
    test_x, test_ei, test_ew, test_y, test_returns, test_stock_list = prepare_data("test")

    # Pos weight for imbalance handling: negatives / positives. Guard against a
    # split without positives, which would otherwise yield inf/NaN losses.
    num_pos = int((train_y == 1).sum())
    num_neg = int((train_y == 0).sum())
    if num_pos == 0:
        print("⚠️ No positive labels in the training split; using pos_weight = 1.0")
    pos_weight = torch.tensor(
        [num_neg / num_pos if num_pos else 1.0], dtype=torch.float, device=device
    )

    # Size the input layer from the data instead of relying on the default.
    model = GATPredictor(in_dim=train_x.shape[-1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    best_model = None
    best_val_loss = float("inf")
    best_train_loss = float("nan")
    best_epoch = 0
    counter = 0

    print("Training GAT model...")
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        model.train()
        logits, _ = model(train_x, train_ei, train_ew)
        loss = criterion(logits, train_y.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_logits, _ = model(val_x, val_ei, val_ew)
            val_loss = criterion(val_logits, val_y.float())

        print(f"Epoch {epoch+1:02d}: Train Loss = {loss.item():.4f}, Val Loss = {val_loss.item():.4f}")

        if val_loss.item() < best_val_loss:
            best_val_loss = val_loss.item()
            # state_dict() returns references to the live parameters, which the
            # optimizer keeps updating in place; clone them so the snapshot
            # really is the best epoch and not the last one.
            best_model = {name: value.detach().clone() for name, value in model.state_dict().items()}
            best_train_loss = loss.item()
            best_epoch = epoch + 1
            counter = 0
            print(f"✅ New best model at epoch {best_epoch}: Train Loss = {best_train_loss:.4f}, Val Loss = {best_val_loss:.4f}")
        else:
            counter += 1
            if counter >= patience and epoch >= min_epochs:
                print("🔥 Early stopping triggered.")
                break

    if best_model is None:
        # Validation loss never improved (e.g. it was NaN throughout, or no
        # epoch ran): fall back to the current weights instead of saving None.
        print("⚠️ Validation loss never improved; keeping the final weights.")
        best_model = {name: value.detach().clone() for name, value in model.state_dict().items()}

    torch.save(best_model, os.path.join(base_path, "gat_best_model.pth"))
    print(f"✅ Best model saved at: {os.path.join(base_path, 'gat_best_model.pth')}")
    print(f"🏆 Selected Best Epoch = {best_epoch}, Train Loss = {best_train_loss:.4f}, Val Loss = {best_val_loss:.4f}")

    model.load_state_dict(best_model)
    model.eval()
    with torch.no_grad():
        test_logits, enhanced = model(test_x, test_ei, test_ew)
        # Sigmoid is applied for evaluation only.
        # NOTE(review): the division by 10 caps every score at 0.1, so the 0.5
        # label threshold used by `evaluate` can never be reached -- confirm
        # whether this scaling is intended before relying on its accuracy.
        pred_scores = torch.sigmoid(test_logits) / 10
        np.save(os.path.join(base_path, "enhanced_test_embeddings.npy"), enhanced.cpu().numpy())
        print("📦 Enhanced test embeddings saved.")

    return pred_scores.cpu().numpy(), test_y.cpu().numpy(), test_returns, test_stock_list, enhanced.cpu().numpy()

# ----- Evaluation -----
# Top-k ranking metrics for the scores returned by `train_model()`.

from sklearn.metrics import mean_absolute_error
def evaluate(pred_scores, true_labels, return_ratios, stock_list, k_values=(5, 10, 15, 20), threshold=0.5):
    """Print and return top-k metrics comparing predicted scores with actual returns.

    Stocks are ranked once by descending predicted score and once by descending
    actual return. For every ``k`` the following are computed over the ``k``
    highest-scored stocks (the "predicted top-k"):

    * ``accuracy``  - share whose thresholded score equals the true label.
    * ``precision`` - overlap between predicted top-k and top-k by return, divided by ``k``.
    * ``mrr``       - sum of ``1 / predicted rank`` over the top-k stocks by return, divided by ``k``.
    * ``irr``       - sum of actual returns minus sum of predicted scores.
    * ``mae``       - mean absolute difference between score and actual return.

    Args:
        pred_scores: 1-D array-like of predicted scores, aligned with ``stock_list``.
        true_labels: 1-D array-like of 0/1 labels.
        return_ratios: 1-D array-like of actual returns.
        stock_list: Stock names, one per position.
        k_values: Cut-offs to evaluate.
        threshold: Score at or above which a stock is predicted as label 1.

    Returns:
        Dict mapping each ``k`` to a dict with the five metrics above.
    """
    # Accept plain sequences as well as ndarrays.
    pred_scores = np.asarray(pred_scores)
    true_labels = np.asarray(true_labels)
    return_ratios = np.asarray(return_ratios)

    result = {}
    pred_rank = np.argsort(-pred_scores)  # high score → higher rank
    true_return_rank = np.argsort(-return_ratios)  # high return → higher rank

    # 1-based rank of every stock under each ordering
    stock_to_pred_rank = {stock_list[idx]: rank + 1 for rank, idx in enumerate(pred_rank)}
    stock_to_return_rank = {stock_list[idx]: rank + 1 for rank, idx in enumerate(true_return_rank)}

    print("\nTop-K Evaluation (based on predicted scores vs true returns):")
    for k in k_values:
        print(f"\nTop-{k} stocks:")
        # Top-k by predicted scores
        topk_pred_idx = pred_rank[:k]
        topk_pred_stocks = [stock_list[i] for i in topk_pred_idx]

        # Top-k by actual returns
        topk_true_idx = true_return_rank[:k]
        topk_true_stocks = [stock_list[i] for i in topk_true_idx]

        # Accuracy of the thresholded scores against the true labels
        topk_predicted_labels = (pred_scores[topk_pred_idx] >= threshold).astype(int)
        acc = np.mean(topk_predicted_labels == true_labels[topk_pred_idx])

        # Precision = how many predicted top-k are in true top-k return
        precision = len(set(topk_pred_stocks) & set(topk_true_stocks)) / k

        # MRR = average 1 / predicted rank for the true top-k return stocks
        mrr = np.sum([1 / stock_to_pred_rank[stock] for stock in topk_true_stocks]) / k

        # IRR = total actual return minus total predicted score of the predicted top-k
        irr = np.sum(return_ratios[topk_pred_idx]) - np.sum(pred_scores[topk_pred_idx])

        # MAE = mean absolute error between predicted scores and actual returns (for top-k predictions)
        mae = np.mean(np.abs(pred_scores[topk_pred_idx] - return_ratios[topk_pred_idx]))

        result[k] = {"accuracy": acc, "precision": precision, "mrr": mrr, "irr": irr, "mae": mae}

        print("Rank\tStock\t\tLabel\tScore\t\tReturn Rank\tActual Return")
        for rank, idx in enumerate(topk_pred_idx):
            stock = stock_list[idx]
            label = true_labels[idx]
            score = pred_scores[idx]
            return_rank = stock_to_return_rank[stock]
            ret_val = return_ratios[idx]
            print(f"{rank+1}\t{stock:<12}\t{label}\t{score:.4f}\t\t{return_rank}\t\t{ret_val:.4f}")
    print("Label 1 count:", (true_labels == 1).sum())
    print("Label 0 count:", (true_labels == 0).sum())
    return result
if __name__ == "__main__":
    # Train the GAT, then evaluate the test-split scores it returns.
    (pred_scores, true_labels, return_ratios,
     stock_list, final_embeddings) = train_model()
    results = evaluate(pred_scores, true_labels, return_ratios, stock_list)

    # One summary line per evaluated k
    print("\n📊 Final Metrics:")
    for k, metrics in results.items():
        summary = (f"Top-{k}: Accuracy={metrics['accuracy']:.4f}, Precision={metrics['precision']:.4f}, "
                   f"MRR={metrics['mrr']:.4f}, IRR={metrics['irr']:.4f}, MAE={metrics['mae']:.4f}")
        print(summary)
