In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
import stat
# Load the raw price table from the CSV (path is relative to the notebook's
# working directory) and print its column names.
stock_data = pd.read_csv('data/indexData.csv')
print(stock_data.columns)

In [None]:
# Use the MPS backend when PyTorch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [3]:
import pandas as pd
import numpy as np

def calculate_features(df, stock_id_column='Index'):
    """Add per-stock technical indicators to a daily price table.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with ``Open``, ``High``, ``Low``, ``Adj Close`` and
        ``Volume`` columns. It must either carry ``stock_id_column`` and
        ``Date`` as regular columns, or already have a two-level index
        (assumed to be [stock id, date] -- this is not verified).
    stock_id_column : str, default 'Index'
        Column identifying which stock a row belongs to. Only consulted when
        ``df`` does not already have a two-level index.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``df`` indexed by ``[stock_id_column, 'Date']`` with the
        added columns ``Returns``, ``MA10``, ``MA50``, ``Volatility``, ``RSI``,
        ``ATR``, ``Volume_Norm``, ``Gap`` and ``HL_Range``. Rows containing a
        NaN or infinite value in any column are removed. ``None`` is returned,
        after printing a message, when a required column is missing or any
        other error occurs; no exception propagates to the caller.

    Notes
    -----
    All rolling/shifted quantities use row order within each stock as-is.
    NOTE(review): this presumes rows are already in chronological order per
    stock -- confirm against the input file.
    """
    required_columns = ['Open', 'High', 'Low', 'Adj Close', 'Volume']

    def calculate_single_stock_features(stock_rows):
        # Work on a copy: mutating the object that groupby.apply passes in can
        # leak changes into the source frame or trigger chained-assignment
        # warnings.
        stock_rows = stock_rows.copy()
        adj_close = stock_rows['Adj Close']
        previous_close = adj_close.shift(1)

        # Row-over-row percentage change of the adjusted close.
        stock_rows['Returns'] = adj_close.pct_change()

        # Moving averages of the adjusted close over 10 and 50 rows.
        stock_rows['MA10'] = adj_close.rolling(window=10).mean()
        stock_rows['MA50'] = adj_close.rolling(window=50).mean()

        # Volatility: rolling 20-row standard deviation of the returns.
        stock_rows['Volatility'] = stock_rows['Returns'].rolling(window=20).std()

        # RSI from 14-row simple averages of gains and losses.
        delta = adj_close.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        stock_rows['RSI'] = 100 - (100 / (1 + rs))

        # ATR: 14-row mean of the true range (largest of the three spreads).
        high_low = stock_rows['High'] - stock_rows['Low']
        high_close = abs(stock_rows['High'] - previous_close)
        low_close = abs(stock_rows['Low'] - previous_close)
        tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
        stock_rows['ATR'] = tr.rolling(window=14).mean()

        # Volume as a z-score against its own trailing 20-row window.
        volume_window = stock_rows['Volume'].rolling(window=20)
        stock_rows['Volume_Norm'] = (stock_rows['Volume'] - volume_window.mean()) / volume_window.std()

        # Gap between this row's open and the previous row's adjusted close.
        stock_rows['Gap'] = stock_rows['Open'] - previous_close

        # High-low range relative to the adjusted close.
        stock_rows['HL_Range'] = (stock_rows['High'] - stock_rows['Low']) / adj_close

        return stock_rows

    try:
        # Create copy to avoid modifying original data
        df_features = df.copy()

        # Only demand the identifier column when the index still has to be
        # built; a frame that is already indexed no longer has that column.
        if df_features.index.nlevels != 2:
            if stock_id_column not in df_features.columns:
                raise KeyError(f"Missing stock identifier column: {stock_id_column}")
            df_features = df_features.set_index([stock_id_column, 'Date'])

        missing_columns = [name for name in required_columns if name not in df_features.columns]
        if missing_columns:
            raise KeyError(", ".join(missing_columns))

        # Apply calculations to each stock separately so no window or shift
        # crosses from one stock into the next.
        df_features = df_features.groupby(level=0, group_keys=False).apply(calculate_single_stock_features)

        # Divisions by zero above yield +/-inf, which dropna() alone keeps.
        # Treat them as missing so they are removed together with the NaN
        # warm-up rows of the rolling windows.
        df_features = df_features.replace([np.inf, -np.inf], np.nan).dropna()

        return df_features

    except KeyError as e:
        print(f"Error: Missing required column - {str(e)}")
        print(f"Required columns: {stock_id_column}, Date, Open, High, Low, Adj Close, Volume")
        return None
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return None


In [None]:
# calculate_features() signals failure by printing a message and returning
# None. Check for that before rebinding `stock_data`, so a failed run neither
# discards the loaded data nor surfaces later as an AttributeError on None.
stock_features = calculate_features(stock_data)
if stock_features is None:
    raise RuntimeError("Feature calculation failed; see the error printed above.")
stock_data = stock_features
stock_data.head()

In [None]:
# ffill()/bfill() return new frames instead of modifying `stock_data`, so the
# result has to be assigned back for the fills to take effect.
stock_data = stock_data.ffill().bfill()
stock_data.columns

In [6]:
feature_columns = ['Open','High','Low','Close','Volume','Returns','MA10', 'MA50', 'RSI', 'ATR', 'Volume_Norm', 'Volatility']

# Fit both scalers on the leading 70% of rows only -- the same leading
# fraction the later train/validation/test split takes as training data --
# and then transform every row. Fitting on all rows would let validation and
# test statistics leak into the training inputs.
TRAIN_FRACTION = 0.7
n_fit_rows = int(len(stock_data) * TRAIN_FRACTION)

feature_scaler = StandardScaler()
feature_scaler.fit(stock_data[feature_columns].iloc[:n_fit_rows])
scaled_features = feature_scaler.transform(stock_data[feature_columns])

# Because the range is learned from the training rows, scaled targets outside
# them may fall outside [0, 1].
target_scaler = MinMaxScaler()
target_scaler.fit(stock_data[['Adj Close']].iloc[:n_fit_rows])
scaled_target = target_scaler.transform(stock_data[['Adj Close']])

In [7]:
def create_sequences(data, target, seq_length):
    """Slice ``data`` into overlapping windows paired with the next target.

    Window ``k`` is ``data[k:k + seq_length]`` and its label is
    ``target[k + seq_length]``, for every ``k`` in
    ``range(len(data) - seq_length)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, labels)``. Both are empty arrays when ``data`` has no more
        than ``seq_length`` rows.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    labels = [target[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)


In [8]:
def check_data_quality(df):
    """Print per-column counts of missing, infinite and outlying values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Missing values are counted for every column; the
        infinity and outlier checks only look at numeric columns.

    Returns
    -------
    None
        The three reports are written to standard output.
    """
    print("Missing values:\n", df.isnull().sum())

    # np.isinf is undefined for object/string columns, so restrict to numbers.
    numeric = df.select_dtypes(include=[np.number])
    print("\nInfinite values:\n", np.isinf(numeric).sum())

    # Population z-score per column, computed directly with pandas (the
    # stdlib `stat` module has no zscore). Infinities are masked first so they
    # do not turn a column's mean and standard deviation into inf/NaN.
    finite = numeric.replace([np.inf, -np.inf], np.nan)
    z_scores = (finite - finite.mean()) / finite.std(ddof=0)
    print("\nExtreme outliers:\n", (z_scores.abs() > 3).sum())

def clean_data(df):
    """Return a copy of ``df`` with infinite and missing values filled in.

    Infinite values are first converted to NaN, then every gap is filled from
    the previous row (forward fill) and any gap still left at the top of a
    column is filled from the next valid row (backward fill). Converting the
    infinities before both fills means a leading inf is repaired as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy. A column with no finite, non-missing value at all stays
        entirely NaN.
    """
    # .ffill()/.bfill() replace fillna(method=...), which pandas has deprecated.
    cleaned = df.replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.ffill()
    return cleaned.bfill()

In [9]:
# Positional 70% / 15% / remainder split; rows are taken in their existing
# order, with no shuffling.
X, y = scaled_features, scaled_target

train_size = int(len(X) * 0.7)
val_size = int(len(X) * 0.15)

train_rows = slice(None, train_size)
val_rows = slice(train_size, train_size + val_size)
test_rows = slice(train_size + val_size, None)

X_train, y_train = X[train_rows], y[train_rows]
X_val, y_val = X[val_rows], y[val_rows]
X_test, y_test = X[test_rows], y[test_rows]

In [10]:
def _as_float32_tensor(array):
    """Copy an array-like into a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Training and test arrays as float32 tensors.
X_train_tensor, y_train_tensor = _as_float32_tensor(X_train), _as_float32_tensor(y_train)
X_test_tensor, y_test_tensor = _as_float32_tensor(X_test), _as_float32_tensor(y_test)

In [11]:
class StockPricePredictor(nn.Module):
    """Fully connected regressor: ``input_size -> 64 -> 32 -> 16 -> 1``.

    The first and third hidden layers are followed by ReLU, the second by
    LeakyReLU; the single output unit has no activation.
    """

    def __init__(self, input_size):
        """Create the four linear layers and the three activations."""
        super().__init__()
        first_width, second_width, third_width = 64, 32, 16
        self.fc1 = nn.Linear(input_size, first_width)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(first_width, second_width)
        self.relu2 = nn.LeakyReLU()
        self.fc3 = nn.Linear(second_width, third_width)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(third_width, 1)

    def forward(self, x):
        """Run ``x`` through the layers in order and return the raw output."""
        pipeline = (
            self.fc1, self.relu1,
            self.fc2, self.relu2,
            self.fc3, self.relu3,
            self.fc4,
        )
        for stage in pipeline:
            x = stage(x)
        return x

In [None]:
import time

# Size the input layer from the training tensor so it always matches the
# number of feature columns.
model = StockPricePredictor(input_size=X_train_tensor.shape[1])
metric = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 25
batch_size = 64
best_loss = float('inf')   # lowest validation loss seen so far
patience = 10              # epochs without improvement tolerated before stopping
patience_counter = 0

l2_lamda = 0.1             # weight of the parameter-norm penalty added to the loss
train_losses = []          # one entry per epoch: mean penalised batch loss
val_losses = []            # one entry per epoch: plain MSE on the validation split

# Validation tensors, built here from the validation split so this cell does
# not depend on another cell having created them.
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
has_validation = len(X_val_tensor) > 0

# DataLoader with drop_last to ensure consistent batch sizes
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, drop_last=True
)

for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        y_batch = y_batch.view(-1, 1)  # Ensure shape matches predictions
        # Forward pass
        predictions = model(X_batch)
        loss = metric(predictions, y_batch)

        # Penalty: sum of the L2 norms of all parameter tensors.
        l2_reg = sum(param.norm(2) for param in model.parameters())
        loss += l2_lamda * l2_reg

        epoch_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average once per epoch, after every batch has been accumulated.
    avg_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Per-epoch validation loss (plain MSE, no penalty term).
    if has_validation:
        model.eval()
        with torch.no_grad():
            val_loss = metric(model(X_val_tensor), y_val_tensor).item()
        val_losses.append(val_loss)

    # Pause every 75th epoch except the last one. Uses logical `and`; bitwise
    # `&` binds tighter than the comparisons and changes what is compared.
    # NOTE(review): presumably a cool-down pause -- confirm it is still needed.
    if (epoch + 1) % 75 == 0 and epoch != (epochs - 1):
        time.sleep(45)

    # `loss` here is the last batch of the epoch.
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.2f}, Average Loss: {avg_loss:.2f}")

    # Early stopping driven by best_loss / patience / patience_counter.
    if has_validation:
        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1}: no validation improvement for {patience} epochs")
                break

In [None]:
# Evaluate once on the held-out test split. The result is deliberately not
# appended to `val_losses`: it is a test loss, and appending it would both
# mislabel it in the loss plot and add a duplicate on every re-run of the cell.
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
    # Pin the target to a column, as the training loop does, so MSELoss cannot
    # silently broadcast a 1-D target against (N, 1) predictions.
    test_loss = metric(test_predictions, y_test_tensor.view(-1, 1))
    print(f"Test Loss: {test_loss.item():.2f}")

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded training and validation loss curves on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(train_losses, label='Training Loss', color='blue')
ax.plot(val_losses, label='Validation Loss', color='orange')
ax.set_title('Training and Validation Losses')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Build a seeded, purely random frame with the same columns and index layout
# as the engineered stock table. Columns are drawn in a fixed order so the
# seeded values are reproducible.
np.random.seed(42)

# One row per business day in the range
dates = pd.date_range(start='2001-01-01', end='2021-06-02', freq='B')
num_samples = len(dates)


def _uniform_column(low, high):
    """Draw ``num_samples`` values uniformly from [low, high)."""
    return np.random.uniform(low=low, high=high, size=num_samples)


data = {}
for price_column in ('Open', 'High', 'Low', 'Close', 'Adj Close'):
    data[price_column] = _uniform_column(6000, 13000)
data['Volume'] = np.random.randint(low=1e7, high=1e9, size=num_samples)
data['Returns'] = np.random.normal(loc=0, scale=0.01, size=num_samples)  # Daily returns
for indicator_column, (lower, upper) in (
    ('MA10', (6000, 13000)),
    ('MA50', (6000, 13000)),
    ('Volatility', (0, 0.02)),
    ('RSI', (0, 100)),
    ('ATR', (0, 5)),
    ('Volume_Norm', (0, 1)),
    ('Gap', (-100, 100)),
    ('HL_Range', (0, 100)),
):
    data[indicator_column] = _uniform_column(lower, upper)

# Assemble the frame and index it by (Index, Date) with the single id 'NYA'
df = pd.DataFrame(data, index=dates)
df.index.name = 'Date'
df.index = pd.MultiIndex.from_product([['NYA'], df.index], names=['Index', 'Date'])

# Display the first few rows of the DataFrame
print(df.head())
