In [6]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta
import warnings
import time
from pandas.tseries.offsets import BDay

In [7]:
# Silence every Python warning from here on: no category, module or message
# filter is given, so this applies to all warnings raised in the session.
warnings.filterwarnings('ignore')

def calculate_technical_indicators(df):
    """Summarise a window of OHLCV rows as a flat dict of technical features.

    Parameters
    ----------
    df : pandas.DataFrame
        Price rows with 'Open', 'High', 'Low', 'Close' and 'Volume' columns.
        Rows are assumed to be ordered oldest to newest; "latest" below means
        the final row and "previous" the one before it.

    Returns
    -------
    dict or None
        ``None`` when ``df`` has fewer than 7 rows. Otherwise a dict mapping
        feature name to a scalar, with keys inserted in a fixed order (the
        order becomes the column order when the dicts are turned into a
        DataFrame).

    Notes
    -----
    * ``rsi`` uses 7-row simple averages of gains and losses. It is 100 for a
      window with gains only, 0 for a window with losses only, and 50 when
      there is no price movement at all (the ratio is undefined).
    * ``macd`` is the difference of span-3 and span-5 EMAs scaled by the
      latest close, not the conventional 12/26 configuration.
    * ``bb_position`` uses 5-row bands at two standard deviations and falls
      back to 0.5 when the bands have zero width.
    """
    if len(df) < 7:
        return None

    # Pull the columns and the scalars that are reused many times below.
    close = df['Close']
    volume = df['Volume']
    last_close = close.iloc[-1]
    prev_close = close.iloc[-2]
    last_open = df['Open'].iloc[-1]
    last_high = df['High'].iloc[-1]
    last_low = df['Low'].iloc[-1]
    last_volume = volume.iloc[-1]
    close_pct = close.pct_change()

    features = {}

    # Latest bar, as-is.
    features['close_price'] = last_close
    features['open_price'] = last_open
    features['high_price'] = last_high
    features['low_price'] = last_low
    features['volume'] = last_volume

    # Returns over one row, the whole window, and three rows.
    features['daily_return'] = (last_close - prev_close) / prev_close
    features['weekly_return'] = (last_close - close.iloc[0]) / close.iloc[0]
    features['price_momentum_3d'] = (last_close - close.iloc[-4]) / close.iloc[-4]

    # Moving averages and the latest close relative to them.
    ema_fast = close.ewm(span=3).mean()
    features['sma_3'] = close.rolling(3).mean().iloc[-1]
    features['sma_7'] = close.rolling(7).mean().iloc[-1]
    features['ema_3'] = ema_fast.iloc[-1]
    features['price_to_sma3'] = last_close / features['sma_3']
    features['price_to_sma7'] = last_close / features['sma_7']

    # Dispersion of returns and shape of the latest bar.
    features['price_volatility'] = close_pct.std()
    features['high_low_ratio'] = last_high / last_low
    features['close_to_high_ratio'] = last_close / last_high
    features['close_to_low_ratio'] = last_close / last_low

    # Volume relative to its 3-row average.
    features['volume_sma'] = volume.rolling(3).mean().iloc[-1]
    features['volume_ratio'] = last_volume / features['volume_sma'] if features['volume_sma'] > 0 else 1
    features['price_volume'] = last_close * last_volume

    # RSI from 7-row average gain / average loss. The first diff is NaN and is
    # counted as "no move" by the where(..., 0) replacement.
    delta = close.diff()
    avg_gain = delta.where(delta > 0, 0).rolling(window=7).mean()
    avg_loss = (-delta.where(delta < 0, 0)).rolling(window=7).mean()
    rs_last = (avg_gain / avg_loss).iloc[-1]
    if np.isnan(rs_last):
        # 0/0: no gains and no losses, so fall back to the neutral midpoint.
        features['rsi'] = 50
    else:
        # rs == 0 (losses only) correctly yields 0; rs == +/-inf (gains only)
        # yields 100 because 100 / (1 + inf) evaluates to 0.
        features['rsi'] = 100 - (100 / (1 + rs_last))

    # MACD-style spread between a fast (span 3) and slow (span 5) EMA.
    ema_slow = close.ewm(span=5).mean()
    features['macd'] = (ema_fast.iloc[-1] - ema_slow.iloc[-1]) / last_close

    # Position of the latest close inside 5-row, 2-sigma Bollinger bands.
    sma_bb = close.rolling(5).mean()
    std_bb = close.rolling(5).std()
    upper_bb = (sma_bb + (std_bb * 2)).iloc[-1]
    lower_bb = (sma_bb - (std_bb * 2)).iloc[-1]
    band_width = upper_bb - lower_bb
    features['bb_position'] = (last_close - lower_bb) / band_width if band_width != 0 else 0.5

    # Distance from the 7-row high/low, relative to the latest close.
    recent_high = df['High'].rolling(7).max().iloc[-1]
    recent_low = df['Low'].rolling(7).min().iloc[-1]
    features['distance_to_high'] = (recent_high - last_close) / last_close
    features['distance_to_low'] = (last_close - recent_low) / last_close

    # Slope of a least-squares line through the closes, scaled by price.
    slope = np.polyfit(np.arange(len(df)), close.values, 1)[0]
    features['trend_slope'] = slope / last_close

    # Opening gap versus the previous close, split into non-negative halves.
    features['gap_up'] = max(0, (last_open - prev_close) / prev_close)
    features['gap_down'] = max(0, (prev_close - last_open) / prev_close)

    # Intraday moves of the latest bar relative to its open.
    features['intraday_return'] = (last_close - last_open) / last_open
    features['intraday_high_reach'] = (last_high - last_open) / last_open
    features['intraday_low_reach'] = (last_open - last_low) / last_open

    # Share of rows that closed above their open.
    up_days = (close > df['Open']).sum()
    features['bullish_days_ratio'] = up_days / len(df)

    # Volume-weighted cumulative return, normalised by total volume.
    total_volume = volume.sum()
    features['vpt'] = (close_pct * volume).cumsum().iloc[-1] / total_volume if total_volume > 0 else 0
    return features

def generate_enhanced_dataset(tickers, start_date, end_date, save_path="enhanced_stock_features.csv", label_threshold=0.02):
    """Download price history per ticker and build a labelled feature table.

    For every ticker the history is fetched with ``yf.Ticker(...).history``
    (up to three attempts). A 7-row window is then slid over the history; each
    window is turned into features by ``calculate_technical_indicators`` and
    labelled from the close of the row that immediately follows the window.

    Parameters
    ----------
    tickers : sequence of str
        Symbols to download.
    start_date, end_date : str
        Passed straight through to ``history(start=..., end=...)``.
    save_path : str
        CSV file the combined table is written to when it is non-empty.
    label_threshold : float
        Next-row return above ``+label_threshold`` is labelled 'BUY', below
        ``-label_threshold`` 'SELL', anything else 'HOLD'. Defaults to 2%.

    Returns
    -------
    pandas.DataFrame or None
        One row per (ticker, window) with the feature columns plus 'ticker',
        'date' (date of the window's last row) and 'label'; ``None`` when no
        sample could be produced.
    """
    window = 7          # rows per feature window (minimum the indicator function accepts)
    max_attempts = 3

    print(f"Generating enhanced dataset for {len(tickers)} tickers...")
    all_data = []
    failed_tickers = []
    for i, ticker in enumerate(tickers):
        try:
            print(f"Processing {ticker} ({i+1}/{len(tickers)})...")
            for attempt in range(max_attempts):
                try:
                    stock = yf.Ticker(ticker)
                    hist = stock.history(start=start_date, end=end_date)
                    if hist.empty:
                        raise ValueError("No data returned")
                    break
                except Exception as e:
                    print(f"Retry {attempt+1} for {ticker}: {e}")
                    # No point pausing after the final attempt.
                    if attempt < max_attempts - 1:
                        time.sleep(1)
            else:
                print(f"Failed to get data for {ticker} after 3 attempts.")
                failed_tickers.append(ticker)
                continue

            # One extra row beyond the window is required to compute a label,
            # so a history of exactly `window` rows cannot yield any sample.
            if len(hist) <= window:
                print(f"Insufficient data for {ticker}")
                failed_tickers.append(ticker)
                continue

            closes = hist['Close']
            # Window covers rows [j - window, j); row j supplies the label, so
            # every j up to and including len(hist) - 1 is usable.
            for j in range(window, len(hist)):
                features = calculate_technical_indicators(hist.iloc[j-window:j])
                if features is None:
                    continue
                features['ticker'] = ticker
                features['date'] = hist.index[j-1].strftime('%Y-%m-%d')

                last_close = closes.iloc[j-1]
                future_return = (closes.iloc[j] - last_close) / last_close
                if future_return > label_threshold:
                    features['label'] = 'BUY'
                elif future_return < -label_threshold:
                    features['label'] = 'SELL'
                else:
                    features['label'] = 'HOLD'
                all_data.append(features)
        except Exception as e:
            # Best effort: one bad ticker must not abort the whole run.
            print(f"Error processing {ticker}: {e}")
            failed_tickers.append(ticker)

    df = pd.DataFrame(all_data)
    if len(df) > 0:
        df.to_csv(save_path, index=False)
        print(f"Dataset saved: {save_path}")
        print(f"Total samples: {len(df)}")
        # 'ticker', 'date' and 'label' are bookkeeping columns, not features.
        print(f"Features: {len(df.columns) - 3}")
        print(f"Label distribution:")
        print(df['label'].value_counts())
        print(f"Failed tickers: {failed_tickers}")
        return df
    else:
        print("No data generated")
        print(f"Failed tickers: {failed_tickers}")
        return None

if __name__ == "__main__":
    # Universe of symbols to download, in processing order.
    all_tickers = [
        "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "TSLA", "META", "BRK-B", "UNH", "JNJ",
        "JPM", "V", "PG", "MA", "HD", "XOM", "CVX", "LLY", "MRK", "PEP",
        "ABBV", "AVGO", "KO", "COST", "TMO", "WMT", "DIS", "MCD", "NFLX", "ADBE",
        "CRM", "PYPL", "INTC", "ORCL", "QCOM", "TXN", "AMD", "IBM", "HON", "AMGN",
        "GE", "CAT", "LOW", "BA", "GS", "BLK", "AXP", "DUK", "SO", "PLD",
        "CI", "ISRG", "NOW", "ADI", "MDT", "DHR", "BKNG", "MO", "BDX", "SYK",
        "CSCO", "ZTS", "LRCX", "ETN", "VRTX", "TGT", "APD", "MMC", "GILD", "CME",
        "NSC", "ITW", "DE", "AON", "SPGI", "ICE", "ADP", "EL", "ASML", "NXPI",
        "KLAC", "MU", "LULU", "MAR", "ROST", "DLTR", "CSX", "EBAY", "EXC", "ILMN",
        "WDAY", "TEAM", "ZM", "DOCU", "CDNS", "SNPS", "FTNT", "PANW", "OKTA", "DDOG",
        "ZS", "CRWD", "MDB", "BIDU", "NTES", "PDD", "JD", "BABA", "MELI", "SHOP",
        "SE", "ABNB", "ETSY", "RIVN", "LCID", "FSLY", "TWLO", "U", "COIN", "ROKU",
        "TTD", "NET", "WBD", "CHTR", "SPLK", "DOCN", "APP", "PLTR", "BILL", "TM",
        "NSANY", "HMC", "SONY", "VOD", "BP", "RIO", "BHP", "NTTYY", "BAYRY", "SNY",
        "AZN", "NVO", "SAP", "RY", "TD", "BNS", "ENB", "SU", "CNQ", "SHOP.TO",
        "BCE", "T.TO", "CM.TO", "BAM", "MFC", "TRP", "GIB", "L.TO", "BBD-B.TO", "MG.TO",
        "AC.TO", "QSR.TO", "ATD.TO", "FTS.TO", "NA.TO", "POW.TO", "IFC.TO", "SPY", "QQQ", "DIA",
        "VTI", "VOO", "ARKK", "XLF", "XLE", "XLK", "XLV",
    ]

    # Date range: from 500 business days back up to the previous business day.
    # NOTE(review): yfinance appears to treat `end` as exclusive, in which case
    # the previous business day itself is not downloaded — confirm intent.
    end_date = datetime.today() - BDay(1)
    start_date = end_date - BDay(500)

    # Build the dataset and write it to features_extended_2years.csv.
    df = generate_enhanced_dataset(
        all_tickers,
        start_date.strftime('%Y-%m-%d'),
        end_date.strftime('%Y-%m-%d'),
        save_path="features_extended_2years.csv",
    )


Generating enhanced dataset for 177 tickers...
Processing AAPL (1/177)...
Processing MSFT (2/177)...
Processing GOOGL (3/177)...
Processing AMZN (4/177)...
Processing NVDA (5/177)...
Processing TSLA (6/177)...
Processing META (7/177)...
Processing BRK-B (8/177)...
Processing UNH (9/177)...
Processing JNJ (10/177)...
Processing JPM (11/177)...
Processing V (12/177)...
Processing PG (13/177)...
Processing MA (14/177)...
Processing HD (15/177)...
Processing XOM (16/177)...
Processing CVX (17/177)...
Processing LLY (18/177)...
Processing MRK (19/177)...
Processing PEP (20/177)...
Processing ABBV (21/177)...
Processing AVGO (22/177)...
Processing KO (23/177)...
Processing COST (24/177)...
Processing TMO (25/177)...
Processing WMT (26/177)...
Processing DIS (27/177)...
Processing MCD (28/177)...
Processing NFLX (29/177)...
Processing ADBE (30/177)...
Processing CRM (31/177)...
Processing PYPL (32/177)...
Processing INTC (33/177)...
Processing ORCL (34/177)...
Processing QCOM (35/177)...
Proc

ERROR:yfinance:$SPLK: possibly delisted; no timezone found


Retry 1 for SPLK: No data returned


ERROR:yfinance:$SPLK: possibly delisted; no timezone found


Retry 2 for SPLK: No data returned


ERROR:yfinance:$SPLK: possibly delisted; no timezone found


Retry 3 for SPLK: No data returned
Failed to get data for SPLK after 3 attempts.
Processing DOCN (126/177)...
Processing APP (127/177)...
Processing PLTR (128/177)...
Processing BILL (129/177)...
Processing TM (130/177)...
Processing NSANY (131/177)...
Processing HMC (132/177)...
Processing SONY (133/177)...
Processing VOD (134/177)...
Processing BP (135/177)...
Processing RIO (136/177)...
Processing BHP (137/177)...
Processing NTTYY (138/177)...
Processing BAYRY (139/177)...
Processing SNY (140/177)...
Processing AZN (141/177)...
Processing NVO (142/177)...
Processing SAP (143/177)...
Processing RY (144/177)...
Processing TD (145/177)...
Processing BNS (146/177)...
Processing ENB (147/177)...
Processing SU (148/177)...
Processing CNQ (149/177)...
Processing SHOP.TO (150/177)...
Processing BCE (151/177)...
Processing T.TO (152/177)...
Processing CM.TO (153/177)...
Processing BAM (154/177)...
Processing MFC (155/177)...
Processing TRP (156/177)...
Processing GIB (157/177)...
Processing 

In [8]:
# Colab-only step: mount Google Drive under /content/drive so the generated
# CSV can be persisted outside the ephemeral runtime.
from google.colab import drive
drive.mount('/content/drive')

# Copy the feature CSV written by the dataset cell into /content/drive/MyDrive/.
!cp features_extended_2years.csv /content/drive/MyDrive/

Mounted at /content/drive


In [9]:
# Same mount call as the previous cell; when the drive is already mounted this
# only reports that fact (see the cell output below).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Confirm the CSV exists in the working directory and show its size.
!ls -lh features_extended_2years.csv

-rw-r--r-- 1 root root 51M Jul 24 19:44 features_extended_2years.csv


In [11]:
# Copy the CSV into /content/drive/MyDrive/ (same command as in the earlier mount cell).
!cp features_extended_2years.csv /content/drive/MyDrive/