In [1]:
import sys
# This command tells pip to install a version of numpy that is less than 2.0
# Using a specific stable version like 1.26.4 is often the most reliable approach.
!{sys.executable} -m pip install "numpy==1.26.4"



In [2]:
import pandas as pd
import pandas_ta as ta
import os
from tqdm import tqdm # A library to show progress bars

# ... and the rest of the code in that cell ...

# Define file paths
RAW_DATA_DIR = "../data/raw"
PROCESSED_DATA_DIR = "../data/processed"

# Create the processed data directory if it doesn't exist
if not os.path.exists(PROCESSED_DATA_DIR):
    os.makedirs(PROCESSED_DATA_DIR)

print("Libraries imported and paths defined.")

Libraries imported and paths defined.


  from pkg_resources import get_distribution, DistributionNotFound


In [3]:
# Create a dictionary to hold the data for each ticker
stock_data = {}

# Get a list of all CSV files in the raw data directory
csv_files = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith('.csv')]

print(f"Found {len(csv_files)} data files. Loading...")

# Define the expected columns after skipping rows
# 'Date' will be the index, so it's not a regular column
expected_columns = ['Open', 'High', 'Low', 'Close', 'Volume'] # Adjusted and original columns

for file in tqdm(csv_files):
    ticker = file.split('.')[0]
    file_path = os.path.join(RAW_DATA_DIR, file)
    
    try:
        # --- CRITICAL CHANGE HERE ---
        # We tell pandas to skip the first 3 rows (0-indexed means rows 0, 1, 2)
        # Then we manually provide the column names, since the file's headers are messy
        df = pd.read_csv(
            file_path, 
            header=None,         # No header row in the file we're using
            skiprows=3,          # Skip the first 3 messy header rows
            names=['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], # Explicitly set column names
            index_col='Date',    # Set 'Date' as the index
            parse_dates=True     # Parse the 'Date' column as datetime objects
        )
        
        # Some yfinance downloads might have duplicate columns if auto_adjust=True
        # removes Adjusted Close, but the old files included it.
        # Let's ensure we only have the columns we expect
        df = df[expected_columns]

        # If the file is empty or became empty after cleaning, skip it
        if df.empty:
            print(f"\n⚠️ Warning: File '{file}' is empty or became empty after cleaning. Skipping.")
            continue # Go to the next file in the loop
            
        stock_data[ticker] = df

    except Exception as e: # Catch any type of error during reading
        print(f"\n❌ Error processing file: {file}")
        print(f"   The specific error was: {e}")
        print("   This file might be malformed or has unexpected content. Skipping.")
        continue # Go to the next file in the loop


# Now, let's check if we have any data and inspect it
if stock_data:
    # Get the first available ticker to show a sample.
    sample_ticker = list(stock_data.keys())[0]
    print(f"\nSuccessfully loaded {len(stock_data)} tickers.")
    print(f"Sample data for {sample_ticker}:")
    print(stock_data[sample_ticker].head())
else:
    print("\nNo data was loaded successfully. Please check the files in data/raw.")

Found 102 data files. Loading...


100%|████████████████████████████████████████| 102/102 [00:00<00:00, 771.29it/s]


Successfully loaded 102 tickers.
Sample data for BA:
                  Open        High         Low       Close    Volume
Date                                                                
2021-01-04  202.720001  210.199997  202.490005  210.000000  21225600
2021-01-05  211.630005  213.350006  204.600006  204.740005  19338300
2021-01-06  211.029999  215.610001  209.339996  210.220001  16202200
2021-01-07  212.710007  216.600006  211.779999  213.389999  14474100
2021-01-08  209.899994  214.100006  208.160004  213.610001  14144000





In [4]:
# The '!' lets us run a terminal command. 'head -n 5' shows the top 5 lines of a file.
!head -n 5 ../data/raw/AAPL.csv

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,,,,,
2021-01-04,126.23969268798828,130.33679730921182,123.6546114053764,130.24900571878715,143301900
2021-01-05,127.80046844482422,128.5125953811988,125.28367234969814,125.73240966734548,97664900


In [5]:
def create_features(df, spy_df, vix_df):
    data = df.copy()
    data.ta.rsi(length=14, append=True)
    data.ta.macd(fast=12, slow=26, signal=9, append=True)
    data.ta.bbands(length=20, std=2, append=True)
    data.ta.atr(length=14, append=True)
    data.ta.obv(append=True)
    data.ta.sma(length=20, append=True)
    data.ta.sma(length=50, append=True)
    stoch = ta.stoch(high=data['High'], low=data['Low'], close=data['Close'], k=14, d=3, smooth_k=3)
    data = data.join(stoch)
    
    data['return_5d'] = data['Close'].pct_change(5)
    data['return_10d'] = data['Close'].pct_change(10)
    data['volume_momentum'] = data['Volume'] / data['Volume'].rolling(20).mean()
    data['bollinger_pos'] = (data['Close'] - data['BBL_20_2.0']) / (data['BBU_20_2.0'] - data['BBL_20_2.0'])
    data['z_score_20d'] = (data['Close'] - data['SMA_20']) / data['Close'].rolling(20).std()
    data['volatility_20d'] = data['Close'].pct_change().rolling(20).std() * (252**0.5)
    
    spy_returns = spy_df['Close'].pct_change()
    data['relative_volatility'] = data['volatility_20d'] / (spy_returns.rolling(20).std() * (252**0.5))
    data['beta_20d'] = data['Close'].pct_change().rolling(20).corr(spy_returns)
    data['vix_level'] = vix_df['Close']
    
    for lag in [1, 3, 5]:
        data[f'rsi_lag_{lag}'] = data['RSI_14'].shift(lag)
    
    data['target_5d_forward_return'] = data['Close'].pct_change(5).shift(-5)
    return data

print("Feature engineering function created successfully.")

Feature engineering function created successfully.


In [6]:
spy_df = stock_data.pop('SPY')
vix_df = stock_data.pop('^VIX')
all_features = []
print("Applying feature engineering to all stocks...")

for ticker, df in tqdm(stock_data.items()):
    features = create_features(df, spy_df, vix_df)
    features['ticker'] = ticker
    all_features.append(features)

combined_df = pd.concat(all_features)
final_df = combined_df.dropna()

print("\nFeature engineering complete.")
print(f"Final dataset shape: {final_df.shape}")

Applying feature engineering to all stocks...


100%|████████████████████████████████████████| 100/100 [00:00<00:00, 167.24it/s]



Feature engineering complete.
Final dataset shape: (95000, 34)


In [7]:
import os
import config

# Use the absolute path from the config file for saving
output_path = config.PROCESSED_FILE
final_df.to_parquet(output_path)

print(f"✅ Attempted to save data to:\n{output_path}")

# --- THE MOST IMPORTANT VERIFICATION STEP ---
if os.path.exists(output_path):
    print("\n✅ SUCCESS: File has been verified and exists at the correct path.")
else:
    print("\n❌ CRITICAL FAILURE: File was NOT found after saving. Check permissions or disk space.")

--- CONFIG FILE LOADED ---
Project Root is: /Users/aryadoshii/Desktop/stock-return-prediction
Processed file will be at: /Users/aryadoshii/Desktop/stock-return-prediction/data/processed/all_features.parquet
--------------------------
✅ Attempted to save data to:
/Users/aryadoshii/Desktop/stock-return-prediction/data/processed/all_features.parquet

✅ SUCCESS: File has been verified and exists at the correct path.


In [8]:
import os
print(os.getcwd())

import sys
# Since config.py is in the project root, we add that to the path
sys.path.append('/Users/aryadoshii/Desktop/stock-return-prediction')
import config

# Use the absolute paths from the config file
RAW_DATA_DIR = config.RAW_DATA_DIR
PROCESSED_DATA_DIR = config.PROCESSED_DATA_DIR

if not os.path.exists(PROCESSED_DATA_DIR):
    os.makedirs(PROCESSED_DATA_DIR)

print("Libraries imported and paths defined from config.py.")
print(f"Processed data will be saved to: {config.PROCESSED_FILE}")

/Users/aryadoshii/Desktop/stock-return-prediction
Libraries imported and paths defined from config.py.
Processed data will be saved to: /Users/aryadoshii/Desktop/stock-return-prediction/data/processed/all_features.parquet
