# 01 - Data Collection and Cleaning

## Purpose
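Download raw stock data using yfinance and save backups to `data/raw/`, then generate technical-indicator features from the raw files and save them to `data/processed/`.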

## Code

In [None]:
import pandas as pd  # For data manipulation and analysis
import os  # For file system operations like creating directories

def generate_features(df):
    """
    Build return, lag, moving-average and volatility features from prices.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw stock data containing a 'Close' column. Values in 'Close' that
        cannot be parsed as numbers are treated as missing.

    Returns:
    --------
    pandas.DataFrame
        A copy of the input with 'Close' converted to numeric and the
        columns 'Return', 'Lag_1', 'Lag_2', 'SMA_5', 'SMA_10' and
        'Volatility' appended (in that order). Any row containing a NaN
        in any column is removed. The input dataframe is not modified.
    """
    features = df.copy()  # Work on a copy so the caller's dataframe stays untouched

    # Coerce closing prices to numbers; unparseable entries become NaN
    close = pd.to_numeric(features['Close'], errors='coerce')
    features['Close'] = close

    # Day-over-day percentage change; fill_method=None means missing prices
    # are not padded before the change is computed
    daily_return = close.pct_change(fill_method=None)
    features['Return'] = daily_return

    # Lagged returns: Lag_1 is the previous row's return, Lag_2 the one before that
    for lag in (1, 2):
        features[f'Lag_{lag}'] = daily_return.shift(lag)

    # Simple moving averages of the closing price over 5 and 10 rows
    for window in (5, 10):
        features[f'SMA_{window}'] = close.rolling(window=window).mean()

    # Volatility: standard deviation of returns over a 10-row rolling window
    features['Volatility'] = daily_return.rolling(window=10).std()

    # The rolling/shift computations leave NaN in the leading rows; drop
    # every row that has a NaN in any column
    return features.dropna()

# Tickers to process, two each from the Tech, Banking, and Energy sectors
tickers = ['AAPL', 'MSFT', 'JPM', 'BAC', 'XOM', 'CVX']

# Make sure the output directory exists before any file is written
os.makedirs('../data/processed', exist_ok=True)

# Turn each ticker's raw CSV into a processed feature CSV
for ticker in tickers:
    raw_path = f'../data/raw/{ticker}_raw.csv'
    processed_path = f'../data/processed/{ticker}_processed.csv'

    # Load the raw data using the first column as a date-parsed index,
    # and label that index 'Date' for consistency across files
    df = pd.read_csv(raw_path, index_col=0, parse_dates=True).rename_axis('Date')

    # Compute the technical features and write them out
    processed = generate_features(df)
    processed.to_csv(processed_path)

    # Report progress for this ticker
    print(f"{ticker} processed.")

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


AAPL saved.
MSFT saved.


[*********************100%***********************]  1 of 1 completed


JPM saved.


[*********************100%***********************]  1 of 1 completed


BAC saved.


[*********************100%***********************]  1 of 1 completed


XOM saved.


[*********************100%***********************]  1 of 1 completed

CVX saved.



