In [3]:
import pandas as pd
import numpy as np
import os
import json
from pathlib import Path

# Locations of the combined raw CSV and of the directory that receives the
# processed files. Both literals are already absolute; abspath only
# normalises them.
processed_data_dir = os.path.abspath("/root/nfs/AJ FinRag/Company Processed Data")
raw_data_path = os.path.abspath("/root/nfs/AJ FinRag/Company Raw Data/all_companies.csv")

def verify_file_exists(file_path):
    """Raise unless ``file_path`` names an existing, readable file.

    Returns ``None`` when the path passes every check.

    Raises:
        FileNotFoundError: nothing exists at ``file_path``.
        ValueError: the path exists but is not a file.
        PermissionError: the file exists but cannot be read.
    """
    if os.path.isfile(file_path):
        # It is a file, so the only remaining question is read access.
        if os.access(file_path, os.R_OK):
            return
        raise PermissionError(f"Cannot read file: {file_path}")

    # Not a file: tell "something else is there" apart from "nothing there".
    if os.path.exists(file_path):
        raise ValueError(f"Path exists but is not a file: {file_path}")
    raise FileNotFoundError(f"Input file not found at: {file_path}")

def clean_and_drop_nan(df):
    """Replace infinities with NaN and drop rows that carry no usable data.

    After ``+inf``/``-inf`` are converted to NaN, two filters run in order:

    1. rows in which *every* price/volume column is NaN are dropped;
    2. rows in which *every* indicator column is NaN are dropped.

    Rows with only some values missing are kept, so warm-up rows of rolling
    indicators survive as long as at least one indicator has a value.

    Columns from either group that are absent from ``df`` are ignored rather
    than raising ``KeyError``; a group with no columns present is skipped.

    Args:
        df: DataFrame that may contain the price/volume and indicator columns
            listed below.

    Returns:
        A new, filtered DataFrame; ``df`` itself is not modified.
    """
    indicator_cols = ['MACD_Histogram', 'macd_crossover', 'bollinger_bands',
                      'exceeding_upper', 'exceeding_lower',
                      'overbought_and_oversold_conditions', 'kdj_crossover',
                      'Returns', 'VWAP', 'alpha_smr', 'alpha_mom']
    price_cols = ['open', 'high', 'low', 'close', 'volume', 'adj_close']

    # Infinities (e.g. from division by zero) are treated as missing values.
    df = df.replace([np.inf, -np.inf], np.nan)

    # Price filter first, then indicator filter, each limited to the columns
    # that actually exist so a missing input column cannot abort the cleanup.
    for group in (price_cols, indicator_cols):
        present = [col for col in group if col in df.columns]
        if present:
            df = df.dropna(subset=present, how='all')

    return df

def calculate_ema(data, period):
    """Return the exponentially weighted mean of ``data`` with span ``period``.

    Apart from ``span``, pandas' default ``ewm`` settings are used.
    """
    weighted = data.ewm(span=period)
    return weighted.mean()

def calculate_sma(data, period):
    """Return the rolling mean of ``data`` over a window of ``period`` rows.

    Positions that do not yet have a full window are NaN (pandas default).
    """
    windowed = data.rolling(window=period)
    return windowed.mean()

def calculate_macd(df, fast=12, slow=26, signal=9):
    """Add MACD histogram and crossover columns to ``df`` (in place).

    The MACD line is the fast EMA of ``close`` minus the slow EMA; the signal
    line is an EMA of the MACD line. Two columns are written:

    * ``MACD_Histogram`` - MACD line minus signal line.
    * ``macd_crossover`` - ``1`` when the MACD line moves from at-or-below to
      above the signal line, ``-1`` for the opposite move, ``0`` otherwise.

    If the calculation fails (for example ``close`` is missing) the error is
    reported and both columns are filled with missing values. The histogram
    is deliberately NaN rather than ``0.0`` so a failed calculation cannot be
    mistaken for a genuine zero reading.

    Args:
        df: DataFrame with a ``close`` column, ordered by time.
        fast: Span of the fast EMA.
        slow: Span of the slow EMA.
        signal: Span of the signal-line EMA.

    Returns:
        The same ``df`` object with the two columns added.
    """
    try:
        close = df['close']
        macd_line = calculate_ema(close, fast) - calculate_ema(close, slow)
        signal_line = calculate_ema(macd_line, signal)

        # Comparisons against the previous row; the first row compares with
        # NaN, which is False, so it always yields "no signal".
        prev_macd = macd_line.shift(1)
        prev_signal = signal_line.shift(1)
        bullish = (prev_macd <= prev_signal) & (macd_line > signal_line)
        bearish = (prev_macd >= prev_signal) & (macd_line < signal_line)

        df['MACD_Histogram'] = macd_line - signal_line
        df['macd_crossover'] = np.where(bullish, 1, np.where(bearish, -1, 0))
    except Exception as exc:
        # Best effort: keep the pipeline going, but do not hide the failure
        # and do not invent indicator values.
        print(f"Warning: MACD calculation failed: {exc}")
        df['MACD_Histogram'] = np.nan
        df['macd_crossover'] = None
    return df

def calculate_bollinger_bands(df, period=20, std_dev=2):
    """Add Bollinger Band signal columns to ``df`` (in place).

    Bands are the ``period``-row simple moving average of ``close`` plus or
    minus ``std_dev`` rolling standard deviations. Columns written:

    * ``bollinger_bands`` - ``1`` above the upper band, ``-1`` below the
      lower band, otherwise ``None``.
    * ``exceeding_upper`` - ``1`` above the upper band, otherwise ``None``.
    * ``exceeding_lower`` - ``1`` below the lower band, otherwise ``None``.

    On any exception all three columns are set to ``None``.

    Returns:
        The same ``df`` object.
    """
    try:
        middle = calculate_sma(df['close'], period)
        deviation = df['close'].rolling(window=period).std()

        band_width = deviation * std_dev
        upper_band = middle + band_width
        lower_band = middle - band_width

        # Rows without a full window compare against NaN and so match neither.
        above = df['close'] > upper_band
        below = df['close'] < lower_band

        df['bollinger_bands'] = np.where(above, 1, np.where(below, -1, None))
        df['exceeding_upper'] = np.where(above, 1, None)
        df['exceeding_lower'] = np.where(below, 1, None)
    except Exception:
        for column in ('bollinger_bands', 'exceeding_upper', 'exceeding_lower'):
            df[column] = None
    return df

def calculate_kdj(df, k_period=9, d_period=3, j_period=3):
    """Add stochastic (K/D) overbought/oversold and crossover columns in place.

    %K is ``100 * (close - lowest low) / (highest high - lowest low)`` over
    ``k_period`` rows; %D is the ``d_period``-row rolling mean of %K.
    Columns written:

    * ``overbought_and_oversold_conditions`` - ``1`` when both %K and %D are
      above 80, ``-1`` when both are below 20, otherwise ``None``.
    * ``kdj_crossover`` - ``1`` when %K moves from at-or-below to above %D,
      ``-1`` for the opposite move, otherwise ``None``.

    A window whose highest high equals its lowest low has no range; %K is
    left as NaN there (no signal) instead of dividing by zero, which could
    otherwise produce infinities that look like extreme readings.

    If the calculation fails the error is reported and both columns are set
    to ``None``.

    Args:
        df: DataFrame with ``high``, ``low`` and ``close`` columns, ordered
            by time.
        k_period: Look-back window for %K.
        d_period: Smoothing window for %D.
        j_period: Accepted for interface compatibility; not used, because no
            J-line output column is produced.

    Returns:
        The same ``df`` object with the two columns added.
    """
    try:
        low_min = df['low'].rolling(window=k_period).min()
        high_max = df['high'].rolling(window=k_period).max()

        # Zero range -> NaN so the division cannot yield +/-inf.
        price_range = (high_max - low_min).replace(0, np.nan)
        k_percent = 100 * (df['close'] - low_min) / price_range
        d_percent = k_percent.rolling(window=d_period).mean()

        overbought = (k_percent > 80) & (d_percent > 80)
        oversold = (k_percent < 20) & (d_percent < 20)
        df['overbought_and_oversold_conditions'] = np.where(
            overbought, 1, np.where(oversold, -1, None))

        # Compare with the previous row to detect %K crossing %D.
        prev_k = k_percent.shift(1)
        prev_d = d_percent.shift(1)
        crossed_up = (prev_k <= prev_d) & (k_percent > d_percent)
        crossed_down = (prev_k >= prev_d) & (k_percent < d_percent)
        df['kdj_crossover'] = np.where(
            crossed_up, 1, np.where(crossed_down, -1, None))
    except Exception as exc:
        # Best effort: keep going, but make the failure visible.
        print(f"Warning: KDJ calculation failed: {exc}")
        df['overbought_and_oversold_conditions'] = None
        df['kdj_crossover'] = None
    return df

def calculate_returns(df):
    """Add a ``Returns`` column: row-over-row percent change of ``adj_close``.

    The column is written in place; on any exception it is set to ``None``.

    Returns:
        The same ``df`` object.
    """
    try:
        adjusted = df['adj_close']
        df['Returns'] = adjusted.pct_change()
    except Exception:
        df['Returns'] = None
    return df

def calculate_vwap(df):
    """Add a cumulative ``VWAP`` (volume-weighted average price) column in place.

    The typical price ``(high + low + close) / 3`` is weighted by ``volume``
    and accumulated from the first row of ``df`` onward.

    If the calculation fails, the error is reported and ``VWAP`` falls back to
    the ``close`` column. When ``close`` itself is unavailable the column is
    filled with NaN, so the fallback can no longer raise ``KeyError`` from
    inside the exception handler.

    Args:
        df: DataFrame with ``high``, ``low``, ``close`` and ``volume``
            columns, ordered by time.

    Returns:
        The same ``df`` object with ``VWAP`` added.
    """
    try:
        typical_price = (df['high'] + df['low'] + df['close']) / 3
        volume = df['volume']
        df['VWAP'] = (typical_price * volume).cumsum() / volume.cumsum()
    except Exception as exc:
        # Best effort: keep going, but make the failure visible.
        print(f"Warning: VWAP calculation failed: {exc}")
        df['VWAP'] = df['close'] if 'close' in df.columns else np.nan
    return df

def add_mean_reversion_alpha(df, lookback=20):
    """Add the ``alpha_smr`` mean-reversion factor to ``df`` (in place).

    The factor is the negated rolling z-score of ``close`` (window
    ``lookback``) multiplied by the previous row's ``Returns``. It therefore
    needs both the ``close`` and ``Returns`` columns; on any exception
    ``alpha_smr`` is set to ``None``.

    Returns:
        The same ``df`` object.
    """
    try:
        window = df['close'].rolling(window=lookback)
        centre = window.mean()
        spread = window.std()

        # Distance of close from its rolling mean, in standard deviations.
        z_score = (df['close'] - centre) / spread

        lagged_returns = df['Returns'].shift(1)
        df['alpha_smr'] = -z_score * lagged_returns
    except Exception:
        df['alpha_smr'] = None
    return df

def add_momentum_alpha(df, short_window=5, long_window=20):
    """Add the ``alpha_mom`` momentum factor to ``df`` (in place).

    The factor is the relative gap between the short and long simple moving
    averages of ``close``, ``(short - long) / long``, multiplied by the
    previous row's ``Returns``. On any exception ``alpha_mom`` is set to
    ``None``.

    Returns:
        The same ``df`` object.
    """
    try:
        fast_avg = calculate_sma(df['close'], short_window)
        slow_avg = calculate_sma(df['close'], long_window)

        relative_gap = (fast_avg - slow_avg) / slow_avg
        lagged_returns = df['Returns'].shift(1)

        df['alpha_mom'] = relative_gap * lagged_returns
    except Exception:
        df['alpha_mom'] = None
    return df

def calculate_all_indicators(df):
    """Run every indicator step on ``df`` and return the cleaned result.

    The frame is first ordered by ``ticker`` then ``date`` and re-indexed.
    The steps run in a fixed order because the two alpha factors read the
    ``Returns`` column produced by ``calculate_returns``. The indicator
    functions treat the whole frame as one series, so callers are expected
    to pass a single ticker's rows.

    Returns:
        The DataFrame returned by ``clean_and_drop_nan`` after all steps.
    """
    ordered = df.sort_values(['ticker', 'date'])
    frame = ordered.reset_index(drop=True)

    steps = (
        calculate_macd,
        calculate_bollinger_bands,
        calculate_kdj,
        calculate_returns,
        calculate_vwap,
        add_mean_reversion_alpha,
        add_momentum_alpha,
    )
    for step in steps:
        frame = step(frame)

    return clean_and_drop_nan(frame)

def process_dataframe_with_indicators(input_path, output_dir):
    """Load the combined CSV, add indicators per ticker, and write the results.

    For every distinct ``ticker`` value the rows are sorted by ``date``, run
    through ``calculate_all_indicators`` and written to
    ``<ticker>_processed.csv`` and ``<ticker>_processed.json`` (JSON lines)
    inside ``output_dir``. A ticker that raises is reported and skipped. The
    successful frames are concatenated and written as
    ``all_companies_processed.csv`` / ``.json``.

    Args:
        input_path: Path of the CSV to read; must contain ``ticker`` and
            ``date`` columns.
        output_dir: Directory for the output files; created if missing.

    Returns:
        The combined DataFrame, or ``None`` when no ticker succeeded or an
        error occurred outside the per-ticker loop (the error is printed).
    """
    try:
        verify_file_exists(input_path)

        # Make sure the destination exists before any file is written.
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        print(f"Loading data from: {input_path}")
        raw = pd.read_csv(input_path)

        # Parse dates only when the column is not already datetime-typed.
        if not pd.api.types.is_datetime64_any_dtype(raw['date']):
            raw['date'] = pd.to_datetime(raw['date'])

        symbols = raw['ticker'].unique()
        results = []

        print(f"Processing {len(symbols)} companies...")

        for ticker in symbols:
            print(f"\nProcessing indicators for {ticker}...")
            try:
                # Isolate this ticker's rows in chronological order.
                subset = raw[raw['ticker'] == ticker].copy()
                subset = subset.sort_values('date').reset_index(drop=True)

                enriched = calculate_all_indicators(subset)

                csv_target = os.path.join(output_dir, f"{ticker}_processed.csv")
                enriched.to_csv(csv_target, index=False)
                print(f"Saved CSV: {csv_target}")

                json_target = os.path.join(output_dir, f"{ticker}_processed.json")
                enriched.to_json(json_target, orient='records', lines=True)
                print(f"Saved JSON: {json_target}")
            except Exception as e:
                # One bad ticker must not stop the remaining ones.
                print(f"Error processing {ticker}: {str(e)}")
            else:
                results.append(enriched)

        if not results:
            print("\nNo data was processed successfully")
            return None

        combined = pd.concat(results, ignore_index=True)

        combined_csv = os.path.join(output_dir, "all_companies_processed.csv")
        combined.to_csv(combined_csv, index=False)
        print(f"\nSuccessfully saved combined CSV to {combined_csv}")

        combined_json = os.path.join(output_dir, "all_companies_processed.json")
        combined.to_json(combined_json, orient='records', lines=True)
        print(f"Successfully saved combined JSON to {combined_json}")

        return combined

    except Exception as e:
        print(f"\nFatal error in processing: {str(e)}")
        return None

if __name__ == "__main__":
    # Script entry point: run the pipeline on the configured paths and print
    # a short report about the combined result.
    print("Starting technical indicator processing...")
    print(f"Input file: {raw_data_path}")
    print(f"Output directory: {processed_data_dir}")

    try:
        processed_df = process_dataframe_with_indicators(raw_data_path, processed_data_dir)

        if processed_df is None:
            # The processing function already printed the reason.
            print("\nProcessing completed with errors")
        else:
            print("\nProcessing complete! Sample of processed data:")
            print(f"Shape: {processed_df.shape}")
            print("\nColumns:", processed_df.columns.tolist())
            print("\nSample data:")
            print(processed_df.head())

            print("\nData summary:")
            print(f"Date range: {processed_df['date'].min()} to {processed_df['date'].max()}")
            print(f"Companies: {processed_df['ticker'].nunique()}")
            print(f"Total records: {len(processed_df)}")

            # Per-column count of missing values left after cleaning.
            print("\nNaN values in each column:")
            print(processed_df.isna().sum())
    except Exception as e:
        print(f"\nFailed to run processing: {str(e)}")

Starting technical indicator processing...
Input file: /root/nfs/AJ FinRag/Company Raw Data/all_companies.csv
Output directory: /root/nfs/AJ FinRag/Company Processed Data
Loading data from: /root/nfs/AJ FinRag/Company Raw Data/all_companies.csv
Processing 25 companies...

Processing indicators for AAPL...
Saved CSV: /root/nfs/AJ FinRag/Company Processed Data/AAPL_processed.csv
Saved JSON: /root/nfs/AJ FinRag/Company Processed Data/AAPL_processed.json

Processing indicators for MSFT...
Saved CSV: /root/nfs/AJ FinRag/Company Processed Data/MSFT_processed.csv
Saved JSON: /root/nfs/AJ FinRag/Company Processed Data/MSFT_processed.json

Processing indicators for GOOGL...
Saved CSV: /root/nfs/AJ FinRag/Company Processed Data/GOOGL_processed.csv
Saved JSON: /root/nfs/AJ FinRag/Company Processed Data/GOOGL_processed.json

Processing indicators for AMZN...
Saved CSV: /root/nfs/AJ FinRag/Company Processed Data/AMZN_processed.csv
Saved JSON: /root/nfs/AJ FinRag/Company Processed Data/AMZN_processe