In [1]:
import pandas as pd

# File paths
file1_path = 'combined_stock_data2_filtered.csv'
file2_path = 'reshaped_engineered_features.csv'
output_path = 'merged_file.csv'

# Load the second dataset (df2) into memory.
# As used below, df2 has a 'Date' column, a 'Stock' column that actually holds
# the indicator name, and one value column per stock symbol.
df2 = pd.read_csv(file2_path)

# Reshape df2 from wide (one column per symbol) to long (one row per
# Date / symbol / indicator). melt() does this in one vectorised step instead
# of building a Python dict per cell with iterrows(). The original 'Stock'
# column is renamed first so the melted symbol column can take that name.
df2_transformed = (
    df2.rename(columns={'Stock': 'Indicator'})
    .melt(id_vars=['Date', 'Indicator'], var_name='Stock', value_name='Value')
)
df2_transformed['Stock'] = df2_transformed['Stock'].str.upper()  # Convert to uppercase to match df1
df2_transformed = df2_transformed[['Date', 'Stock', 'Indicator', 'Value']]

# Pivot so that every indicator becomes its own column, keyed by (Date, Stock).
# pivot_table averages duplicates of the same (Date, Stock, Indicator) key.
df2_pivoted = df2_transformed.pivot_table(index=['Date', 'Stock'], columns='Indicator', values='Value').reset_index()

# Number of df1 rows held in memory at once; tune to the available RAM
chunk_size = 10000

# True until the first chunk has been written (that write creates the file and its header)
first_chunk = True

# Stream df1 from disk, join each piece against the pivoted indicators and write it out
for chunk in pd.read_csv(file1_path, chunksize=chunk_size):
    merged_chunk = chunk.merge(df2_pivoted, on=['Date', 'Stock'], how='inner')

    # The first write truncates the file and emits the header; later writes append rows only
    merged_chunk.to_csv(
        output_path,
        index=False,
        mode='w' if first_chunk else 'a',
        header=first_chunk,
    )
    first_chunk = False

print("Merged data saved to 'merged_file.csv'")

Merged data saved to 'merged_file.csv'


In [3]:
import pandas as pd
# Reload the merged output from disk and print its first rows as a sanity check
df = pd.read_csv('merged_file.csv')
preview_rows = df.head()
print(preview_rows)

         Date       Open       High        Low      Close  Adj Close  \
0  2002-02-20  20.028612  21.316166  20.028612  20.643776  17.754581   
1  2002-02-20  86.267700  88.622643  86.147552  87.973831  67.761444   
2  2002-02-20   2.060000   2.190000   2.060000   2.190000   2.023804   
3  2002-02-20   4.933333   4.933333   4.933333   4.933333   4.613485   
4  2002-02-20   2.632428   2.719342   2.632428   2.714074   2.305438   

      Volume Stock      Macd  Macd_Signal    Return        Rsi  Volatility  
0  7024900.0     A -0.564911    -0.703869  0.107869  53.930432    0.586696  
1  1514400.0    AA  0.345131    -0.203684  0.038581  57.389682    0.377003  
2      300.0  AAME -0.068111    -0.075604  0.000000  51.213557    0.797065  
3        0.0   AAN  0.220614     0.225360  0.000000  65.632980    0.502426  
4    87000.0  AAON -0.007262    -0.006858  0.040909  48.335447    0.418702  


In [10]:
# Show the last five rows of the frame
df.tail(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock,Macd,Macd_Signal,Return,Rsi,Volatility,Year
11356167,2020-04-01,10.14,10.34,9.31,9.47,9.47,34000.0,ZEUS,-0.825304,-0.973804,-0.085024,39.881108,0.971739,2020
11356168,2020-04-01,25.870001,26.290001,25.02,25.32,25.32,3837200.0,ZION,-3.62744,-4.124144,-0.053812,30.347153,0.86126,2020
11356169,2020-04-01,4.11,4.16,3.8,3.82,3.82,539500.0,ZIXI,-0.791701,-0.870836,-0.113689,35.274266,1.612495,2020
11356170,2020-04-01,20.889999,21.190001,20.290001,20.389999,20.389999,33800.0,ZNH,-1.737241,-1.818122,-0.061234,36.502998,0.982866,2020
11356171,2020-04-01,6.99,6.99,6.63,6.74,6.74,193400.0,ZTR,-0.87445,-1.037236,-0.063889,38.583506,1.524556,2020


In [7]:
# Parse 'Date' as ISO dates; values that do not match the format become NaT (errors='coerce')
parsed_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')
df['Date'] = parsed_dates

# Derive the calendar year of every row
df['Year'] = df['Date'].dt.year

# Collect the distinct years and list them in ascending order
unique_years = df['Year'].unique()
years_ascending = sorted(unique_years)

print("Unique years in the DataFrame:")
print(years_ascending)

Unique years in the DataFrame:
[2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]


In [4]:
import pandas as pd

# Step 1: read the full price history, parsing 'Date' into datetimes
print("Loading original dataset...")
df = pd.read_csv('combined_stock_data2.csv', parse_dates=['Date'])

# Step 2: keep only rows dated 2002-01-01 or later
print("Filtering entries before 2002...")
from_2002_onwards = df['Date'] >= '2002-01-01'
df = df[from_2002_onwards]

# Step 3: persist the filtered rows (without the index) to a new CSV file
output_file = 'combined_stock_data2_filtered.csv'
df.to_csv(output_file, index=False)
print(f"Filtered dataset saved to '{output_file}'.")

Loading original dataset...
Filtering entries before 2002...
Filtered dataset saved to 'combined_stock_data2_filtered.csv'.


In [8]:
# Tag each row with the calendar year of its 'Date'
df['Year'] = df['Date'].dt.year

# Count rows per year, then turn the counts into a two-column table (Year, Entries)
rows_by_year = df.groupby('Year').size()
entries_per_year = rows_by_year.reset_index(name='Entries')

print("Number of entries for each year:")
print(entries_per_year)

Number of entries for each year:
    Year  Entries
0   2002   517143
1   2003   618061
2   2004   630063
3   2005   630155
4   2006   627572
5   2007   627685
6   2008   632585
7   2009   629438
8   2010   629929
9   2011   629806
10  2012   625241
11  2013   630194
12  2014   629708
13  2015   629611
14  2016   630063
15  2017   627516
16  2018   625696
17  2019   628464
18  2020   157242


In [9]:
columns_to_check = ['Macd', 'Rsi', 'Volatility', 'Return']


def _count_missing(group):
    """Return the number of NaN entries in each column of one year's rows."""
    return group.isna().sum()


# For every year, count the NaN values in each of the indicator columns
nan_counts_per_year = df.groupby('Year')[columns_to_check].apply(_count_missing).reset_index()

print("Number of NaN values for each year:")
print(nan_counts_per_year)

Number of NaN values for each year:
    Year  Macd  Rsi  Volatility  Return
0   2002     0    0           0       0
1   2003     0    0           0       0
2   2004     0    0           0       0
3   2005     0    0           0       0
4   2006     0    0           0       0
5   2007     0    0           0       0
6   2008     0    0           0       0
7   2009     0    0           0       0
8   2010     0    0           0       0
9   2011     0    0           0       0
10  2012     0    0           0       0
11  2013     0    0           0       0
12  2014     0    0           0       0
13  2015     0    0           0       0
14  2016     0    0           0       0
15  2017     0    0           0       0
16  2018     0    0           0       0
17  2019     0    0           0       0
18  2020     0    0           0       0


In [14]:
import pandas as pd
import talib  # Import TA-Lib
from tqdm import tqdm  # For progress bar

# Function to calculate RSI using TA-Lib
def calculate_rsi(prices, period=14):
    """
    Return the TA-Lib RSI of ``prices`` computed with ``timeperiod=period``.
    """
    rsi_values = talib.RSI(prices, timeperiod=period)
    return rsi_values

# Function to calculate MACD using TA-Lib
def calculate_macd(prices, short_period=12, long_period=26, signal_period=9):
    """
    Return the MACD line and its signal line as computed by TA-Lib.

    The third value returned by ``talib.MACD`` is discarded.
    """
    macd_line, signal_line, _unused = talib.MACD(
        prices,
        fastperiod=short_period,
        slowperiod=long_period,
        signalperiod=signal_period,
    )
    return macd_line, signal_line

import numpy as np  # np.sqrt is used below; this cell's own imports do not include numpy

# Step 1: Load the cleaned dataset in chunks
chunk_size = 10000  # Adjust based on your system's memory
chunks = pd.read_csv('cleaned_adjusted_close_prices.csv', parse_dates=['Date'], chunksize=chunk_size)

# Initialize a list to store processed chunks
processed_chunks = []

# Process each chunk.
# NOTE(review): every chunk is processed on its own, so the rolling window and
# the TA-Lib indicators start over at each chunk boundary, and the warm-up
# rows of every chunk are removed by the dropna() below. The result matches a
# whole-file computation only when the file fits in a single chunk.
for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
    # Use 'Date' as the index (rebinding instead of mutating the chunk in place)
    chunk = chunk.set_index('Date')

    # Feature Engineering: one small frame of indicators per stock column
    all_features = {}
    for stock in chunk.columns:
        prices = chunk[stock]
        # Day-over-day percentage change; the first (undefined) value is dropped
        returns = prices.pct_change().dropna()
        # 21-row rolling standard deviation of returns, scaled by sqrt(252) to annualise
        volatility = returns.rolling(window=21).std() * np.sqrt(252)
        rsi = calculate_rsi(prices, period=14)
        macd, signal = calculate_macd(prices, short_period=12, long_period=26, signal_period=9)
        stock_features = pd.DataFrame({
            f'{stock}_Return': returns,
            f'{stock}_Volatility': volatility,
            f'{stock}_RSI': rsi,
            f'{stock}_MACD': macd,
            f'{stock}_MACD_Signal': signal
        })
        all_features[stock] = stock_features

    # Skip this chunk if no features were computed (the chunk had no stock columns)
    if not all_features:
        print(f"Chunk {i + 1} has no features computed. Skipping...")
        continue

    # Combine features for all stocks side by side and keep only fully populated rows
    features = pd.concat(all_features.values(), axis=1).dropna()

    # Skip this chunk if no features are available after concatenation
    if features.empty:
        print(f"Chunk {i + 1} has no features after concatenation. Skipping...")
        continue

    # Append the processed chunk to the list
    processed_chunks.append(features)

# Combine all processed chunks into a single DataFrame
if processed_chunks:
    final_features = pd.concat(processed_chunks)
    # Save the final features to a new CSV file
    final_features.to_csv('engineered_features.csv')
    print("\nEngineered features saved to 'engineered_features.csv'.")
else:
    print("No features were computed. Check the input dataset and filtering conditions.")

Processing chunks: 1it [00:11, 11.15s/it]



Engineered features saved to 'engineered_features.csv'.


In [2]:
# Step 4: Load the reshaped engineered features
print("Loading reshaped engineered features...")
reshaped_features = pd.read_csv('reshaped_engineered_features.csv', parse_dates=['Date'])

# Step 5: Load the original dataset in chunks
chunk_size = 10000  # Adjust based on your system's memory
chunks = pd.read_csv('combined_stock_data2.csv', parse_dates=['Date'], chunksize=chunk_size)

# Step 6: Initialize the output CSV file
output_file = 'original_with_engineered_features_chunked.csv'
header = True  # True only until the first chunk has been written

# Step 7: Process each chunk and write to CSV
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i + 1}...")

    # Ensure the column names are consistent (trimmed, Title Case)
    chunk.columns = chunk.columns.str.strip().str.title()

    # Merge the chunk with the reshaped engineered features on 'Date' and 'Stock'
    merged_chunk = pd.merge(
        chunk,  # Original chunk
        reshaped_features,  # Reshaped engineered features
        on=['Date', 'Stock'],  # Merge on Date and Stock
        how='inner'  # Keep only rows with matching Date and Stock
    )

    # The first chunk overwrites any file left by a previous run and writes the
    # header; later chunks append. Appending from the very first chunk would
    # stack a second header and duplicate rows onto a stale file on re-run.
    merged_chunk.to_csv(output_file, mode='w' if header else 'a', index=False, header=header)
    header = False  # Do not write header for subsequent chunks

print("\nFinal dataset saved to 'original_with_engineered_features_chunked.csv'.")

Loading reshaped engineered features...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Processing chunk 42...
Pro

In [None]:
def calculate_rsi(prices, period=14):
    """
    Compute the RSI of ``prices`` through TA-Lib, using ``period`` as the time period.
    """
    rsi_series = talib.RSI(prices, timeperiod=period)
    return rsi_series

# Function to calculate MACD using TA-Lib
def calculate_macd(prices, short_period=12, long_period=26, signal_period=9):
    """
    Compute the MACD line and the signal line through TA-Lib.

    Only the first two of the three values returned by ``talib.MACD`` are
    passed on to the caller.
    """
    macd_outputs = talib.MACD(
        prices,
        fastperiod=short_period,
        slowperiod=long_period,
        signalperiod=signal_period,
    )
    macd_line, signal_line, _ = macd_outputs
    return macd_line, signal_line

# Per-stock feature tables, keyed by the stock's column name
all_features = {}
for stock in adjusted_close_prices.columns:
    prices = adjusted_close_prices[stock]

    # Day-over-day percentage change; the first (undefined) value is dropped
    returns = prices.pct_change().dropna()

    # 21-row rolling standard deviation of returns, scaled by sqrt(252) to annualise
    annualisation_factor = np.sqrt(252)
    volatility = returns.rolling(window=21).std() * annualisation_factor

    # Momentum indicators from TA-Lib
    rsi = calculate_rsi(prices, period=14)
    macd, signal = calculate_macd(prices, short_period=12, long_period=26, signal_period=9)

    # One column per indicator, each prefixed with the stock name
    stock_features = pd.DataFrame({
        f'{stock}_Return': returns,
        f'{stock}_Volatility': volatility,
        f'{stock}_RSI': rsi,
        f'{stock}_MACD': macd,
        f'{stock}_MACD_Signal': signal
    })
    all_features[stock] = stock_features

# Put every stock's columns side by side and keep only rows without any NaN
combined_features = pd.concat(all_features.values(), axis=1)
features = combined_features.dropna()

# Report the size of the feature table and show its first rows
print("\nEngineered Features Shape:", features.shape)
print("\nEngineered Features Preview:")
print(features.head())

# ------------------------------------------------------------------------------------
# Step 8: Merge the original dataset with the new features
# ------------------------------------------------------------------------------------

# 'Date' must be an ordinary column for the merge below. Only reset the index
# when it is not one already: an unconditional reset_index() on a frame whose
# 'Date' is already a column just adds a spurious 'index' column, and doing it
# in place makes the cell unsafe to run a second time.
if 'Date' not in df.columns:
    df = df.reset_index()

# Merge the original dataset with the features DataFrame on 'Date'
# (features is indexed by date, hence right_index=True)
merged_df = pd.merge(df, features, left_on='Date', right_index=True, how='inner')

# Display the merged dataset shape and preview
print("\nMerged Dataset Shape:", merged_df.shape)
print("\nMerged Dataset Preview:")
print(merged_df.head())

# ------------------------------------------------------------------------------------
# Step 9: Save the merged dataset to a new CSV file
# ------------------------------------------------------------------------------------

merged_df.to_csv('original_with_new_features.csv', index=False)
print("\nMerged dataset saved to 'original_with_new_features.csv'.")

In [1]:
import numpy as np
import pandas as pd

# Step 1: Load the dataset
df = pd.read_csv('combined_stock_data2.csv', parse_dates=['Date'])

# Ensure the column names are consistent (trimmed, Title Case)
df.columns = df.columns.str.strip().str.title()

# Display the first few rows of the dataset
print("Original Dataset:")
print(df.head())

# Step 2: Pivot the data to get adjusted close prices per stock
# (rows = dates, columns = stocks)
adjusted_close_prices = df.pivot(index='Date', columns='Stock', values='Adj Close')

# Display the shape of the pivoted data
print("\nPivoted Dataset Shape:", adjusted_close_prices.shape)

# Step 3: Restrict the time range
# Focus on recent years (e.g., 2002 onwards)
start_date = "2002-01-01"
adjusted_close_prices = adjusted_close_prices.loc[start_date:]

# Display the shape after restricting the time range
print("\nDataset Shape After Restricting Time Range:", adjusted_close_prices.shape)

# Step 4: Drop rows (dates) with excessive missing data
# A date is kept only when fewer than 70% of its stock columns are missing,
# i.e. up to (just under) 70% missing values are tolerated per row.
# NOTE(review): the earlier comment spoke of "70% valid data required", which
# is the opposite of what this comparison does - confirm the intended threshold.
row_threshold = 0.7 * adjusted_close_prices.shape[1]
adjusted_close_prices = adjusted_close_prices.loc[adjusted_close_prices.isna().sum(axis=1) < row_threshold]

# Display the shape after dropping rows with excessive missing data
print("\nDataset Shape After Dropping Rows with Excessive Missing Data:", adjusted_close_prices.shape)

# Step 5: Drop stocks (columns) with excessive missing data
# A stock is kept only when fewer than 10% of its remaining dates are missing
stock_threshold = 0.1 * adjusted_close_prices.shape[0]
adjusted_close_prices = adjusted_close_prices.loc[:, adjusted_close_prices.isna().sum() < stock_threshold]

# Display the shape after dropping stocks with excessive missing data
print("\nDataset Shape After Dropping Stocks with Excessive Missing Data:", adjusted_close_prices.shape)

# Step 6: Fill remaining missing values
# Forward-fill first, then backward-fill whatever is still missing at the start
# of a column. ffill()/bfill() replace the deprecated fillna(method=...) form,
# and rebinding avoids mutating a slice of the pivoted frame in place.
# Note that the backward fill copies later prices into earlier dates.
adjusted_close_prices = adjusted_close_prices.ffill().bfill()

# Display the final dataset shape and preview
print("\nFinal Dataset Shape:", adjusted_close_prices.shape)
print("\nFinal Dataset Preview:")
print(adjusted_close_prices.head())

# Optional: Save the cleaned dataset to a new file
adjusted_close_prices.to_csv('cleaned_adjusted_close_prices.csv')

Original Dataset:
        Date      Open      High       Low     Close  Adj Close    Volume  \
0 1962-01-02  6.532155  6.556185  6.532155  6.532155   1.536658   55900.0   
1 1962-01-02  6.125844  6.160982  6.125844  6.125844   1.414651   59700.0   
2 1962-01-02  0.837449  0.837449  0.823045  0.823045   0.145748  352200.0   
3 1962-01-02  1.604167  1.619792  1.588542  1.604167   0.136957  163200.0   
4 1962-01-02  0.000000  3.296131  3.244048  3.296131   0.051993  105600.0   

  Stock  
0    AA  
1  ARNC  
2    BA  
3   CAT  
4   CVX  

Pivoted Dataset Shape: (14717, 5884)

Dataset Shape After Restricting Time Range: (4617, 5884)

Dataset Shape After Dropping Rows with Excessive Missing Data: (4594, 5884)

Dataset Shape After Dropping Stocks with Excessive Missing Data: (4594, 2501)


  adjusted_close_prices.fillna(method='ffill', inplace=True)
  adjusted_close_prices.fillna(method='bfill', inplace=True)



Final Dataset Shape: (4594, 2501)

Final Dataset Preview:
Stock               A         AA      AAME       AAN      AAON        AAP  \
Date                                                                        
2002-01-02  17.994514  65.743896  2.051527  3.740664  2.460924  15.107107   
2002-01-03  19.132622  66.591476  2.125456  3.823791  2.288659  14.322977   
2002-01-04  20.166155  68.728867  2.097733  3.823791  2.349064  13.437671   
2002-01-07  20.086178  70.313538  2.116215  3.823791  2.347945  13.216344   
2002-01-08  20.147697  68.802628  2.310278  4.161834  2.349064  12.995018   

Stock           AAPL      AAXN         AB       ABB  ...       YUM       YUMA  \
Date                                                 ...                        
2002-01-02  1.444668  1.115833  13.765452  5.096856  ...  5.662119  95.979019   
2002-01-03  1.462029  1.120833  13.919203  5.467911  ...  5.781286  95.979019   
2002-01-04  1.468849  1.100833  14.249454  5.725589  ...  6.005459  95.979019

In [2]:
# Show the first five rows of the frame
df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock
0,1962-01-02,6.532155,6.556185,6.532155,6.532155,1.536658,55900.0,AA
1,1962-01-02,6.125844,6.160982,6.125844,6.125844,1.414651,59700.0,ARNC
2,1962-01-02,0.837449,0.837449,0.823045,0.823045,0.145748,352200.0,BA
3,1962-01-02,1.604167,1.619792,1.588542,1.604167,0.136957,163200.0,CAT
4,1962-01-02,0.0,3.296131,3.244048,3.296131,0.051993,105600.0,CVX


In [3]:
import numpy as np
import pandas as pd
import talib  # Import TA-Lib

# ------------------------------------------------------------------------------------
# Step 6: Feature Engineering (Using TA-Lib)
# ------------------------------------------------------------------------------------

# Function to calculate RSI using TA-Lib
def calculate_rsi(prices, period=14):
    """
    Relative Strength Index of ``prices`` from TA-Lib, with ``timeperiod=period``.
    """
    rsi_result = talib.RSI(prices, timeperiod=period)
    return rsi_result

# Function to calculate MACD using TA-Lib
def calculate_macd(prices, short_period=12, long_period=26, signal_period=9):
    """
    MACD line and signal line of ``prices`` from TA-Lib.

    ``talib.MACD`` returns three values; the last one is not used.
    """
    macd_kwargs = {
        'fastperiod': short_period,
        'slowperiod': long_period,
        'signalperiod': signal_period,
    }
    macd_line, signal_line, _ = talib.MACD(prices, **macd_kwargs)
    return macd_line, signal_line

# Feature tables collected per stock, keyed by the stock's column name
all_features = {}

# Build the indicator columns for one stock at a time
for stock in adjusted_close_prices.columns:
    prices = adjusted_close_prices[stock]

    # Daily percentage returns, without the undefined first value
    returns = prices.pct_change().dropna()

    # Rolling 21-row standard deviation of returns, annualised with sqrt(252)
    rolling_std = returns.rolling(window=21).std()
    volatility = rolling_std * np.sqrt(252)

    # RSI and MACD / signal line via the TA-Lib wrappers
    rsi = calculate_rsi(prices, period=14)
    macd, signal = calculate_macd(prices, short_period=12, long_period=26, signal_period=9)

    # Columns are named '<stock>_<indicator>'
    stock_features = pd.DataFrame({
        f'{stock}_Return': returns,
        f'{stock}_Volatility': volatility,
        f'{stock}_RSI': rsi,
        f'{stock}_MACD': macd,
        f'{stock}_MACD_Signal': signal
    })

    all_features[stock] = stock_features

# Join all stocks column-wise, then drop every row that still contains a NaN
features = pd.concat(all_features.values(), axis=1)
features = features.dropna()

# Summarise the result
print("\nEngineered Features Shape:", features.shape)
print("\nEngineered Features Preview:")
print(features.head())

# Write the feature table (including its date index) to disk
features.to_csv('engineered_features.csv')


Engineered Features Shape: (4561, 12505)

Engineered Features Preview:
            A_Return  A_Volatility      A_RSI    A_MACD  A_MACD_Signal  \
Date                                                                     
2002-02-20  0.107869      0.586696  53.930432 -0.564911      -0.703869   
2002-02-21 -0.024255      0.568822  50.728646 -0.492460      -0.661587   
2002-02-22  0.003551      0.559301  51.174603 -0.425176      -0.614305   
2002-02-25  0.026185      0.558100  54.459439 -0.331299      -0.557704   
2002-02-26  0.024483      0.563412  57.419435 -0.219130      -0.489989   

            AA_Return  AA_Volatility     AA_RSI   AA_MACD  AA_MACD_Signal  \
Date                                                                        
2002-02-20   0.038581       0.377003  57.389682  0.345131       -0.203684   
2002-02-21   0.023764       0.382840  61.223868  0.600317       -0.042884   
2002-02-22   0.008538       0.371582  62.558369  0.840655        0.133824   
2002-02-25   0.006085   

In [10]:
# Check if the index is a DatetimeIndex
print("Is the index a DatetimeIndex?", isinstance(adjusted_close_prices.index, pd.DatetimeIndex))


def _missing_business_days(index):
    """Return the business days between the first and last label of ``index`` that are absent from it."""
    if len(index) == 0:
        return index
    expected_days = pd.bdate_range(index.min(), index.max())
    return expected_days.difference(index)


# Check for missing dates. Index.isnull() only flags NaT labels, so it can
# never report a date that is simply absent; compare against the full
# business-day calendar instead.
print("Missing dates in the index:", _missing_business_days(adjusted_close_prices.index))

# Fill missing dates and forward-fill missing values: asfreq('B') inserts a row
# for every business day, and ffill() copies the last known prices into them.
adjusted_close_prices = adjusted_close_prices.asfreq('B').ffill()

# Verify that there are no missing dates
print("Missing dates after filling:", _missing_business_days(adjusted_close_prices.index))

# Infer and set the frequency explicitly
freq = pd.infer_freq(adjusted_close_prices.index)
if freq is None:
    freq = 'B'  # Default to business days if frequency cannot be inferred
adjusted_close_prices.index.freq = freq

# Verify the frequency
print("Frequency of the index:", adjusted_close_prices.index.freq)

Is the index a DatetimeIndex? True
Missing dates in the index: DatetimeIndex([], dtype='datetime64[ns]', name='Date', freq='B')
Missing dates after filling: DatetimeIndex([], dtype='datetime64[ns]', name='Date', freq='B')
Frequency of the index: <BusinessDay>


In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error
import warnings

# Suppress ARIMA warnings
# NOTE: the two filterwarnings calls below sit inside a triple-quoted string,
# so they are never executed and no warnings are suppressed. ConvergenceWarning
# is not imported in this cell, so it would need an import before the calls
# could be enabled.
'''warnings.filterwarnings("ignore", category=UserWarning, module="statsmodels")
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="statsmodels")'''

# ------------------------------------------------------------------------------------
# Step 1: Check and Make Data Stationary
# ------------------------------------------------------------------------------------

# Function to check stationarity
def check_stationarity(prices):
    """
    Run the Augmented Dickey-Fuller test on ``prices`` (NaNs removed) and print
    the test statistic, the p-value, the critical values and a verdict.

    The series is reported as stationary when the p-value is at most 0.05.
    """
    adf_output = adfuller(prices.dropna())
    adf_statistic = adf_output[0]
    p_value = adf_output[1]
    critical_values = adf_output[4]

    print('ADF Statistic:', adf_statistic)
    print('p-value:', p_value)
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print(f'   {level}: {threshold}')

    verdict = "The series is stationary." if p_value <= 0.05 else "The series is non-stationary."
    print(verdict)

# Function to make data stationary
def make_stationary(prices):
    """
    Return the first difference of ``prices`` with the leading NaN removed.
    """
    first_difference = prices.diff()
    return first_difference.dropna()

# Demonstrate the stationarity check on the first column, before and after differencing
sample_stock = adjusted_close_prices.columns[0]
sample_prices = adjusted_close_prices[sample_stock]

print(f"Checking stationarity for {sample_stock}:")
check_stationarity(sample_prices)

stationary_prices = make_stationary(sample_prices)
print(f"Checking stationarity after differencing for {sample_stock}:")
check_stationarity(stationary_prices)

# ------------------------------------------------------------------------------------
# Step 2: Fit ARIMA to Each Stock and Extract Residuals
# ------------------------------------------------------------------------------------

# Function to fit ARIMA and extract residuals
def fit_arima_and_get_residuals(prices, order=(1, 1, 0), start_params=None):
    """
    Fit an ARIMA model to the prices and return the residuals.

    Parameters
    ----------
    prices : pandas.Series
        Series to model; its index is passed to ARIMA as ``dates``.
    order : tuple
        The (p, d, q) order handed to ARIMA.
    start_params : array-like, optional
        Starting values for the optimiser. Left as ``None`` so that
        statsmodels derives its own. The number of parameters depends on
        ``order``, so a fixed-length list cannot be valid for every order
        (the previous hard-coded three values do not appear to match the
        default (1, 1, 0) model - TODO confirm against the statsmodels docs).

    Returns
    -------
    The ``resid`` attribute of the fitted results.
    """
    model = ARIMA(prices, order=order, dates=prices.index)
    results = model.fit(start_params=start_params)
    return results.resid

# ARIMA residuals collected per stock, keyed by the stock's column name
residuals_dict = {}

# For every stock: drop NaNs, difference the series, fit ARIMA and keep the residuals.
# NOTE(review): the series is differenced here and the ARIMA order (1, 1, 0)
# differences once more - confirm that this is intended.
for stock in adjusted_close_prices.columns:
    prices = adjusted_close_prices[stock].dropna()
    stationary_prices = make_stationary(prices)
    residuals = fit_arima_and_get_residuals(stationary_prices, order=(1, 1, 0))
    residuals_dict[stock] = residuals

# One column of residuals per stock
residuals_df = pd.DataFrame(residuals_dict)

# Report the size of the residual table and show its first rows
print("\nResiduals Shape:", residuals_df.shape)
print("\nResiduals Preview:")
print(residuals_df.head())

# ------------------------------------------------------------------------------------
# Step 3: Use Residuals as Input to LSTM
# ------------------------------------------------------------------------------------

# Rescale every residual column to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(residuals_df)
residuals_scaled = scaler.transform(residuals_df)

# Function to create sequences for LSTM
def create_sequences(data, sequence_length=60):
    """
    Slice ``data`` into overlapping windows for LSTM input.

    Returns a pair of arrays: every window of ``sequence_length`` consecutive
    entries, and the entry that immediately follows each window.
    """
    window_starts = range(len(data) - sequence_length)
    windows = [data[start:start + sequence_length] for start in window_starts]
    targets = [data[start + sequence_length] for start in window_starts]
    return np.array(windows), np.array(targets)

# Each sample is 60 consecutive rows of scaled residuals; the target is the next row
sequence_length = 60
X_seq, y_seq = create_sequences(residuals_scaled, sequence_length)

# Chronological split: first 80% of the samples for training, the rest for testing
split = int(0.8 * len(X_seq))
X_train, y_train = X_seq[:split], y_seq[:split]
X_test, y_test = X_seq[split:], y_seq[split:]

# Two stacked LSTM layers with dropout, then a dense layer with one output per stock
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(y_train.shape[1]),
])
model.compile(optimizer='adam', loss='mse')

# Fit the network; the test split doubles as the validation set
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
)

# Score the predictions on the test split, over all stocks at once
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test.ravel(), y_pred.ravel())
print(f"Mean Squared Error: {mse}")

# Map the scaled predictions back to the original residual scale
predicted_residuals = scaler.inverse_transform(y_pred)

# Show the first few predicted residual rows
print("\nPredicted Residuals Preview:")
print(predicted_residuals[:5])

# ------------------------------------------------------------------------------------
# Step 4: Combine ARIMA and LSTM Predictions
# ------------------------------------------------------------------------------------

# Function to get ARIMA predictions
def get_arima_predictions(prices, order=(1, 1, 0), start_params=None):
    """
    Get ARIMA predictions for the prices.

    Parameters
    ----------
    prices : pandas.Series
        Series to model; its index is passed to ARIMA as ``dates``.
    order : tuple
        The (p, d, q) order handed to ARIMA.
    start_params : array-like, optional
        Starting values for the optimiser. Left as ``None`` so that
        statsmodels derives its own. The number of parameters depends on
        ``order``, so a fixed-length list cannot be valid for every order
        (the previous hard-coded three values do not appear to match the
        default (1, 1, 0) model - TODO confirm against the statsmodels docs).

    Returns
    -------
    The fitted model's predictions for positions 0 through ``len(prices) - 1``.
    """
    model = ARIMA(prices, order=order, dates=prices.index)
    results = model.fit(start_params=start_params)
    return results.predict(start=0, end=len(prices) - 1)

# Dictionary to store final predictions for each stock
final_predictions_dict = {}

# The LSTM predictions cover only the test targets. Test sample k has its
# target at row split + sequence_length + k of the residual table, so these are
# the dates the combined forecast can be produced for.
test_start = split + sequence_length
test_index = residuals_df.index[test_start:]

# Loop through each stock to combine ARIMA and LSTM predictions
for i, stock in enumerate(adjusted_close_prices.columns):
    prices = adjusted_close_prices[stock].dropna()
    stationary_prices = make_stationary(prices)

    # Get ARIMA predictions (these span the whole differenced series)
    arima_predictions = get_arima_predictions(stationary_prices, order=(1, 1, 0))

    # Get LSTM predictions for residuals (test window only), labelled by date
    lstm_predictions = pd.Series(predicted_residuals[:, i], index=test_index)

    # Combine ARIMA and LSTM predictions on the shared test dates. Adding the
    # full-length ARIMA series to the shorter LSTM array directly fails
    # because the two lengths differ.
    final_predictions = arima_predictions.reindex(test_index) + lstm_predictions

    # Store the final predictions
    final_predictions_dict[stock] = final_predictions

# Convert final predictions to a DataFrame
final_predictions_df = pd.DataFrame(final_predictions_dict, index=test_index)

# Display the final predictions
print("\nFinal Predictions Shape:", final_predictions_df.shape)
print("\nFinal Predictions Preview:")
print(final_predictions_df.head())

Checking stationarity for A:
ADF Statistic: -0.18599325917636922
p-value: 0.940170719549955
Critical Values:
   1%: -3.431731221687337
   5%: -2.862150343170579
   10%: -2.5670948876908053
The series is non-stationary.
Checking stationarity after differencing for A:
ADF Statistic: -14.718258256809422
p-value: 2.7779183210914663e-27
Critical Values:
   1%: -3.431731221687337
   5%: -2.862150343170579
   10%: -2.5670948876908053
The series is stationary.


