In [5]:
import pandas as pd

# Load the CSV file, skipping the first row and treating the first proper row as the header
file_path = 'cleaned_stock_data.csv'
data = pd.read_csv(file_path, skiprows=1)

# Print the first few rows to verify correct loading
print("Initial Data Preview:")
print(data.head())

# The date column is the first column in the dataset
date_column = data.columns[0]

# Ensure the 'Date' column is in datetime format, using errors='coerce' to handle non-date values
data[date_column] = pd.to_datetime(data[date_column], format='%d/%m/%Y', errors='coerce')

# Print the first few rows to check the 'Date' conversion
print("Data after Date conversion:")
print(data.head())

# Check for missing values in the 'Date' column
missing_dates = data[date_column].isnull().sum()
print(f"Missing dates: {missing_dates}")

# Drop rows with invalid dates if any
data = data.dropna(subset=[date_column])

# Print the first few rows after dropping invalid dates
print("Data after dropping invalid dates:")
print(data.head())

# Check for duplicate rows
duplicate_rows = data.duplicated()
print(f"Duplicate rows: {duplicate_rows.sum()}")

# Drop duplicate rows if any
data = data.drop_duplicates()

# Display data types to ensure consistency
print("Data types after cleaning:")
print(data.dtypes)

# Save cleaned data
cleaned_file_path = 'cleaned_stock_data_cleaned.csv'
data.to_csv(cleaned_file_path, index=False)

print(f"Cleaned data saved to {cleaned_file_path}")


Initial Data Preview:
       Ticker       AAPL       AMZN       BRK-B      GOOGL         JNJ  \
0        Date        NaN        NaN         NaN        NaN         NaN   
1  2019-01-02  37.793777  76.956497  202.800003  52.673550  109.850533   
2  2019-01-03  34.029243  75.014000  191.660004  51.214722  108.104958   
3  2019-01-04  35.481930  78.769501  195.199997  53.841705  109.919334   
4  2019-01-07  35.402943  81.475502  196.910004  53.734329  109.214211   

         JPM        META       MSFT      NVDA  ...       AAPL.5       AMZN.5  \
0        NaN         NaN        NaN       NaN  ...          NaN          NaN   
1  83.855164  135.401749  95.673470  3.378612  ...  148158800.0  159662000.0   
2  82.663452  131.469833  92.153801  3.174486  ...  365248800.0  139512000.0   
3  85.710884  137.667099  96.439835  3.377867  ...  234428400.0  183652000.0   
4  85.770477  137.766891  96.562813  3.556694  ...  219111200.0  159864000.0   

     BRK-B.5     GOOGL.5      JNJ.5       JPM.5     

# Downloading the yfinance data in a clean format.

In [6]:
import yfinance as yf
import pandas as pd

# List of stock symbols
stocks = ['AAPL', 'AMZN', 'BRK-B', 'GOOGL', 'JNJ', 'JPM', 'META', 'MSFT', 'NVDA', 'TSLA']

# Initialize an empty list to store DataFrames
data_frames = []

# Iterate through each stock and download the data
for stock in stocks:
    # Download historical data
    stock_data = yf.download(stock, start="2019-01-01", end="2023-12-31")
    
    # Add a column for the stock name
    stock_data['Stock'] = stock
    
    # Reset index to have 'Date' as a column
    stock_data.reset_index(inplace=True)
    
    # Reorder columns
    stock_data = stock_data[['Date', 'Stock', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]
    
    # Append the DataFrame to the list
    data_frames.append(stock_data)

# Concatenate all DataFrames into a single DataFrame
formatted_data = pd.concat(data_frames, ignore_index=True)

# Save the formatted data to a new CSV file
formatted_data.to_csv('formatted_stock_data.csv', index=False)

print("Data download and formatting complete.")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Data download and formatting complete.


## Converting the Data from strings to number.

In [7]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'formatted_stock_data.csv'
data = pd.read_csv(file_path)

# Convert the columns to the appropriate numeric types
# Assuming the first two columns are 'Date' and 'Stock', and we don't need to convert them
columns_to_convert = data.columns[2:]  # Skip 'Date' and 'Stock' columns

for column in columns_to_convert:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Save the DataFrame back to a CSV file
output_file_path = 'formatted_stock_data_cleaned.csv'
data.to_csv(output_file_path, index=False)

print(f"Data cleaned and saved to {output_file_path}")


Data cleaned and saved to formatted_stock_data_cleaned.csv


## Calculating the RSI and MACD for the specified stocks within the given timeframe using the pandas_ta library

In [10]:
!pip install pandas_ta

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
     ---------------------------------------- 0.0/115.1 kB ? eta -:--:--
     --- ------------------------------------ 10.2/115.1 kB ? eta -:--:--
     ------------- ----------------------- 41.0/115.1 kB 393.8 kB/s eta 0:00:01
     ------------------------------------ 115.1/115.1 kB 838.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pandas_ta
  Building wheel for pandas_ta (setup.py): started
  Building wheel for pandas_ta (setup.py): finished with status 'done'
  Created wheel for pandas_ta: filename=pandas_ta-0.3.14b0-py3-none-any.whl size=218928 sha256=e698f8e3ab177d56f16ebce23e5faeb48f0962c5ce6716df3c32851d9afb1812
  Stored in directory: c:\users\aakas\appdata\local\pip\cache\wheels\7f\33\8b\50b245c5c65433cd8f5cb24ac15d97e5a3db2d41a8b6ae957d
Successfully built pandas_ta
Installing collected packages: p

In [11]:
import pandas as pd
import yfinance as yf
import pandas_ta as ta

# Define the list of tickers
tickers = ['AAPL', 'AMZN', 'BRK-B', 'GOOGL', 'JNJ', 'JPM', 'META', 'MSFT', 'NVDA', 'TSLA']

# Define the start and end dates
start_date = '2019-01-02'
end_date = '2023-12-29'

# Initialize an empty DataFrame to hold all the data
all_data = pd.DataFrame()

# Loop through each ticker and calculate indicators
for ticker in tickers:
    # Fetch historical stock data
    data = yf.download(ticker, start=start_date, end=end_date)
    
    # Calculate RSI and MACD
    data['RSI'] = ta.rsi(data['Close'])
    data['MACD'], data['MACD_Signal'], data['MACD_Hist'] = ta.macd(data['Close'])
    
    # Add a column for the ticker
    data['Ticker'] = ticker
    
    # Reset index to have Date as a column
    data.reset_index(inplace=True)
    
    # Append to the all_data DataFrame
    all_data = pd.concat([all_data, data], axis=0)

# Save the merged data to a CSV file
all_data.to_csv('stock_data_with_indicators.csv', index=False)

# Display the first few rows of the dataframe
print(all_data.head())


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


        Date       Open       High        Low      Close  Adj Close  \
0 2019-01-02  38.722500  39.712502  38.557499  39.480000  37.793781   
1 2019-01-03  35.994999  36.430000  35.500000  35.547501  34.029240   
2 2019-01-04  36.132500  37.137501  35.950001  37.064999  35.481926   
3 2019-01-07  37.174999  37.207500  36.474998  36.982498  35.402946   
4 2019-01-08  37.389999  37.955002  37.130001  37.687500  36.077839   

      Volume  RSI          MACD    MACD_Signal      MACD_Hist Ticker  
0  148158800  NaN  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9   AAPL  
1  365248800  NaN  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9   AAPL  
2  234428400  NaN  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9   AAPL  
3  219111200  NaN  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9   AAPL  
4  164101200  NaN  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9   AAPL  


## Calculating RSI and MACD again, the last result did not worked well

In [13]:
import pandas as pd
import yfinance as yf
import pandas_ta as ta

# Define the list of tickers
tickers = ['AAPL', 'AMZN', 'BRK-B', 'GOOGL', 'JNJ', 'JPM', 'META', 'MSFT', 'NVDA', 'TSLA']

# Define the start and end dates
start_date = '2019-01-02'
end_date = '2023-12-29'

# Initialize an empty DataFrame to hold all the data
all_data = pd.DataFrame()

# Loop through each ticker and calculate indicators
for ticker in tickers:
    # Fetch historical stock data
    data = yf.download(ticker, start=start_date, end=end_date)
    
    # Calculate RSI and MACD
    data['RSI'] = ta.rsi(data['Close'])
    macd = ta.macd(data['Close'])
    
    # Add MACD, MACD_Signal, and MACD_Hist to the DataFrame
    data = pd.concat([data, macd], axis=1)
    
    # Add a column for the ticker
    data['Ticker'] = ticker
    
    # Reset index to have Date as a column
    data.reset_index(inplace=True)
    
    # Append to the all_data DataFrame
    all_data = pd.concat([all_data, data], axis=0)

# Save the merged data to a CSV file
all_data.to_csv('stock_data_with_indicators.csv', index=False)

# Display the first few rows of the dataframe
print(all_data.head())


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


        Date       Open       High        Low      Close  Adj Close  \
0 2019-01-02  38.722500  39.712502  38.557499  39.480000  37.793781   
1 2019-01-03  35.994999  36.430000  35.500000  35.547501  34.029240   
2 2019-01-04  36.132500  37.137501  35.950001  37.064999  35.481926   
3 2019-01-07  37.174999  37.207500  36.474998  36.982498  35.402946   
4 2019-01-08  37.389999  37.955002  37.130001  37.687500  36.077839   

      Volume  RSI  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9 Ticker  
0  148158800  NaN           NaN            NaN            NaN   AAPL  
1  365248800  NaN           NaN            NaN            NaN   AAPL  
2  234428400  NaN           NaN            NaN            NaN   AAPL  
3  219111200  NaN           NaN            NaN            NaN   AAPL  
4  164101200  NaN           NaN            NaN            NaN   AAPL  


## Using Forward Fill to Handle Missing Values

In [15]:
import pandas as pd
import yfinance as yf
import pandas_ta as ta

# Define the list of tickers
tickers = ['AAPL', 'AMZN', 'BRK-B', 'GOOGL', 'JNJ', 'JPM', 'META', 'MSFT', 'NVDA', 'TSLA']

# Define the start and end dates
start_date = '2019-01-02'
end_date = '2023-12-29'

# Initialize an empty DataFrame to hold all the data
all_data = pd.DataFrame()

# Loop through each ticker and calculate indicators
for ticker in tickers:
    # Fetch historical stock data
    data = yf.download(ticker, start=start_date, end=end_date)
    
    # Calculate RSI and MACD
    data['RSI'] = ta.rsi(data['Close'])
    macd = ta.macd(data['Close'])
    
    # Add MACD, MACD_Signal, and MACD_Hist to the DataFrame
    data = pd.concat([data, macd], axis=1)
    
    # Add a column for the ticker
    data['Ticker'] = ticker
    
    # Forward fill and backward fill missing values
    data.ffill(inplace=True)
    data.bfill(inplace=True)
    
    # Reset index to have Date as a column
    data.reset_index(inplace=True)
    
    # Append to the all_data DataFrame
    all_data = pd.concat([all_data, data], axis=0)

# Save the merged data to a CSV file
all_data.to_csv('stock_data_with_indicators_filled.csv', index=False)

# Display the first few rows of the dataframe
print(all_data.head())


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


        Date       Open       High        Low      Close  Adj Close  \
0 2019-01-02  38.722500  39.712502  38.557499  39.480000  37.793781   
1 2019-01-03  35.994999  36.430000  35.500000  35.547501  34.029240   
2 2019-01-04  36.132500  37.137501  35.950001  37.064999  35.481926   
3 2019-01-07  37.174999  37.207500  36.474998  36.982498  35.402946   
4 2019-01-08  37.389999  37.955002  37.130001  37.687500  36.077839   

      Volume        RSI  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9 Ticker  
0  148158800  48.544578      2.016478      -0.274112       1.742551   AAPL  
1  365248800  48.544578      2.016478      -0.274112       1.742551   AAPL  
2  234428400  48.544578      2.016478      -0.274112       1.742551   AAPL  
3  219111200  48.544578      2.016478      -0.274112       1.742551   AAPL  
4  164101200  48.544578      2.016478      -0.274112       1.742551   AAPL  


### Calculating the other two indicators, Bollinger Bands and Moving Average (MA)

In [17]:
import pandas as pd
import yfinance as yf
import pandas_ta as ta

# Define the list of tickers
tickers = ['AAPL', 'AMZN', 'BRK-B', 'GOOGL', 'JNJ', 'JPM', 'META', 'MSFT', 'NVDA', 'TSLA']

# Define the start and end dates
start_date = '2019-01-02'
end_date = '2023-12-29'

# Initialize an empty DataFrame to hold all the data
all_data = pd.DataFrame()

# Loop through each ticker and calculate indicators
for ticker in tickers:
    # Fetch historical stock data
    data = yf.download(ticker, start=start_date, end=end_date)
    
    # Calculate RSI and MACD
    data['RSI'] = ta.rsi(data['Close'])
    macd = ta.macd(data['Close'])
    
    # Add MACD, MACD_Signal, and MACD_Hist to the DataFrame
    data = pd.concat([data, macd], axis=1)
    
    # Calculate Bollinger Bands
    bollinger = ta.bbands(data['Close'], length=20, std=2)
    
    # Add Bollinger Bands to the DataFrame
    data = pd.concat([data, bollinger], axis=1)
    
    # Calculate Moving Average
    data['MA'] = ta.sma(data['Close'], length=20)
    
    # Add a column for the ticker
    data['Ticker'] = ticker
    
    # Forward fill and backward fill missing values
    data.ffill(inplace=True)
    data.bfill(inplace=True)
    
    # Reset index to have Date as a column
    data.reset_index(inplace=True)
    
    # Append to the all_data DataFrame
    all_data = pd.concat([all_data, data], axis=0)

# Save the merged data to a CSV file
all_data.to_csv('stock_data_with_all_indicators.csv', index=False)

# Display the first few rows of the dataframe
print(all_data.head())


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


        Date       Open       High        Low      Close  Adj Close  \
0 2019-01-02  38.722500  39.712502  38.557499  39.480000  37.793781   
1 2019-01-03  35.994999  36.430000  35.500000  35.547501  34.029240   
2 2019-01-04  36.132500  37.137501  35.950001  37.064999  35.481926   
3 2019-01-07  37.174999  37.207500  36.474998  36.982498  35.402946   
4 2019-01-08  37.389999  37.955002  37.130001  37.687500  36.077839   

      Volume        RSI  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9  \
0  148158800  48.544578      2.016478      -0.274112       1.742551   
1  365248800  48.544578      2.016478      -0.274112       1.742551   
2  234428400  48.544578      2.016478      -0.274112       1.742551   
3  219111200  48.544578      2.016478      -0.274112       1.742551   
4  164101200  48.544578      2.016478      -0.274112       1.742551   

   BBL_20_2.0  BBM_20_2.0  BBU_20_2.0  BBB_20_2.0  BBP_20_2.0         MA  \
0   36.117574   38.388125   40.658676   11.829444    1.143979  38.3881

## Check for any data errors:

- Check for missing values: Ensure there are no missing values in the dataset.
- Check for duplicate rows: Ensure there are no duplicate rows in the dataset.
- Check for outliers: Identify any extreme values that may not make sense.
- Data types: Ensure all columns have the correct data types.
- Here is the code to perform these checks:

In [24]:
!pip install ace

Collecting ace
  Downloading ace-0.3.3-py3-none-any.whl.metadata (7.8 kB)
Downloading ace-0.3.3-py3-none-any.whl (23 kB)
Installing collected packages: ace
Successfully installed ace-0.3.3


In [29]:
import pandas as pd

# Load the dataset
file_path = 'stock_data_with_all_indicators.csv'
data = pd.read_csv(file_path)

# 1. Check for missing values
missing_values = data.isnull().sum()
print("Missing values per column:\n", missing_values)

# 2. Check for duplicate rows
duplicate_rows = data.duplicated().sum()
print("Number of duplicate rows: ", duplicate_rows)

# 3. Check for outliers
# Here, we use the IQR method to detect outliers in numeric columns
def detect_outliers(df):
    outliers = {}
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_indices = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))].index
        outliers[col] = outlier_indices
    return outliers

outliers = detect_outliers(data)
print("Outliers detected in the following columns:\n", {k: len(v) for k, v in outliers.items()})

# 4. Check data types
data_types = data.dtypes
print("Data types of each column:\n", data_types)

# Optional: If there are any issues, you can handle them accordingly
# For example, you can fill missing values or drop duplicates
# data.fillna(method='ffill', inplace=True)
# data.drop_duplicates(inplace=True)

# Save the cleaned dataset
cleaned_file_path = 'cleaned_stock_data_with_indicators.csv'
data.to_csv(cleaned_file_path, index=False)

# Display the first few rows of the cleaned dataframe
# Since ace_tools is not available in your environment, we will use the standard display
# or print function to show the dataframe's head

print("First few rows of the cleaned dataset:\n", data.head())

print("Cleaned dataset saved to:", cleaned_file_path)


Missing values per column:
 Date             0
Open             0
High             0
Low              0
Close            0
Adj Close        0
Volume           0
RSI              0
MACD_12_26_9     0
MACDh_12_26_9    0
MACDs_12_26_9    0
BBL_20_2.0       0
BBM_20_2.0       0
BBU_20_2.0       0
BBB_20_2.0       0
BBP_20_2.0       0
MA               0
Ticker           0
dtype: int64
Number of duplicate rows:  0
Outliers detected in the following columns:
 {'Open': 57, 'High': 53, 'Low': 54, 'Close': 54, 'Adj Close': 31, 'Volume': 1467, 'RSI': 13, 'MACD_12_26_9': 1161, 'MACDh_12_26_9': 1370, 'MACDs_12_26_9': 1188, 'BBL_20_2.0': 51, 'BBM_20_2.0': 30, 'BBU_20_2.0': 68, 'BBB_20_2.0': 698, 'BBP_20_2.0': 0, 'MA': 30}
Data types of each column:
 Date              object
Open             float64
High             float64
Low              float64
Close            float64
Adj Close        float64
Volume             int64
RSI              float64
MACD_12_26_9     float64
MACDh_12_26_9    float64
MACD

## Summary of Findings:
- Missing Values: There are no missing values in the dataset.
- Duplicate Rows: There are no duplicate rows in the dataset.
- Outliers: Outliers have been detected in multiple columns.
- Data Types: The data types of each column are as expected.
## Handling Outliers

- Handle Outliers: Using the IQR method to handle outliers.
- Save the Cleaned Data: Save the cleaned dataset after handling outliers.
- Proceed with Model Training: Use the cleaned dataset to train the model.

In [27]:
import pandas as pd

# Load the dataset
file_path = 'cleaned_stock_data_with_indicators.csv'  # Use the correct path to your CSV file
data = pd.read_csv(file_path)

# Function to remove outliers using the IQR method
def remove_outliers(df):
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        df = df[~((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR)))]
    return df

# Remove outliers
cleaned_data = remove_outliers(data)

# Save the cleaned dataset
cleaned_file_path = 'cleaned_stock_data_with_indicators.csv'
cleaned_data.to_csv(cleaned_file_path, index=False)

# Display the first few rows of the cleaned dataframe
cleaned_data.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,BBL_20_2.0,BBM_20_2.0,BBU_20_2.0,BBB_20_2.0,BBP_20_2.0,MA,Ticker
4,2019-01-11,38.220001,38.424999,37.877499,38.072498,36.446396,108092800,48.544578,2.016478,-0.274112,1.742551,36.117574,38.388125,40.658676,11.829444,1.143979,38.388125,AAPL
6,2019-01-15,37.567501,38.3475,37.512501,38.267502,36.633068,114843600,48.544578,2.016478,-0.274112,1.742551,36.117574,38.388125,40.658676,11.829444,1.143979,38.388125,AAPL
7,2019-01-16,38.27,38.970001,38.25,38.735001,37.080601,122278800,48.544578,2.016478,-0.274112,1.742551,36.117574,38.388125,40.658676,11.829444,1.143979,38.388125,AAPL
8,2019-01-17,38.549999,39.415001,38.314999,38.965,37.30077,119284800,48.544578,2.016478,-0.274112,1.742551,36.117574,38.388125,40.658676,11.829444,1.143979,38.388125,AAPL
10,2019-01-22,39.102501,39.182499,38.154999,38.325001,36.688118,121576000,48.544578,2.016478,-0.274112,1.742551,36.117574,38.388125,40.658676,11.829444,1.143979,38.388125,AAPL


### Fetching data for training the model for different timeframes:
#### I am fetching it separately so I can include every symbol.

In [34]:
import pandas as pd
import yfinance as yf
import pandas_ta as ta

# Define the list of tickers
tickers = ['AAPL', 'AMZN', 'BRK-B', 'GOOGL', 'JNJ', 'JPM', 'META', 'MSFT', 'NVDA', 'TSLA']

# Define the start and end dates
start_date = '2024-01-01'
end_date = '2024-07-31'

# Initialize an empty DataFrame to hold all the data
all_data = pd.DataFrame()

# Loop through each ticker and calculate indicators
for ticker in tickers:
    # Fetch historical stock data
    data = yf.download(ticker, start=start_date, end=end_date)
    
    # Calculate RSI and MACD
    data['RSI'] = ta.rsi(data['Close'])
    macd = ta.macd(data['Close'])
    
    # Add MACD, MACD_Signal, and MACD_Hist to the DataFrame
    data = pd.concat([data, macd], axis=1)
    
    # Calculate Bollinger Bands
    bollinger = ta.bbands(data['Close'], length=20, std=2)
    
    # Add Bollinger Bands to the DataFrame
    data = pd.concat([data, bollinger], axis=1)
    
    # Calculate Moving Average
    data['MA'] = ta.sma(data['Close'], length=20)
    
    # Add a column for the ticker
    data['Ticker'] = ticker
    
    # Forward fill and backward fill missing values
    data.ffill(inplace=True)
    data.bfill(inplace=True)
    
    # Reset index to have Date as a column
    data.reset_index(inplace=True)
    
    # Append to the all_data DataFrame
    all_data = pd.concat([all_data, data], axis=0)

# Save the merged data to a CSV file
all_data.to_csv('new_stock_data_with_all_indicators.csv', index=False)

# Display the first few rows of the dataframe
print(all_data.head())


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


        Date        Open        High         Low       Close   Adj Close  \
0 2024-01-02  187.149994  188.440002  183.889999  185.639999  185.152283   
1 2024-01-03  184.220001  185.880005  183.429993  184.250000  183.765945   
2 2024-01-04  182.149994  183.089996  180.880005  181.910004  181.432098   
3 2024-01-05  181.990005  182.759995  180.169998  181.179993  180.703995   
4 2024-01-08  182.089996  185.600006  181.500000  185.559998  185.072495   

     Volume        RSI  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9  \
0  82488700  74.450966      0.877487      -1.106851       0.142733   
1  58414500  74.450966      0.877487      -1.106851       0.142733   
2  71983600  74.450966      0.877487      -1.106851       0.142733   
3  62303300  74.450966      0.877487      -1.106851       0.142733   
4  59144500  74.450966      0.877487      -1.106851       0.142733   

   BBL_20_2.0  BBM_20_2.0  BBU_20_2.0  BBB_20_2.0  BBP_20_2.0          MA  \
0  179.047189  187.890498  196.733808    9.41

### Clean the New Data
#### Ensuring the new data is in the same format as my existing training data and calculate the indicators (RSI, MACD, Bollinger Bands, Moving Averages).

In [None]:
!pip install TA-Lib

In [None]:
import talib

# Load the new data
new_data = pd.read_csv('new_stock_data.csv')

# Ensure 'Date' column is in datetime format
new_data['Date'] = pd.to_datetime(new_data['Date'])

# Calculate RSI
new_data['RSI'] = new_data.groupby('Ticker')['Close'].transform(lambda x: talib.RSI(x, timeperiod=14))

# Calculate MACD
new_data['MACD'], new_data['MACD_Signal'], new_data['MACD_Hist'] = new_data.groupby('Ticker')['Close'].apply(
    lambda x: talib.MACD(x, fastperiod=12, slowperiod=26, signalperiod=9)).transform(list).unstack()

# Calculate Bollinger Bands
new_data['BB_Upper'], new_data['BB_Middle'], new_data['BB_Lower'] = new_data.groupby('Ticker')['Close'].apply(
    lambda x: talib.BBANDS(x, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)).transform(list).unstack()

# Calculate Moving Averages
new_data['MA'] = new_data.groupby('Ticker')['Close'].transform(lambda x: talib.SMA(x, timeperiod=20))

# Fill missing values if any
new_data.fillna(method='ffill', inplace=True)
new_data.fillna(method='bfill', inplace=True)

# Save the cleaned new data
new_data.to_csv('cleaned_new_stock_data.csv', index=False)


### Check and Clean the Data

In [35]:
import pandas as pd

# Load the dataset
file_path = 'new_stock_data_with_all_indicators.csv'
data = pd.read_csv(file_path)

# 1. Check for missing values
missing_values = data.isnull().sum()
print("Missing values per column:\n", missing_values)

# 2. Check for duplicate rows
duplicate_rows = data.duplicated().sum()
print("Number of duplicate rows: ", duplicate_rows)

# 3. Check for outliers
# Here, we use the IQR method to detect outliers in numeric columns
def detect_outliers(df):
    outliers = {}
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outlier_indices = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))].index
        outliers[col] = outlier_indices
    return outliers

outliers = detect_outliers(data)
print("Outliers detected in the following columns:\n", {k: len(v) for k, v in outliers.items()})

# 4. Check data types
data_types = data.dtypes
print("Data types of each column:\n", data_types)

# Handling missing values (if any)
data.fillna(method='ffill', inplace=True)
data.fillna(method='bfill', inplace=True)

# Dropping duplicate rows (if any)
data.drop_duplicates(inplace=True)

# Save the cleaned dataset
cleaned_file_path = 'cleaned_new_stock_data_with_indicators.csv'
data.to_csv(cleaned_file_path, index=False)

# Display the first few rows of the cleaned dataframe
print(data.head())


Missing values per column:
 Date             0
Open             0
High             0
Low              0
Close            0
Adj Close        0
Volume           0
RSI              0
MACD_12_26_9     0
MACDh_12_26_9    0
MACDs_12_26_9    0
BBL_20_2.0       0
BBM_20_2.0       0
BBU_20_2.0       0
BBB_20_2.0       0
BBP_20_2.0       0
MA               0
Ticker           0
dtype: int64
Number of duplicate rows:  0
Outliers detected in the following columns:
 {'Open': 0, 'High': 0, 'Low': 0, 'Close': 0, 'Adj Close': 0, 'Volume': 166, 'RSI': 18, 'MACD_12_26_9': 110, 'MACDh_12_26_9': 124, 'MACDs_12_26_9': 113, 'BBL_20_2.0': 0, 'BBM_20_2.0': 0, 'BBU_20_2.0': 0, 'BBB_20_2.0': 155, 'BBP_20_2.0': 0, 'MA': 0}
Data types of each column:
 Date              object
Open             float64
High             float64
Low              float64
Close            float64
Adj Close        float64
Volume             int64
RSI              float64
MACD_12_26_9     float64
MACDh_12_26_9    float64
MACDs_12_26_9    

  data.fillna(method='ffill', inplace=True)
  data.fillna(method='bfill', inplace=True)


The data is well-prepared with no missing values and only a few outliers. Now, I will train the model using AutoML with the cleaned and enriched dataset.