## Combining all data sets, adding missing value by ML

As we have many data sets with different periods and frequencies we add data to the datasets with less data by using ML methods.

First we will look for the dataset that have most data in period.

In [77]:
# import og nesecery libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import requests
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

## Reading SP500 and cleaning the data

In [78]:
# reading data from CSV
df_sp500 = pd.read_csv('/Users/youssefbadran/Documents/datamatiker/4. semester/BI/sp500_data.csv')

In [79]:
# Converting Date column to DateTime format
df_sp500['Date'] = pd.to_datetime(df_sp500['Date'])

# dropping unnecesery columns
df_sp500 = df_sp500.drop(columns=['Ticker', 'Volume', 'Adj Close', 'High','Low', 'Open'])

df_sp500.rename(columns={'Close':'Close SP500'}, inplace=True)

In [80]:
df_sp500

Unnamed: 0,Date,Close SP500
0,1962-01-02,3.475125
1,1962-01-03,3.501254
2,1962-01-04,3.501254
3,1962-01-05,3.409804
4,1962-01-08,3.390207
...,...,...
4239718,2024-09-06,189.229996
4239719,2024-09-09,191.729996
4239720,2024-09-10,191.360001
4239721,2024-09-11,189.990005


In [81]:
df_gold = pd.read_csv('/Users/youssefbadran/Documents/GitHub/bi-exam-project-stock/datasets/cleaned_gold_data.csv')

In [82]:
df_gold['Date'] = pd.to_datetime(df_gold['Date'])

df_gold = df_gold.drop(columns=(['Open', 'High', 'Low', 'Change %']))

df_gold.rename(columns={'Price':'Close Gold'}, inplace=True)

## Reading Inflation and interest and cleaning the data

In [83]:
df_interest_inflation = pd.read_csv('https://raw.githubusercontent.com/badranyoussef/bi-exam-project-stock/main/datasets/fed_interest_rate_inflation.csv')
df_interest_2017_to_now = pd.read_excel('/Users/youssefbadran/Documents/GitHub/bi-exam-project-stock/datasets/interest_rate_2017_now_cleaned.xlsx')

In [84]:
df_interest_inflation

Unnamed: 0,Year,Month,Day,Federal Funds Target Rate,Federal Funds Upper Target,Federal Funds Lower Target,Effective Federal Funds Rate,Real GDP (Percent Change),Unemployment Rate,Inflation Rate
0,1954,7,1,,,,0.80,4.6,5.8,
1,1954,8,1,,,,1.22,,6.0,
2,1954,9,1,,,,1.06,,6.1,
3,1954,10,1,,,,0.85,8.0,5.7,
4,1954,11,1,,,,0.83,,5.3,
...,...,...,...,...,...,...,...,...,...,...
899,2016,12,14,,0.75,0.50,,,,
900,2017,1,1,,0.75,0.50,0.65,,4.8,2.3
901,2017,2,1,,0.75,0.50,0.66,,4.7,2.2
902,2017,3,1,,0.75,0.50,,,,


In [85]:
df_interest_2017_to_now

Unnamed: 0,Effective Date,Rate (%)
0,09/16/2024,5.33
1,09/13/2024,5.33
2,09/12/2024,5.33
3,09/11/2024,5.33
4,09/10/2024,5.33
...,...,...
1932,01/09/2017,0.66
1933,01/06/2017,0.66
1934,01/05/2017,0.66
1935,01/04/2017,0.66


In [86]:
# drop all columns we don't need
df_interest_inflation_dropped = df_interest_inflation.drop(columns=['Federal Funds Target Rate', 'Federal Funds Upper Target', 'Federal Funds Lower Target', 'Real GDP (Percent Change)', 'Unemployment Rate'])

# Combine the columns Year, Month, Day into one DateTime column
df_interest_inflation_dropped['Date'] = pd.to_datetime(df_interest_inflation_dropped[['Year', 'Month', 'Day']])

# Insert the new column at the beginning
df_interest_inflation_dropped.insert(0, 'Date', df_interest_inflation_dropped.pop('Date'))

# Drop Year, Month and Day
df_interest_inflation_dropped = df_interest_inflation_dropped.drop(columns=['Year', 'Month', 'Day'])

In [87]:
df_interest_inflation_dropped.ffill(inplace=True)
df_interest_inflation_dropped.bfill(inplace=True)

df_interest_inflation_dropped.info()
print(df_interest_inflation_dropped)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 904 entries, 0 to 903
Data columns (total 3 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Date                          904 non-null    datetime64[ns]
 1   Effective Federal Funds Rate  904 non-null    float64       
 2   Inflation Rate                904 non-null    float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 21.3 KB
          Date  Effective Federal Funds Rate  Inflation Rate
0   1954-07-01                          0.80             3.2
1   1954-08-01                          1.22             3.2
2   1954-09-01                          1.06             3.2
3   1954-10-01                          0.85             3.2
4   1954-11-01                          0.83             3.2
..         ...                           ...             ...
899 2016-12-14                          0.54             2.2
900 2017-01-01    

In [88]:
df_interest_inflation_dropped.rename(columns={'Effective Federal Funds Rate':'Interest Rate'}, inplace=True)

df_interest_inflation_dropped

Unnamed: 0,Date,Interest Rate,Inflation Rate
0,1954-07-01,0.80,3.2
1,1954-08-01,1.22,3.2
2,1954-09-01,1.06,3.2
3,1954-10-01,0.85,3.2
4,1954-11-01,0.83,3.2
...,...,...,...
899,2016-12-14,0.54,2.2
900,2017-01-01,0.65,2.3
901,2017-02-01,0.66,2.2
902,2017-03-01,0.66,2.2


In [89]:
# removing all columns unless date and 
df_interest_2017_to_now1 = df_interest_2017_to_now.filter(items=['Effective Date', 'Rate (%)'])

# Convert the current column with date to a column with datetime data type and drop the 'Effective Date'
df_interest_2017_to_now1['Date'] = pd.to_datetime(df_interest_2017_to_now1['Effective Date'])
df_interest_2017_to_now1 = df_interest_2017_to_now1.drop(columns=['Effective Date'])

In [90]:
df_interest_2017_to_now1.rename(columns={'Rate (%)':'Interest Rate'}, inplace=True)

df_interest_2017_to_now1

Unnamed: 0,Interest Rate,Date
0,5.33,2024-09-16
1,5.33,2024-09-13
2,5.33,2024-09-12
3,5.33,2024-09-11
4,5.33,2024-09-10
...,...,...
1932,0.66,2017-01-09
1933,0.66,2017-01-06
1934,0.66,2017-01-05
1935,0.66,2017-01-04


Now i will combine the to interest dataframes. as one of them goes to 2017 and the other one goes fram 2017 till 2024

In [91]:
# combining the dataframes with 'Interest Rate'
df_interest_combined = pd.concat([df_interest_inflation_dropped, df_interest_2017_to_now1])

# Sorting after 'Date'
df_interest_combined = df_interest_combined.sort_values(by='Date').reset_index(drop=True)

# Removing duplicates of dates if overlaping
df_interest_combined = df_interest_combined.drop_duplicates(subset='Date')

df_interest_combined

Unnamed: 0,Date,Interest Rate,Inflation Rate
0,1954-07-01,0.80,3.2
1,1954-08-01,1.22,3.2
2,1954-09-01,1.06,3.2
3,1954-10-01,0.85,3.2
4,1954-11-01,0.83,3.2
...,...,...,...
2836,2024-09-10,5.33,
2837,2024-09-11,5.33,
2838,2024-09-12,5.33,
2839,2024-09-13,5.33,


Now merging the combinded DF with SP500

In [92]:
# Merge den kombinerede Interest Rate DataFrame med df_sp500 baseret på 'Date'
df_merged = pd.merge(df_interest_combined, df_sp500, on='Date', how='outer')

# Sortere den endelige DataFrame efter dato
df_merged = df_merged.sort_values(by='Date').reset_index(drop=True)

# Fjern duplikater baseret på dato og behold kun den første forekomst af hver dato
df_merged = df_merged.drop_duplicates(subset='Date', keep='first')


In [93]:
df_merged

Unnamed: 0,Date,Interest Rate,Inflation Rate,Close SP500
0,1954-07-01,0.80,3.2,
1,1954-08-01,1.22,3.2,
2,1954-09-01,1.06,3.2,
3,1954-10-01,0.85,3.2,
4,1954-11-01,0.83,3.2,
...,...,...,...,...
4238561,2024-09-10,5.33,,562.349976
4239062,2024-09-11,5.33,,115.650002
4239563,2024-09-12,5.33,,52.880001
4240064,2024-09-13,5.33,,


In [94]:
df_merged.isnull().sum()

Date                  0
Interest Rate     13287
Inflation Rate    15223
Close SP500         343
dtype: int64

Merging Gold into df_merged

In [95]:
# Merging the merged_df with df_gold based on 'Date'
df_merged = pd.merge(df_merged, df_gold, on='Date', how='outer')

In [96]:
df_merged.sample(5)

Unnamed: 0,Date,Interest Rate,Inflation Rate,Close SP500,Close Gold
15831,2022-05-26,0.83,,217.880005,1850.02
5187,1981-12-01,12.37,9.5,1.835938,402.5
12569,2009-12-23,,,6.733333,1087.2
762,1964-08-13,,,0.06608,
6304,1986-03-31,,,11.676374,338.1


In [97]:
duplicate_dates = df_merged[df_merged.duplicated(subset='Date')]
print(duplicate_dates)

Empty DataFrame
Columns: [Date, Interest Rate, Inflation Rate, Close SP500, Close Gold]
Index: []


Now we do the same with Oil and russell2000

In [98]:
#Reading data
russell2000_df = pd.read_csv('/Users/youssefbadran/Documents/GitHub/bi-exam-project-stock/datasets/russell_2000.csv')
oil_df = pd.read_csv('/Users/youssefbadran/Documents/GitHub/bi-exam-project-stock/datasets/BrentOilPrices.csv')

russell2000_df = russell2000_df.drop(columns=['Open', 'Low', 'High', 'Adj Close', 'Volume'])

oil_df['Date'] = pd.to_datetime(oil_df['Date'])
russell2000_df['Date'] = pd.to_datetime(russell2000_df['Date'])

russell2000_df.rename(columns={'Close':'Close Russell'}, inplace=True)
oil_df.rename(columns={'Price':'Close Oil'}, inplace=True)

  oil_df['Date'] = pd.to_datetime(oil_df['Date'])


In [99]:
russell2000_df

Unnamed: 0,Date,Close Russell
0,1987-09-10,168.970001
1,1987-09-11,170.539993
2,1987-09-14,170.429993
3,1987-09-15,169.199997
4,1987-09-16,168.919998
...,...,...
8516,2021-06-25,2334.399902
8517,2021-06-28,2322.340088
8518,2021-06-29,2308.840088
8519,2021-06-30,2310.550049


In [100]:
oil_df

Unnamed: 0,Date,Close Oil
0,1987-05-20,18.63
1,1987-05-21,18.45
2,1987-05-22,18.55
3,1987-05-25,18.60
4,1987-05-26,18.63
...,...,...
9006,2022-11-08,96.85
9007,2022-11-09,93.05
9008,2022-11-10,94.25
9009,2022-11-11,96.37


In [101]:
# Merging the merged_df with russell2000_df based on 'Date'
df_merged = pd.merge(df_merged, russell2000_df, on='Date', how='outer')

# Merging the merged_df with oil_df based on 'Date'
df_merged = pd.merge(df_merged, oil_df, on='Date', how='outer')

In [102]:
df_merged.sample(100)

Unnamed: 0,Date,Interest Rate,Inflation Rate,Close SP500,Close Gold,Close Russell,Close Oil
6861,1988-05-13,,,12.931344,451.12,142.169998,16.50
4664,1979-12-07,,,1.083333,,,
14841,2018-08-08,1.91,,347.609985,1213.61,1686.880005,70.71
11292,2005-02-22,,,1.458571,434.55,617.929993,47.60
5665,1983-10-05,,,3.447295,389.50,,
...,...,...,...,...,...,...,...
16123,2023-07-10,5.07,,294.970001,1924.99,,
789,1964-09-22,,,6.779796,,,
15752,2022-02-04,0.08,,554.710022,1807.49,,96.86
5928,1984-10-12,,,14.849952,338.90,,


## Cleaning data
We have loads of missing values. First we remove all rows which are before 1963 af we only have 1 variable with data before 1963
then we fill in data with Machine learning

data before working with missing values <br>
Date                  0<br>
Interest Rate     13604<br>
Inflation Rate    15540<br>
Close SP500         660<br>
Close Gold         4838<br>
Close Russell      7921<br>
Close Oil          7431<br>
dtype: int64

In [103]:
#Removing rows before 1963 as we only have data of one variable before that year
df_merged_filtered = df_merged[df_merged['Date'] >= '1963-01-01']

# Fill missing values in 'Interest Rate' and 'Inflation Rate' using forward fill
df_merged_filtered['Interest Rate'] = df_merged_filtered['Interest Rate'].ffill()
df_merged_filtered['Inflation Rate'] = df_merged_filtered['Inflation Rate'].ffill()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_filtered['Interest Rate'] = df_merged_filtered['Interest Rate'].ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_filtered['Inflation Rate'] = df_merged_filtered['Inflation Rate'].ffill()


In [104]:


df_merged_filtered.isnull().sum()

Date                 0
Interest Rate        0
Inflation Rate       0
Close SP500        565
Close Gold        4491
Close Russell     7574
Close Oil         7084
dtype: int64

In [105]:
# Fill missing values in 'Interest Rate' and 'Inflation Rate' using forward fill
df_merged_filtered['Interest Rate'] = df_merged_filtered['Interest Rate'].ffill()
df_merged_filtered['Inflation Rate'] = df_merged_filtered['Inflation Rate'].ffill()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_filtered['Interest Rate'] = df_merged_filtered['Interest Rate'].ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged_filtered['Inflation Rate'] = df_merged_filtered['Inflation Rate'].ffill()


In [106]:
# copying the df before use of ML

df_isnull_filled_ML = df_merged_filtered.copy()

df_isnull_filled_ML = df_isnull_filled_ML.drop(columns=('Date'))

In [111]:
df_isnull_filled_ML.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16095 entries, 347 to 16441
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Interest Rate   16095 non-null  float64
 1   Inflation Rate  16095 non-null  float64
 2   Close SP500     15530 non-null  float64
 3   Close Gold      11604 non-null  float64
 4   Close Russell   8521 non-null   float64
 5   Close Oil       9011 non-null   float64
dtypes: float64(6)
memory usage: 880.2 KB


In [114]:
# Define the target column with missing values
target_column = 'Close SP500'

# Prepare features by dropping the target column safely
features = df_isnull_filled_ML.drop(columns=[target_column], errors='ignore')  # Drop target column safely


In [115]:
# Convert 'Date' column to numeric (timestamp) if it exists
if 'Date' in features.columns:
    features['Date'] = features['Date'].astype('int64') // 10**9  # Convert to seconds since epoch

# Fill missing values in features with the mean for initial training
features.fillna(features.mean(), inplace=True)

In [118]:
# Check if target_column exists in the main DataFrame
if target_column not in df_isnull_filled_ML.columns:
    print(f"{target_column} does not exist in the DataFrame.")
else:
    # Print the target column name for debugging
    print(f"Processing target column: {target_column}")

    # Separate the rows with and without missing values in the target column
    X = features[features[target_column].notnull()]  # Use the DataFrame's column to filter
    y = df_isnull_filled_ML[target_column][df_isnull_filled_ML[target_column].notnull()]  # Same here

    # Print dimensions of X and y for debugging
    print(f"Dimensions of features (X) for {target_column}: {X.shape}")
    print(f"Dimensions of target (y) for {target_column}: {y.shape}")

    # Check if X and y are empty
    if X.empty or y.empty:
        print(f"No available data for target column: {target_column}")
    else:
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Create and train the model
        model = RandomForestRegressor(n_estimators=100, random_state=42)

        # Fit the model
        model.fit(X_train, y_train)

        # Prepare the data for prediction (rows where the target is missing)
        missing_data = features[features[target_column].isnull()]

        # Fill in the missing values using the model
        predictions = model.predict(missing_data)

        # Add the predictions back into the original DataFrame
        df_isnull_filled_ML.loc[df_isnull_filled_ML[target_column].isnull(), target_column] = predictions

        # Optional: Evaluate the model on the test set
        y_pred = model.predict(X_test)
        print(f'Mean Squared Error for {target_column}:', mean_squared_error(y_test, y_pred))

# Optionally, check if there are any remaining missing values
print(df_isnull_filled_ML.isnull().sum())

Processing target column: Close SP500


KeyError: 'Close SP500'

In [126]:
df_merged_filtered.head()

Unnamed: 0,Date,Interest Rate,Inflation Rate,Close SP500,Close Gold,Close Russell,Close Oil
347,1963-01-01,2.92,1.0,,,,
348,1963-01-02,2.92,1.0,0.073859,,,
349,1963-01-03,2.92,1.0,0.171875,,,
350,1963-01-04,2.92,1.0,0.086924,,,
351,1963-01-07,2.92,1.0,0.08648,,,


In [105]:
corrmat = df_merged.corr

In [107]:
# Sæt størrelsen på plottet
plt.figure(figsize=(10, 8))

# Brug seaborn til at lave et heatmap
sns.heatmap(corrmat, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

# Vise plottet
plt.title('Correlation Matrix Heatmap')
plt.show()

ValueError: Must pass 2-d input. shape=()

<Figure size 1000x800 with 0 Axes>