<a href="https://colab.research.google.com/github/allisonlinn/CSUREMM/blob/main/financial_data_analysis_ml_arima.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import pandas as pd

# Upload the CSV file from the local machine.
uploaded = files.upload()

# If nothing was uploaded, next(iter(uploaded)) would raise a bare
# StopIteration; fail with an actionable message instead.
if not uploaded:
    raise ValueError('No file was uploaded; re-run this cell and choose the CSV file.')

# Use the first uploaded file name.
file_name = next(iter(uploaded))

# Read the CSV file into a pandas DataFrame.
df = pd.read_csv(file_name)

# Drop the last two rows of the file (kept from the original analysis; the
# reason is not recorded in this notebook).
df = df.drop(df.tail(2).index)

# The file is not stored chronologically: the saved output of this cell lists
# 5/19/2023 after 5/26/2023. The Granger causality cells below work on lagged
# rows, so row order must follow the calendar. The 'date' column itself is left
# as text; rows whose date cannot be parsed are placed last. A stable sort keeps
# the file order for rows that share a date.
df = df.sort_values(
    'date',
    key=lambda dates: pd.to_datetime(dates, format='%m/%d/%Y', errors='coerce'),
    kind='mergesort',
).reset_index(drop=True)

print(df.tail())

Saving detrended_normalized_financial_data_2014_2023 - all_detrended_normalized (1).csv to detrended_normalized_financial_data_2014_2023 - all_detrended_normalized (1) (4).csv
           date  compound  compound_detrended  VCDAX_V_detrended  VCDAX_V_N  \
2177  5/17/2023    0.6705            0.624579          -2.899964  -0.346201   
2178  5/24/2023   -0.6513           -0.697221          -1.340144  -0.159988   
2179  5/25/2023    0.7717            0.725779          -1.543324  -0.184244   
2180  5/26/2023   -0.4588           -0.504721           1.869497   0.223183   
2181  5/19/2023   -0.5106           -0.556521          -2.567683  -0.306533   

      VCSAX_V_detrended  VCSAX_V_N  VENAX_V_detrended  VENAX_V_N  \
2177          -0.038722  -0.005571           1.721613   0.144542   
2178           0.238856   0.034365          -4.875521  -0.409335   
2179          -0.949566  -0.136616          -1.191654  -0.100048   
2180           0.026012   0.003742          -1.351787  -0.113492   
2181     

# 10-Year Analysis

**Granger Causality**

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests
import pandas as pd
from google.colab import files

# Fund columns to test against the 'compound' column. Later cells reuse this list.
columns_to_analyze = [
    'VCDAX_V_N', 'VCSAX_V_N', 'VENAX_V_N', 'VFAIX_V_N', 'VGSLX_V_N', 'VGHCX_V_N',
    'VITAX_V_N', 'VSPVX_V_N', 'VTCAX_V_N', 'VUIAX_V_N', 'VINAX_V_N',
    'VUIAX_P_N', 'VGHCX_P_N', 'VFAIX_P_N', 'VSPVX_P_N', 'VITAX_P_N', 'VGSLX_P_N',
    'VINAX_P_N', 'VTCAX_P_N', 'VCSAX_P_N', 'VENAX_P_N', 'VCDAX_P_N',
]

# Largest lag to test; every lag from 1 to max_lag is evaluated.
max_lag = 5

# Significant results are collected as plain records and turned into a
# DataFrame once at the end, instead of re-copying the frame with pd.concat
# on every hit.
granger_records_l = []

for column in columns_to_analyze:
    # grangercausalitytests checks whether the SECOND column ('compound')
    # Granger-causes the FIRST column (the fund series).
    granger_test_results_l = grangercausalitytests(
        df[[column, 'compound']], maxlag=max_lag, verbose=False
    )

    for lag in range(1, max_lag + 1):
        # Element 1 of the 'ssr_ftest' entry is the p-value of the SSR-based F test.
        p_value = granger_test_results_l[lag][0]['ssr_ftest'][1]
        if p_value < 0.05:
            granger_records_l.append({
                'fund name': column,
                'lag': lag,
                'p-value': p_value
            })

# Passing `columns` keeps the header intact even when nothing was significant.
granger_results_l = pd.DataFrame(granger_records_l, columns=['fund name', 'lag', 'p-value'])

# Export the Granger causality test results to a CSV file and download it.
granger_results_l.to_csv('granger_results_longer.csv', index=False)
files.download('granger_results_longer.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Pearson R Correlation**

In [None]:
from scipy.stats import pearsonr
import pandas as pd

# One record per fund column. Unlike the seasonal Pearson cell further down,
# this cell does NOT filter on p-value: every column's result is exported.
# NOTE(review): the original comments called these "significant" results, but
# no threshold was ever applied here - confirm that exporting all rows is intended.
pearson_records_l = []

# Correlate the 'compound' column with each fund column.
for column in columns_to_analyze:
    X = df['compound']
    y = df[column]

    try:
        # Pearson correlation coefficient and its p-value.
        correlation, p_value = pearsonr(X, y)
    except ValueError as err:
        # pearsonr raises ValueError for inputs it cannot process (for example
        # fewer than two observations). Report the skipped column instead of
        # dropping it silently.
        print(f'Skipping {column}: {err}')
        continue

    pearson_records_l.append({
        'fund name': column,
        'p-value': p_value,
        'correlation coefficient': correlation
    })

# Build the frame once; `columns` keeps the header even if every column was skipped.
pearson_results_l = pd.DataFrame(
    pearson_records_l, columns=['fund name', 'p-value', 'correlation coefficient']
)

# Export the results to a CSV file and download it.
pearson_results_l.to_csv('pearson_results_l.csv', index=False)
files.download('pearson_results_l.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Machine Learning**

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import pandas as pd
from google.colab import files

# Fixed seed for the estimators that resample at random, so that re-running
# the notebook reproduces the same metrics.
random_seed = 42

# Display name -> factory producing a fresh, unfitted regressor. A new model is
# built for every fund column so no fitted state carries over between columns.
model_factories = {
    'Support Vector Regression': lambda: SVR(),
    'Random Forest Regression': lambda: RandomForestRegressor(random_state=random_seed),
    'Gradient Boosting Regression': lambda: GradientBoostingRegressor(random_state=random_seed),
    'LightGBM Regression': lambda: lgb.LGBMRegressor(),
    'XGBoost Regression': lambda: xgb.XGBRegressor(),
}

# Metric rows are collected as records and converted to a DataFrame once,
# instead of re-copying the frame with pd.concat for every column.
ml_records_l = []

# Run every model on each fund column, using 'compound' as the only feature.
for column in columns_to_analyze:
    X = df['compound']
    y = df[column]

    if X.empty or y.empty:
        continue

    # Estimators expect a 2-D feature matrix and a 1-D target. The original
    # passed the target as an (n, 1) column vector, which scikit-learn
    # flattens with a DataConversionWarning.
    features = X.values.reshape(-1, 1)
    target = y.values

    for model_name, make_model in model_factories.items():
        model = make_model()
        model.fit(features, target)

        # Predictions are made on the same rows the model was fitted on, so
        # the metrics below are in-sample errors, not out-of-sample performance.
        predictions = model.predict(features)

        ml_records_l.append({
            'fund name': column,
            # NOTE(review): MAPE divides by |y|; values near zero in these
            # columns can inflate it - confirm it is a suitable metric.
            'model': model_name,
            'mape': mean_absolute_percentage_error(target, predictions),
            # mean_squared_error(..., squared=False) was deprecated in
            # scikit-learn 1.4 and removed in 1.6; taking the square root of
            # the MSE gives the same RMSE on every version.
            'rmse': mean_squared_error(target, predictions) ** 0.5
        })

# `columns` keeps the header even when no column produced results.
ml_results_l = pd.DataFrame(ml_records_l, columns=['fund name', 'model', 'mape', 'rmse'])

# Export the ML results to a CSV file and download it.
ml_results_l.to_csv('ml_results_longer.csv', index=False)
files.download('ml_results_longer.csv')


# Seasonal Analysis

**Granger Causality**

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests
import pandas as pd
from google.colab import files

# Fund columns to test against the 'compound' column. Defined once here
# rather than being rebuilt for every year/season; later cells reuse this list.
columns_to_analyze = [
    'VCDAX_V_N', 'VCSAX_V_N', 'VENAX_V_N', 'VFAIX_V_N', 'VGSLX_V_N', 'VGHCX_V_N',
    'VITAX_V_N', 'VSPVX_V_N', 'VTCAX_V_N', 'VUIAX_V_N', 'VINAX_V_N',
    'VUIAX_P_N', 'VGHCX_P_N', 'VFAIX_P_N', 'VSPVX_P_N', 'VITAX_P_N', 'VGSLX_P_N',
    'VINAX_P_N', 'VTCAX_P_N', 'VCSAX_P_N', 'VENAX_P_N', 'VCDAX_P_N',
]

# Largest lag to test; every lag from 1 to max_lag is evaluated.
max_lag = 5

# Significant results are collected as records and converted to a DataFrame
# once at the end, instead of re-copying the frame with pd.concat on every hit.
granger_records = []

# (year, season, fund) combinations whose test could not be run.
granger_skipped = []

# Unique year/season combinations, in order of first appearance.
year_season_combos = df[['year', 'season']].drop_duplicates()

# itertuples yields each column's own values; iterrows would first coerce
# 'year' and 'season' into one shared dtype per row.
for year, season in year_season_combos.itertuples(index=False, name=None):
    # Rows belonging to the current year/season.
    filtered_data = df[(df['year'] == year) & (df['season'] == season)]

    for column in columns_to_analyze:
        try:
            # grangercausalitytests checks whether the SECOND column
            # ('compound') Granger-causes the FIRST column (the fund series).
            granger_test_results = grangercausalitytests(
                filtered_data[[column, 'compound']], maxlag=max_lag, verbose=False
            )
        except ValueError as err:
            # The test raises ValueError when it cannot be run on this subset
            # (for example too few rows for the requested lags). Record the
            # skip so it is visible instead of silently ignored.
            granger_skipped.append((year, season, column, str(err)))
            continue

        for lag in range(1, max_lag + 1):
            # Element 1 of the 'ssr_ftest' entry is the p-value of the SSR-based F test.
            p_value = granger_test_results[lag][0]['ssr_ftest'][1]
            if p_value < 0.05:
                granger_records.append({
                    'year': year,
                    'season': season,
                    'fund name': column,
                    'lag': lag,
                    'p-value': p_value
                })

if granger_skipped:
    print(f'Skipped {len(granger_skipped)} year/season/fund tests that raised ValueError.')

# `columns` keeps the header even when nothing was significant.
granger_results = pd.DataFrame(
    granger_records, columns=['year', 'season', 'fund name', 'lag', 'p-value']
)

# Export the Granger causality test results to a CSV file and download it.
granger_results.to_csv('granger_results_shorter.csv', index=False)
files.download('granger_results_shorter.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Pearson R Correlation**

In [None]:
from scipy.stats import pearsonr
import pandas as pd

# Statistically significant results (p < 0.05) are collected as records and
# converted to a DataFrame once, instead of re-copying the frame with
# pd.concat on every hit.
pearson_records = []

# (year, season, fund) combinations for which pearsonr could not be computed.
pearson_skipped = []

# Unique year/season combinations, in order of first appearance.
year_season_combos = df[['year', 'season']].drop_duplicates()

# itertuples yields each column's own values; iterrows would first coerce
# 'year' and 'season' into one shared dtype per row.
for year, season in year_season_combos.itertuples(index=False, name=None):
    # Rows belonging to the current year/season.
    filtered_data = df[(df['year'] == year) & (df['season'] == season)]

    # Correlate 'compound' with each fund column within this subset.
    for column in columns_to_analyze:
        X = filtered_data['compound']
        y = filtered_data[column]

        try:
            # Pearson correlation coefficient and its p-value.
            correlation, p_value = pearsonr(X, y)
        except ValueError as err:
            # pearsonr raises ValueError for inputs it cannot process (for
            # example fewer than two observations). Record the skip so it is
            # visible instead of silently ignored.
            pearson_skipped.append((year, season, column, str(err)))
            continue

        # Keep only statistically significant correlations.
        if p_value < 0.05:
            pearson_records.append({
                'year': year,
                'season': season,
                'fund name': column,
                'p-value': p_value,
                'correlation coefficient': correlation
            })

if pearson_skipped:
    print(f'Skipped {len(pearson_skipped)} year/season/fund correlations that raised ValueError.')

# `columns` keeps the header even when nothing was significant.
pearson_results = pd.DataFrame(
    pearson_records,
    columns=['year', 'season', 'fund name', 'p-value', 'correlation coefficient']
)

# Export the significant results to a CSV file and download it.
pearson_results.to_csv('pearson_results_shorter.csv', index=False)
files.download('pearson_results_shorter.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Machine Learning: SVR, LGB, XGB, GBR, Random Forest**

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import pandas as pd
from google.colab import files

# Fixed seed for the estimators that resample at random, so that re-running
# the notebook reproduces the same metrics.
random_seed = 42

# Display name -> factory producing a fresh, unfitted regressor. A new model is
# built for every (year, season, fund) subset so no fitted state carries over.
model_factories = {
    'Support Vector Regression': lambda: SVR(),
    'Random Forest Regression': lambda: RandomForestRegressor(random_state=random_seed),
    'Gradient Boosting Regression': lambda: GradientBoostingRegressor(random_state=random_seed),
    'LightGBM Regression': lambda: lgb.LGBMRegressor(),
    'XGBoost Regression': lambda: xgb.XGBRegressor(),
}

# Metric rows are collected as records and converted to a DataFrame once,
# instead of re-copying the frame with pd.concat for every subset.
ml_records = []

# Unique year/season combinations, in order of first appearance.
year_season_combos = df[['year', 'season']].drop_duplicates()

# itertuples yields each column's own values; iterrows would first coerce
# 'year' and 'season' into one shared dtype per row.
for year, season in year_season_combos.itertuples(index=False, name=None):
    # Rows belonging to the current year/season.
    filtered_data = df[(df['year'] == year) & (df['season'] == season)]

    # Run every model on each fund column, using 'compound' as the only feature.
    for column in columns_to_analyze:
        X = filtered_data['compound']
        y = filtered_data[column]

        if X.empty or y.empty:
            continue

        # Estimators expect a 2-D feature matrix and a 1-D target. The
        # original passed the target as an (n, 1) column vector, which
        # scikit-learn flattens with a DataConversionWarning.
        features = X.values.reshape(-1, 1)
        target = y.values

        for model_name, make_model in model_factories.items():
            model = make_model()
            model.fit(features, target)

            # Predictions are made on the same rows the model was fitted on,
            # so the metrics below are in-sample errors.
            predictions = model.predict(features)

            ml_records.append({
                'year': year,
                'season': season,
                'fund name': column,
                'model': model_name,
                'mape': mean_absolute_percentage_error(target, predictions),
                # mean_squared_error(..., squared=False) was deprecated in
                # scikit-learn 1.4 and removed in 1.6; taking the square root
                # of the MSE gives the same RMSE on every version.
                'rmse': mean_squared_error(target, predictions) ** 0.5
            })

# `columns` keeps the header even when no subset produced results.
ml_results = pd.DataFrame(
    ml_records, columns=['year', 'season', 'fund name', 'model', 'mape', 'rmse']
)

# Export the ML results to a CSV file and download it.
ml_results.to_csv('ml_results_shorter.csv', index=False)
files.download('ml_results_shorter.csv')
