DEFINING FILEPATH AND IMPORTS

In [None]:
### IMPORTS
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan, het_goldfeldquandt, het_white
from scipy import stats

### IMPORT FILES FROM DATA_FUNCTIONS - PROVIDED IN PROJECT FOLDER
### MAKE SURE TO PUT YOUR FILE PATH HERE
### FOR EXAMPLE - r'C:\Users\rest of your path to the folder labeled data functions'
sys.path.insert(0, r'YOUR FILEPATH TO THE DATA_FUNCTIONS FOLDER')
import data_combine, data_financial_import, data_financial_CSVimport, data_indicator_bollingerbands, data_indicator_movingaverage, data_indicator_pricechange, data_indicator_tradingvolume, data_indicator_volatility, data_outliers_by_column, data_standardize_timeseries, data_transformations

DEFINE DATE RANGE AND TICKER

In [None]:
### TICKER SYMBOL USED FOR THE STOCK DOWNLOAD, THE INDICATORS AND THE GENERATED FILE NAMES
ticker = 'NVDA'

### DATE RANGE HANDED TO THE ECONOMIC DATA IMPORT, THE STOCK DOWNLOAD AND EVERY INDICATOR
start_date, end_date = '2019-01-01', '2022-01-01'

GET DATA FROM USER PROVIDED FILES

In [None]:
### USER PROVIDED CSV FILES WITH ECONOMIC DATA.
### KEEP ECONOMIC_DATA.CSV IN THE LIST IF YOU ARE USING DATA FROM IT.
### EVERY FILE IN THE LIST MUST BE INDEXED BY DATE.
csv_files_userProvided = ['economic_data.csv']

### RUN THE ECONOMIC DATA IMPORT ONCE PER LISTED FILE, OVER THE DATE RANGE DEFINED ABOVE
for economic_csv in csv_files_userProvided:
    data_financial_CSVimport.get_economic_data(economic_csv, start_date, end_date)

GET YFINANCE DATA AND CALCULATE INDICATORS

In [None]:
### EVERY STEP TAKES THE SAME (TICKER, START, END) ARGUMENTS.
### THE STOCK DOWNLOAD IS LISTED FIRST AND THE INDICATORS FOLLOW IN THEIR ORIGINAL ORDER.
data_steps = (
    data_financial_import.download_stock_data,
    data_indicator_bollingerbands.calculate_bollinger_bands,
    data_indicator_movingaverage.calculate_moving_averages,
    data_indicator_pricechange.calculate_price_change,
    data_indicator_tradingvolume.calculate_trading_volume,
    data_indicator_volatility.calculate_volatility,
)

for data_step in data_steps:
    data_step(ticker, start_date, end_date)

STANDARDIZE DATA - HERE YOU CHOOSE THE FREQUENCY FOR ALL OF THE GATHERED DATA

--- ALL CSV FILES SHOULD BE OVER THE SAME DATE RANGE.

In [None]:
### ONLY ADD USER PROVIDED CSV FILES TO THIS LIST
### ECONOMIC_DATA.CSV AND INDICATOR/STOCK DATA ARE ALREADY INCLUDED
csv_files = [
    f'{ticker}_data.csv',
    'selected_economic_data.csv',
    f'indicator_{ticker}_bollinger_bands.csv',
    f'indicator_{ticker}_moving_averages.csv',
    f'indicator_{ticker}_price_change.csv',
    f'indicator_{ticker}_trading_volume.csv',
    f'indicator_{ticker}_volatility.csv',
]

### DEFINE FREQUENCY AND FILL METHOD
frequency = 'weekly'  # Replace with your desired frequency
fill_method = 'fill_avg'  # Replace with your desired fill method

### STANDARDIZE EACH FILE AND KEEP WHAT standardize_data RETURNS FOR IT, IN THE SAME ORDER AS csv_files
new_files = [
    data_standardize_timeseries.standardize_data(source_csv, frequency, fill_method)
    for source_csv in csv_files
]

COMBINE THE STANDARDIZED CSVS TO HAVE ALL DATA IN ONE CSV ORGANIZED BY COLUMN

In [None]:
### COMBINE THE STANDARDIZED FILES INTO THE SINGLE CSV THAT THE VARIABLE SELECTION CELL READS
combined_csv = 'variable_data_menu.csv'
data_combine.merge_csv_files(new_files, combined_csv)

VARIABLE SELECTION

---HERE IS WHERE YOU SHOULD COME BACK TO MAKE CHANGES TO YOUR VARIABLE CHOICES IF NEEDED.              
---MAKE SURE TO RUN ALL MODULES FOLLOWING THIS ONE AFTER MAKING VARIABLE CHANGES.

In [None]:
### LOAD THE COMBINED MENU OF CANDIDATE VARIABLES
df = pd.read_csv('variable_data_menu.csv')

dependent_variable = 'Close'  # Replace with your actual column title
independent_variables = ['Daily Change','Unemployment Rate', 'rf_13 WEEK']  # Replace with your actual column titles

index_column = 'Date'  # CHOOSE A COLUMN TO INDEX YOUR DATA

### FAIL EARLY, NAMING EVERY MISSING COLUMN AT ONCE, INSTEAD OF STOPPING ON THE FIRST BAD LOOKUP
missing_columns = [
    column
    for column in [index_column, dependent_variable, *independent_variables]
    if column not in df.columns
]
if missing_columns:
    raise KeyError(f'Columns not found in variable_data_menu.csv: {missing_columns}')

### INDEX ONCE, WITHOUT MUTATING ANYTHING IN PLACE.
### drop=False keeps the index column available as a regular column too, so it can still be
### selected as a variable if it is listed above.
indexed_df = df.set_index(index_column, drop=False)
index_data = df[index_column]

y = indexed_df[[dependent_variable]]
X = indexed_df[independent_variables]

### SAVE THE SELECTION FOR THE TRAIN/TEST SPLIT CELL
y.to_csv('dependent_variable_data.csv')
X.to_csv('independent_variable_data.csv')

SPLIT INDEPENDENT AND DEPENDENT VARIABLE DATA INTO TRAIN AND TEST DATA SETS

In [None]:
### RELOAD THE SELECTED VARIABLES, USING THE FIRST CSV COLUMN AS THE INDEX
dependent_variable_data = pd.read_csv('dependent_variable_data.csv', index_col=0)
independent_variable_data = pd.read_csv('independent_variable_data.csv', index_col=0)

### HOLD OUT 20% OF THE ROWS AS THE TEST SET; random_state FIXES THE SPLIT SO IT IS REPRODUCIBLE
X_train, X_test, y_train, y_test = train_test_split(
    independent_variable_data,
    dependent_variable_data,
    test_size=0.2,
    random_state=42,
)

### SAVE THE TRAINING AND TEST SETS TO NEW CSV FILES
split_outputs = {
    'train_independent_variable_data.csv': X_train,
    'test_independent_variable_data.csv': X_test,
    'train_dependent_variable_data.csv': y_train,
    'test_dependent_variable_data.csv': y_test,
}
for split_path, split_frame in split_outputs.items():
    split_frame.to_csv(split_path)

REGRESSION DIAGNOSTICS

In [None]:
### LOAD THE TRAINING DATA
### The train/test split shuffles the rows, but the Durbin-Watson and Goldfeld-Quandt checks below depend on the
### order of the observations, so the rows are put back in index order here (stable sort, applied identically to
### both frames so they stay aligned). The OLS coefficients themselves do not depend on row order.
### NOTE(review): this assumes the index sorts chronologically (e.g. ISO 'YYYY-MM-DD' date strings) - confirm
### against the standardized CSV files.
X_train = pd.read_csv('train_independent_variable_data.csv', index_col=0).sort_index(kind='mergesort')
y_train = pd.read_csv('train_dependent_variable_data.csv', index_col=0).sort_index(kind='mergesort')

### FIT THE MODEL
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())
print()
print()
                                            ### HOW TO INTERPRET THE MODEL SUMMARY:

### R-squared: This is the coefficient of determination. It tells you the proportion of the variance in the dependent variable that can be explained by the independent variables.
### R-squared values range from 0 to 1. An R-squared of 100% indicates that all changes in the dependent variable are completely explained by changes in the independent variables.

### Adjusted R-squared: This is the R-squared adjusted based on the number of predictors in the model.
### It only increases if the new predictor improves the model more than would be expected by chance. It decreases when a predictor improves the model by less than expected by chance.

### F-statistic: This is a good indicator of whether there is a relationship between the predictor and the response variable. The further the F-statistic is from 1, the better it is.

### coef: These are the coefficients of the independent variables in the regression equation.

### std err: These are the standard errors of the coefficients. The standard errors can be used to compute confidence intervals and to carry out hypothesis tests for the coefficients.

### P > |t|: These are the p-values associated with the null hypothesis that the coefficient equals zero (no effect).
### A low p-value (< 0.05) indicates that you can reject the null hypothesis.

### Confidence Interval: These are the 95% confidence intervals for the coefficients. If zero is not in the interval, it suggests the coefficient is significantly different from zero.



###                                               CHECKING OLS ASSUMPTIONS
print('CHECKING OLS ASSUMPTIONS:')
print()
print()
### Linearity: A scatter plot of observed vs. predicted values is used to check the linearity assumption.
###             If the relationship is linear, the points should roughly form a straight line.
plt.scatter(y_train, results.predict())
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.title('Check for Linearity')
print('Linearity: A scatter plot of observed vs. predicted values is used to check the linearity assumption.')
print('If the relationship is linear, the points should roughly form a straight line.')
plt.show()
print()
### Independence: The Durbin-Watson test is used to check the independence assumption.
###                 The test statistic is approximately equal to 2*(1-r), where r is the sample autocorrelation of the residuals.
###                 Thus, for r == 0, indicating no serial correlation, the test statistic equals 2.
###                 The residuals are in index order because the training data was sorted when it was loaded.
print('Independence: The Durbin-Watson test is used to check the independence assumption.')
print('The test statistic is approximately equal to 2*(1-r), where r is the sample autocorrelation of the residuals.')
print('Thus, for r == 0, indicating no serial correlation, the test statistic equals 2.')
print('Durbin-Watson test statistic:', sm.stats.durbin_watson(results.resid))
print()
### Homoscedasticity: A scatter plot of residuals vs. predicted values is used to check the homoscedasticity assumption.
###                  If the variance of the errors is constant across all levels of the independent variables, the plot should show a cloud of points without a clear pattern.
plt.scatter(results.predict(), results.resid)
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.title('Check for Homoscedasticity')
print('Homoscedasticity: A scatter plot of residuals vs. predicted values is used to check the homoscedasticity assumption.')
print('If the variance of the errors is constant across all levels of the independent variables, the plot should show a cloud of points without a clear pattern.')
plt.show()
print()
### Multicollinearity: The Variance Inflation Factor (VIF) is used to check the multicollinearity assumption.
###                     VIF quantifies how much the variance is inflated due to multicollinearity.
###                     VIF of 1 indicates no correlation, while VIF exceeding 5 or 10 indicates high multicollinearity.
###                     The VIF is computed on the fitted model's design matrix, which includes the intercept column;
###                     leaving the intercept out gives uncentered VIFs that do not match this model.
###                     The intercept column itself ('const') is not reported.
model_exog = results.model.exog
vif_rows = [
    (variance_inflation_factor(model_exog, position), name)
    for position, name in enumerate(results.model.exog_names)
    if name != 'const'
]
vif = pd.DataFrame(vif_rows, columns=['VIF Factor', 'features'])
print('Variance Inflation Factor (VIF) is used to check the multicollinearity assumption.')
print('VIF of 1 indicates no correlation, while VIF exceeding 5 or 10 indicates high multicollinearity.')
print()
print(vif)
print()
### Normality: A QQ plot of the residuals and the Shapiro-Wilk test are used to check the normality assumption.
###            If the errors are normally distributed, the points in the QQ plot should roughly fall on a straight line.
###            The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.
sm.qqplot(results.resid, line='s')
plt.title('Check for Normality')
print('Normality: A QQ plot of the residuals and the Shapiro-Wilk test are used to check the normality assumption.')
print('If the errors are normally distributed, the points in the QQ plot should roughly fall on a straight line.')
plt.show()
print()
print('The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.')
print('Shapiro-Wilk test:', stats.shapiro(results.resid))
print()
print()



###                                           CHECKING FOR NORMALITY IN ERROR TERMS
print('CHECKING FOR NORMALITY IN ERROR TERMS!')
print()
print()
### Histogram of Residuals: This plot shows the distribution of residuals. It is used to check the assumption of normality in the error terms.
### Ideally, the residuals should be normally distributed, which means the histogram should resemble a bell curve. If the residuals are not normally distributed, it
### may suggest that linear regression is not the best model for the data.
plt.hist(results.resid, bins=30)
plt.xlabel('Residual')
plt.ylabel('Frequency')
plt.title('Histogram of Residuals')
print('Histogram of Residuals: This plot shows the distribution of residuals. It is used to check the assumption of normality in the error terms.')
print('If the data is not bell curve shaped then then the assumption of normally distributed errors was false.')
print()
plt.show()
print()
print()



###                                             CHECKING FOR HETEROSCEDASTICITY
### The three statistical tests used to check for heteroskedasticity are the Breusch-Pagan test, Goldfeld-Quandt test, and White test. These tests have null
### hypothesis that the error variances are all equal (homoscedasticity) and alternative hypothesis that the error variances are not equal (heteroskedasticity).
print('CHECKING FOR HETEROSKEDASTICITY:')
print()
print()
bp_test = het_breuschpagan(results.resid, results.model.exog)
print('The Breusch-Pagan test is a statistical procedure that returns four key values. The first is the Lagrange multiplier statistic, which serves as the test statistic.')
print('The second is the p-value of the Lagrange multiplier test. If this value is less than your chosen significance level, it indicates')
print('that you should reject the null hypothesis of homoskedasticity. The third value is the f-value of the hypothesis that the error variance does not depend on x, providing')
print('another test statistic. The fourth and final value is the p-value of the f statistic. Again, if this value is less than your chosen significance level,')
print('it suggests that the null hypothesis of homoskedasticity should be rejected.')
print()
print(f'Breusch-Pagan test: {bp_test}')
print()
print()

### Goldfeld-Quandt refits the regression on two parts of the sample, so it is given the same design matrix as the
### fitted model (intercept included). With the default settings it tests for a variance that increases along the
### row order of the data, which is the index order established when the training data was loaded.
gq_test = het_goldfeldquandt(results.model.endog, results.model.exog)
print('The Goldfeld-Quandt test is another statistical test that returns three values. The first is the F-statistic, the ratio of the residual variance in the second part of the sample')
print('to that in the first part. A value well above one indicates the presence of heteroskedasticity. The second value is the p-value for the null hypothesis that the variance of the ')
print('residuals is the same across the range of data. If this value is less than your chosen significance level, it suggests that the null hypothesis of homoskedasticity should be rejected.')
print('The third value is the alternative hypothesis that was tested ("increasing" by default, meaning the variance grows along the ordering of the data).')
print()
print(f'Goldfeld-Quandt test: {gq_test}')
print()
print()

white_test = het_white(results.resid, results.model.exog)
print('The White test is a statistical procedure that also returns four values. The first is the White test statistic. The second is the p-value for the test statistic.')
print('If this value is less than your chosen significance level, it indicates that you should reject the null hypothesis of homoskedasticity. The third value is ')
print('the F-statistic of the hypothesis that the error variance does not depend on x, providing another test statistic. The fourth and final value is the p-value for the')
print('F-statistic. Again, if this value is less than your chosen significance level, it suggests that the null hypothesis of homoskedasticity should be rejected.')
print()
print(f'White test: {white_test}')


--- IF YOU NEED TO MAKE CHANGES TO VARIABLE CHOICE REMEMBER THAT IT NEEDS TO BE DONE IN THE VARIABLE SELECTION MODULE ABOVE. 

--- MAKE SURE TO RUN THAT MODULE, THE MODULE TO SPLIT THE DATA, AND RUN THE DIAGNOSTIC MODULE AGAIN SO THAT THE REGRESSION GETS RUN AGAIN WITH THE CHANGES.

DEFINE ERROR TYPE AND GET REGRESSION RESULTS

--- AFTER MAKING ALL VARIABLE CHANGES YOU CAN SEE WHAT A CHANGE IN ERROR TYPE WILL DO TO THE MODEL. [IF NEEDED/INDICATED BY THE HETEROSKEDASTICITY TESTS]

In [None]:
error_type = 'HC0'      # 'HC0', 'HC1', 'HC2', 'HC3' or None for OLS

### HC0: This is the basic White estimator and can be used when there is heteroscedasticity.
### HC1: This adjusts the HC0 estimator for degrees of freedom and is useful when the sample size is small.
### HC2: This adjusts for leverage, which can be helpful when there are outliers in the predictor variables.
### HC3: This further adjusts for extreme leverage points and can be beneficial when there are extreme outliers in the predictor variables.

ROBUST_ERROR_TYPES = ('HC0', 'HC1', 'HC2', 'HC3')

if error_type in ROBUST_ERROR_TYPES:
    ### RE-ESTIMATE THE COVARIANCE OF THE FITTED TRAINING MODEL WITH THE CHOSEN ROBUST ESTIMATOR
    robust_results = results.get_robustcov_results(cov_type=error_type)
    print('ROBUST ERROR OLS REGRESSION RESULTS:')
    print(robust_results.summary())
elif error_type is None:
    ### None MEANS PLAIN OLS STANDARD ERRORS, SO SHOW THE ORIGINAL FIT INSTEAD OF PRINTING NOTHING
    print('OLS REGRESSION RESULTS:')
    print(results.summary())
else:
    ### A MISSPELLED OPTION USED TO BE SKIPPED SILENTLY; REPORT IT INSTEAD
    raise ValueError(
        f'Unknown error_type {error_type!r}; expected one of {ROBUST_ERROR_TYPES} or None'
    )

WHEN DONE WORKING WITH THE TRAIN DATA RUN THE REGRESSION ON THE TEST DATA SET AND MAKE SURE THE RESULTS ARE AS EXPECTED

In [None]:
#DEFINE ERROR TYPE
#MAKE SURE IT IS THE SAME ONE YOU DECIDED WAS BEST FOR TRAINING DATA
testdata_error_type = 'HC0'  #'HC0', 'HC1', 'HC2', 'HC3' or None for OLS

# A MISSPELLED OPTION USED TO BE SKIPPED SILENTLY; REPORT IT BEFORE DOING ANY WORK
if testdata_error_type not in ('HC0', 'HC1', 'HC2', 'HC3', None):
    raise ValueError(
        f"Unknown testdata_error_type {testdata_error_type!r}; expected 'HC0', 'HC1', 'HC2', 'HC3' or None"
    )

X_test = pd.read_csv('test_independent_variable_data.csv', index_col=0)
y_test = pd.read_csv('test_dependent_variable_data.csv', index_col=0)

# FIT MODEL ON TEST DATA
# The test fit gets its own names so it does not overwrite the training `model` / `results` /
# `robust_results`; the robust-error cell above can then be re-run and still report the training fit.
test_model = sm.OLS(y_test, sm.add_constant(X_test))
test_results = test_model.fit()
print(test_results.summary())
print()

print()
#GET TEST DATA REGRESSION RESULTS (WITH ROBUST ERROR RESULTS AS WELL WHEN APPLICABLE)
if testdata_error_type is not None:
    test_robust_results = test_results.get_robustcov_results(cov_type=testdata_error_type)
    print('ROBUST ERROR OLS REGRESSION RESULTS:')
    print(test_robust_results.summary())


IF THERE ARE STILL ISSUES RUNNING THE REGRESSION BECAUSE OF THE DATA THEN CHECK OUT THE .PY FILE WITH DATA TRANSFORMATIONS THAT SOMETIMES CAN MAKE A VARIABLE USEABLE IN A LINEAR REGRESSION. 

YOU CAN TRANSFORM THE DATA WITH THE FUNCTIONS IN THE DATA_TRANSFORMATIONS.PY FILE. APPLY THE TRANSFORMATION TO THE COLUMNS YOU NEED. THEN, TO USE THE TRANSFORMED DATA IN THE REGRESSION, SAVE IT AS ONE OF THE USER PROVIDED FILES AND RUN THE WHOLE PROGRAM AGAIN.

DIRECTION REGARDING WHEN EACH TRANSFORMATION SHOULD BE USED IS IN THE .PY FILE UNDER THE FUNCTIONS.

ALSO MORE REGRESSION MODELS WILL BE COMING SOON TO MY GITHUB FOR CASES WHERE OLS CONDITIONS CAUSE RESULTS TO BE INACCURATE. 
WILL HAVE POLY REGRESSION, POISSON REGRESSION, AND PANEL DATA REGRESSION WITH FIXED AND RANDOM EFFECTS