In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from datetime import datetime, timedelta

In [2]:
datastocks = pd.read_csv('data_snp_stocks.csv')
datastocks['date'] = pd.to_datetime(datastocks['date'], format = 'mixed')
#datastocks.set_index('date', inplace=True)
tickers = datastocks['Name'].unique()

In [3]:
datastocks

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.10,14.27,8126000,AAL
3,2013-02-13,14.30,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL
...,...,...,...,...,...,...,...
619035,2018-02-01,76.84,78.27,76.69,77.82,2982259,ZTS
619036,2018-02-02,77.53,78.12,76.73,76.78,2595187,ZTS
619037,2018-02-05,76.64,76.92,73.18,73.83,2962031,ZTS
619038,2018-02-06,72.74,74.56,72.13,73.27,4924323,ZTS


In [12]:
snp500 = pd.read_csv('spx.csv',)
snp500['date'] = pd.to_datetime(snp500['date'], format='mixed')
snp500['MarketReturn'] = np.log(snp500['close']/snp500['close'].shift(1))
snp500 = snp500[(snp500['date'] >= pd.to_datetime('2013-02-11')) & (snp500['date'] <= pd.to_datetime('2018-02-07'))]
snp500

Unnamed: 0,date,close,MarketReturn
6835,2013-02-11,1517.01,-0.000606
6836,2013-02-12,1519.43,0.001594
6837,2013-02-13,1520.33,0.000592
6838,2013-02-14,1521.38,0.000690
6839,2013-02-15,1519.79,-0.001046
...,...,...,...
8088,2018-02-01,2821.98,-0.000648
8089,2018-02-02,2762.13,-0.021437
8090,2018-02-05,2648.94,-0.041843
8091,2018-02-06,2695.14,0.017291


In [5]:
datastocks.dtypes

date      datetime64[ns]
open             float64
high             float64
low              float64
close            float64
volume             int64
Name              object
dtype: object

In [6]:
datastocks.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [7]:
datastocks.isna().sum()

date       0
open      11
high       8
low        8
close      0
volume     0
Name       0
dtype: int64

In [8]:
#data.sort_values(by=['Name', 'date'], inplace=True)
datastocks['LogReturn'] = datastocks.groupby('Name')['close'].transform(lambda x: np.log(x / x.shift(1)))
datastocks['LogReturn'].isna().sum()
datastocks = datastocks[datastocks['LogReturn'].notna()]
data = datastocks.merge(snp500, left_on='date', right_on='date')
data

Unnamed: 0,date,open,high,low,close_x,volume,Name,LogReturn,close_y,MarketReturn
0,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL,-0.019857,1517.01,0.999394
1,2013-02-12,14.45,14.51,14.10,14.27,8126000,AAL,-0.013227,1519.43,1.001595
2,2013-02-13,14.30,14.94,14.25,14.66,10259500,AAL,0.026963,1520.33,1.000592
3,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL,-0.046780,1521.38,1.000691
4,2013-02-15,13.93,14.61,13.93,14.50,15628000,AAL,0.035806,1519.79,0.998955
...,...,...,...,...,...,...,...,...,...,...
618530,2018-02-01,76.84,78.27,76.69,77.82,2982259,ZTS,0.014106,2821.98,0.999352
618531,2018-02-02,77.53,78.12,76.73,76.78,2595187,ZTS,-0.013454,2762.13,0.978791
618532,2018-02-05,76.64,76.92,73.18,73.83,2962031,ZTS,-0.039179,2648.94,0.959021
618533,2018-02-06,72.74,74.56,72.13,73.27,4924323,ZTS,-0.007614,2695.14,1.017441


The equation for the rolling regression:

$r_i -r_f = \alpha_i + \beta_i (R_M - r_f) + e_i$

Where:

$r_i$ is the return of the stock $i$ \
$r_f$ is the return of the risk free asset \
$\alpha_i$ is the intercept of the regression, or the excess return \
$\beta_i$ is the regression coefficient \
$R_M$ is the return of the market (log-return) \
$e_i$ is the specific risk of stock $i$


In [9]:
df_coeff = pd.DataFrame(columns=['Ticker', 'alpha', 'alpha_significance', 'beta', 'r_squared', 'specific_risk', 'systematic_risk'])

In [None]:
for ticker in tickers:
    dataLoop = data[data['Name'] == ticker]

    # Drop rows with missing values
    dataLoop = dataLoop.dropna(subset=['LogReturn', 'MarketReturn'])
    if len(dataLoop) < 180:
        print(f"Skipping {ticker} due to insufficient data for rolling window.")
        continue
    for i in range(0, len(dataLoop)-180):
        y = dataLoop['LogReturn'].iloc[i:i+180].values
        X = dataLoop['MarketReturn'].iloc[i:i+180].values  # Use market returns as the independent variable
        X = sm.add_constant(X)

        model = sm.OLS(y, X)
        result = model.fit()

        df_coeff = pd.concat([df_coeff, pd.DataFrame({
            'Ticker': [ticker],
            'alpha': [result.params[0]],
            'alpha_significance': result.pvalues[0],
            'beta': [result.params[1]],
            'r_squared': [result.rsquared],
            'specific_risk': [result.mse_resid],
            'systematic_risk': [result.params[1] ** 2 * np.var(dataLoop['MarketReturn'])]
        })], ignore_index=True)

print(df_coeff)

In [19]:
result.resid[-1]

np.float64(-0.021853270184063054)

In [None]:
df_coeff.to_csv('data_snp_stocks_coefficients.csv', index=False)

In [2]:
df_coeff = pd.read_csv('data_snp_stocks_coefficients.csv')

In [3]:
df_coeff.groupby('Ticker').mean()

Unnamed: 0_level_0,alpha,beta,r_squared,specific_risk,systematic_risk
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,-0.000206,1.271370,0.403908,0.012294,0.000094
AAL,0.000306,1.515926,0.234912,0.019769,0.000141
AAP,-0.000193,0.926173,0.171729,0.016492,0.000052
AAPL,0.000484,0.962868,0.270297,0.012122,0.000059
ABBV,0.000150,1.107772,0.264815,0.013798,0.000074
...,...,...,...,...,...
XYL,0.000263,1.151586,0.406986,0.010439,0.000078
YUM,-0.000339,1.065282,0.262721,0.014893,0.000068
ZBH,-0.000045,0.984739,0.318331,0.010942,0.000057
ZION,-0.000118,1.418175,0.391371,0.012728,0.000121


In [4]:
df_coeff.to_csv('df_coeff.csv', index=False)