In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import yfinance as yf

In [5]:
def name_ticker_pair():
    """Return a new dict mapping short asset names to their ticker symbols.

    The dict is rebuilt on every call, so callers may mutate the result
    freely. Insertion order follows the listing below.
    """
    # (name, ticker) pairs; presumably Yahoo Finance symbols given the
    # yfinance import in this notebook -- verify before using elsewhere.
    entries = (
        ("silver", "SI=F"),
        ("bac", "BAC"),
        ("citi", "C"),
        ("corn", "ZC=F"),
        ("euro", "EURUSD=X"),
        ("gold", "GC=F"),
        ("iyr", "IYR"),
        ("oil", "CL=F"),
        ("pound", "GBPUSD=X"),
        ("soybns", "ZS=F"),
        ("tr5yr", "^FVX"),
        ("tr10yr", "^TNX"),
        ("wheat", "ZW=F"),
        ("yen", "JPY=X"),
    )
    return dict(entries)

In [6]:
# Download the price history for BAC and C over one shared date window.
# NOTE(review): the column layout returned by yf.download depends on the
# installed yfinance version -- confirm that bac_yf['Close'] is a single Series.
price_window = dict(start='2010-01-05', end='2024-01-22', progress=False)
bac_yf = yf.download('BAC', **price_window)
citi_yf = yf.download('C', **price_window)

# Sanity check: both frames should cover the same number of rows.
print(bac_yf.shape, citi_yf.shape)

(3534, 6) (3534, 6)


In [7]:
# Load the RND statistics tables for BAC and Citi from the local data folder.
rnd_paths = ('data/bac_3.csv', 'data/citi_3.csv')
bac_rnd, citi_rnd = map(pd.read_csv, rnd_paths)

# Sanity check: the two tables are expected to have matching shapes.
print(bac_rnd.shape, citi_rnd.shape)

(613, 14) (613, 14)


In [8]:
# transform/clean data: index each RND table by the parsed observation date
def _date_indexed(rnd):
    """Parse the 'idt' column (format %m/%d/%y) into a 'Date' index.

    Adds a 'Date' column to `rnd`, then returns a new frame indexed by
    'Date' with the original 'idt' text column removed.
    """
    rnd['Date'] = pd.to_datetime(rnd['idt'], format='%m/%d/%y')
    return rnd.set_index('Date').drop('idt', axis=1)


bac_rnd = _date_indexed(bac_rnd)
citi_rnd = _date_indexed(citi_rnd)

In [9]:
# Add two derived features to both tables: the sum and the difference of
# the prDec and prInc columns.
for rnd in (bac_rnd, citi_rnd):
    rnd['dec_plus_inc'] = rnd['prDec'] + rnd['prInc']
    rnd['dec_minus_inc'] = rnd['prDec'] - rnd['prInc']


In [10]:
# for BAC: forward simple returns of the close over 10, 20, ..., 90 rows ahead.
# shift(-day) pulls the close from `day` rows later onto the current row, so
# the last `day` rows of each column are NaN.
close = bac_yf['Close']
fut_ret = pd.DataFrame(
    {f'{day}day_ret': (close.shift(-day) - close) / close for day in range(10, 100, 10)}
)
fut_ret

Unnamed: 0_level_0,10day_ret,20day_ret,30day_ret,40day_ret,50day_ret,60day_ret,70day_ret,80day_ret,90day_ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-05,0.017901,-0.041358,-0.019753,0.012346,0.054321,0.113580,0.136420,0.100617,0.008642
2010-01-06,-0.056132,-0.100061,-0.031116,0.018914,0.026236,0.106162,0.122026,0.101891,-0.002440
2010-01-07,-0.119906,-0.113999,-0.042528,-0.011223,0.001772,0.092144,0.099232,0.037212,-0.057885
2010-01-08,-0.107271,-0.137068,-0.050060,0.001192,0.020858,0.109654,0.089392,0.044696,-0.028010
2010-01-11,-0.127584,-0.145304,-0.035440,0.010632,0.037803,0.101595,0.095097,-0.038393,-0.096279
...,...,...,...,...,...,...,...,...,...
2024-01-12,,,,,,,,,
2024-01-16,,,,,,,,,
2024-01-17,,,,,,,,,
2024-01-18,,,,,,,,,


In [11]:
# Columns of the BAC RND table used as candidate predictors, including the
# two derived dec_plus_inc / dec_minus_inc columns.
feature = [
    'mu', 'sd', 'skew', 'kurt',
    'p10', 'p50', 'p90',
    'prDec', 'prInc',
    'dec_plus_inc', 'dec_minus_inc',
]

# Work on an independent copy so later edits do not touch bac_rnd.
feature_df = bac_rnd.loc[:, feature].copy()
feature_df.describe()

Unnamed: 0,mu,sd,skew,kurt,p10,p50,p90,prDec,prInc,dec_plus_inc,dec_minus_inc
count,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0
mean,-0.005387,0.150914,-0.556849,1.051346,-0.198459,0.007936,0.168623,0.093237,0.065635,0.158873,0.027602
std,0.007952,0.048331,0.219521,0.460131,0.068411,0.010805,0.04778,0.045057,0.051612,0.094778,0.020125
min,-0.04696,0.09423,-1.19664,0.21382,-0.77608,-0.01725,0.11048,0.02736,0.00918,0.04054,-0.1411
25%,-0.00863,0.1201,-0.71889,0.68739,-0.22327,0.00216,0.1387,0.05957,0.03216,0.09332,0.02025
50%,-0.00421,0.13725,-0.54389,1.00943,-0.1793,0.00656,0.15459,0.08155,0.0489,0.13115,0.02798
75%,-0.00077,0.16771,-0.39342,1.31437,-0.15624,0.01234,0.18559,0.11847,0.08368,0.20218,0.03846
max,0.05456,0.47582,0.22005,2.65312,-0.11917,0.10345,0.49188,0.27237,0.38023,0.63836,0.07967


In [12]:
# Attach the forward returns to each feature row by matching on the Date index.
# A left join keeps every feature row; dates with no matching row in fut_ret
# get NaN in the return columns.
df = pd.merge(feature_df, fut_ret, how='left', left_index=True, right_index=True)
df

Unnamed: 0_level_0,mu,sd,skew,kurt,p10,p50,p90,prDec,prInc,dec_plus_inc,dec_minus_inc,10day_ret,20day_ret,30day_ret,40day_ret,50day_ret,60day_ret,70day_ret,80day_ret,90day_ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-01-15,-0.02514,0.18210,-0.38173,0.85192,-0.25389,-0.01634,0.19272,0.14898,0.09307,0.24205,0.05591,-0.051661,-0.067651,0.012300,0.047356,0.092251,0.193112,0.093481,0.049815,-0.048585
2010-01-29,-0.02194,0.21265,-0.79513,1.61552,-0.28826,0.00001,0.21786,0.16750,0.12009,0.28759,0.04741,-0.048090,0.100790,0.110013,0.188406,0.229908,0.150856,0.130435,0.020422,-0.011199
2010-02-12,-0.01699,0.21625,-0.86181,1.83248,-0.28670,0.00720,0.22358,0.16404,0.12713,0.29117,0.03691,0.156401,0.166090,0.248443,0.292042,0.208997,0.187543,0.071972,0.038754,0.067820
2010-02-26,-0.00965,0.17893,-0.70108,1.40641,-0.23570,0.00776,0.19373,0.12882,0.09233,0.22115,0.03649,0.011405,0.074430,0.120048,0.083433,0.038415,-0.075630,-0.079832,-0.064826,-0.117047
2010-03-15,0.00006,0.17357,-0.71889,1.63480,-0.21657,0.01649,0.19608,0.11325,0.09545,0.20870,0.01780,0.070623,0.108012,0.036795,0.018398,-0.080712,-0.109199,-0.084273,-0.118101,-0.189318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-13,0.00352,0.11858,-0.55862,1.13971,-0.14633,0.01278,0.14065,0.05447,0.03384,0.08831,0.02063,0.057428,0.023720,,,,,,,
2023-12-20,0.00081,0.12808,-0.29318,0.71192,-0.16021,0.00599,0.15502,0.06324,0.05067,0.11391,0.01257,0.043966,,,,,,,,
2023-12-27,0.00169,0.12350,-0.40609,0.92177,-0.15376,0.00863,0.14759,0.05846,0.04262,0.10108,0.01584,-0.020390,,,,,,,,
2024-01-03,0.00161,0.12852,-0.40098,0.84767,-0.16048,0.00854,0.15460,0.06398,0.04890,0.11288,0.01508,-0.053683,,,,,,,,


In [14]:
# Correlation of every feature column with every forward-return column.
# Rows are horizons (c_10 ... c_90), columns are features, values are rounded
# to 3 decimals. The horizons are declared once so the return-column names and
# the row labels cannot drift apart.
horizons = range(10, 100, 10)
ret_cols = [f'{h}day_ret' for h in horizons]

# Everything in df that is not a forward-return column is treated as a feature.
feature_cols = [col for col in df.columns if col not in ret_cols]

# Build all columns first and construct the frame once, rather than growing it
# column by column.
corr_df = pd.DataFrame(
    {
        fname: pd.Series(
            {f'c_{h}': round(df[fname].corr(df[f'{h}day_ret']), 3) for h in horizons},
            dtype=float,
        )
        for fname in feature_cols
    }
)

corr_df

Unnamed: 0,mu,sd,skew,kurt,p10,p50,p90,prDec,prInc,dec_plus_inc,dec_minus_inc
c_10,-0.033,0.059,0.056,-0.051,-0.058,-0.026,0.071,0.066,0.068,0.068,-0.025
c_20,0.016,0.103,0.08,-0.046,-0.091,0.011,0.131,0.102,0.124,0.116,-0.089
c_30,-0.015,0.133,0.053,-0.007,-0.121,0.008,0.158,0.141,0.151,0.149,-0.073
c_40,-0.075,0.139,0.039,0.014,-0.13,-0.032,0.153,0.162,0.149,0.158,-0.022
c_50,-0.108,0.177,0.031,0.016,-0.172,-0.021,0.188,0.197,0.183,0.193,-0.029
c_60,-0.118,0.188,0.036,0.007,-0.185,-0.019,0.198,0.205,0.193,0.203,-0.038
c_70,-0.108,0.181,0.042,0.002,-0.173,-0.025,0.193,0.197,0.186,0.195,-0.038
c_80,-0.111,0.197,0.045,-0.02,-0.192,-0.008,0.211,0.209,0.204,0.21,-0.059
c_90,-0.104,0.204,0.051,-0.029,-0.197,0.0,0.219,0.214,0.215,0.219,-0.073


In [27]:
import statsmodels.api as sm
import seaborn as sns

# Univariate OLS: regress the 60-row forward return on dec_plus_inc.
# dropna() already returns a new frame, so no explicit copy is needed.
select = df[['60day_ret', 'dec_plus_inc']].dropna()

# Select by column name rather than position so reordering `select` cannot
# silently swap the regressor and the response.
X = sm.add_constant(select['dec_plus_inc'])
y = select['60day_ret']

# NOTE(review): the observation dates in df are closer together than the
# 60-row return horizon, so the return windows overlap; the default nonrobust
# standard errors may be too small -- consider HAC errors and confirm.
model = sm.OLS(y, X)
result = model.fit()
print('Regression results:')
print(result.summary())

Regreesion results:
                            OLS Regression Results                            
Dep. Variable:              60day_ret   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     25.67
Date:                Wed, 07 Feb 2024   Prob (F-statistic):           5.41e-07
Time:                        19:03:50   Log-Likelihood:                 270.04
No. Observations:                 600   AIC:                            -536.1
Df Residuals:                     598   BIC:                            -527.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.0279     