## Hyphothesis 5a: 
### The Alternative Portfolio will outperform Control Portfolio 1 and Control Portfolio 2 in 2019 for the selected thresholds: R=0.0, 0.05, 0.1, 0.15, 0.2

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from random import shuffle
import random
import seaborn as sns
import scipy
from scipy.stats import levene
from scipy.stats import ttest_ind
import scipy.stats as stats
from time import time




os.chdir('C:\\Users\\A Sua\\Documents\\FIU\\CAP2020')

In [3]:
%matplotlib notebook
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

### Import Price/Returns Data

In [4]:
df_price = pd.read_csv('data\\processed\\df_price07_20.csv')
df_return = pd.read_csv('data\\processed\\df_return7_20.csv')

df_price.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
df_return.rename(columns={'Unnamed: 0': 'date'}, inplace=True)

df_price.index = pd.to_datetime(df_price['date'])
df_return.index = pd.to_datetime(df_return['date'])

df_price = df_price.drop('date', axis=1)
df_return = df_return.drop('date', axis=1)


dfsp = df_price['SPX']

df_price = df_price.drop('SPX', axis=1)
df_return = df_return.drop('SPX', axis=1)

#### RETURNS: 8/2008-12/2008: S&P

In [5]:
return_sp08 = ((dfsp[dfsp.index=='2008-12-31'].values-dfsp[dfsp.index=='2008-09-02'].values)/dfsp[dfsp.index=='2008-09-02'].values)[0]
return_sp08

-0.29299926423394224

#### RETURNS: 2019: S&P

In [6]:
return_sp19 = ((dfsp[dfsp.index=='2019-12-31'].values-dfsp[dfsp.index=='2018-12-31'].values)/dfsp[dfsp.index=='2018-12-31'].values)[0]
return_sp19

0.28878068935992623

### Company Symbols

In [7]:
companies = df_price.columns

### RETURNS: 8/2008-12/2008: PER COMPANY

In [8]:
worst_days_returns08 = {}
for symbol in companies:
    temp_end = df_price.loc['2008-12-31', symbol].values
    temp_start = df_price.loc['2008-08-29', symbol].values
    temp_ret = ((temp_end-temp_start)/temp_start)[0]
    worst_days_returns08[symbol] = temp_ret
worst_days_returns08['ORCL']

-0.19151846785225715

## $$For\thinspace each\thinspace  Threshold\thinspace  in\thinspace  the\thinspace  Return\thinspace  Thresholds\thinspace$$<br> $$H^{1}_{0}: \mu_{alt} = \mu_{ctl1} \hspace{1cm} H^{1}_{A}: \mu_{alt} > \mu_{ctl1}$$<br><br>$$H^{2}_{0}: \mu_{alt} = \mu_{ctl2} \hspace{1cm} H^{2}_{A}: \mu_{alt} > \mu_{ctl2}$$

### Run Experiment

In [30]:
r_thresholds = [0, 0.05, 0.1, 0.15, 0.20] ## threshold value compared to market
portfolio_size = [25] ## number of stocks in each portfolio
n_samples = [10000]


experiment_df = pd.DataFrame([])

#pd.DataFrame(comparison_dic, index=['sample1'])

t0 = time()
for value in r_thresholds:
    """generate/sample symbols"""
    for size in portfolio_size:
        for sample_size in n_samples:
            n_bootstraps = sample_size
            for i in range(n_bootstraps):
                comparison_dic = {}
                #symbols_dic = {}
                
                alt_list = []
                control_list1 = []
                        
                for symbol in companies:
                    temp_series = worst_days_returns08[symbol]
                    if temp_series-return_sp08>value:
                        alt_list.append(symbol)
                    else:
                        control_list1.append(symbol)
                #print('Alt pool size: {}, Ctl1 pool size {}'.format(len(alt_list), len(control_list1)))

                #print(len(alt_list),len(control_list))

#                 alt_symbols = random.sample(alt_list, size)
#                 control_symbols = random.sample(control_list, size)                
                alt_symbols = random.sample(alt_list, size)
                control_symbols1 = random.sample(control_list1, size)
                control_symbols2 = random.sample(list(companies), size)
                
                ### Add the 
                comparison_dic['alt_symbols'] = [alt_symbols]
                comparison_dic['control_symbols1'] = [control_symbols1]
                comparison_dic['control_symbols2'] = [control_symbols2]
                

                ### Generage Beginning/End Prices of the portfolios
                alt_close18 = df_price.loc[:, alt_symbols].loc['2018-12-31', :]
                alt_close19 = df_price.loc[:, alt_symbols].loc['2019-12-31', :]
                                
                ctl1_close18 = df_price.loc[:, control_symbols1].loc['2018-12-31', :]
                ctl1_close19 = df_price.loc[:, control_symbols1].loc['2019-12-31', :]
                               
                ctl2_close18 = df_price.loc[:, control_symbols2].loc['2018-12-31', :]
                ctl2_close19 = df_price.loc[:, control_symbols2].loc['2019-12-31', :]
                
                ### generate returns
                alt_return19 = (alt_close19.sum().sum()-alt_close18.sum().sum())/alt_close18.sum().sum()

                ctl1_return19 = (ctl1_close19.sum().sum()-ctl1_close18.sum().sum())/ctl1_close18.sum().sum()

                ctl2_return19 = (ctl2_close19.sum().sum()-ctl2_close18.sum().sum())/ctl2_close18.sum().sum()


                #### Add values to comparison dictionary
                comparison_dic['CTL1_19'] = ctl1_return19
                comparison_dic['CTL2_19'] = ctl2_return19
                comparison_dic['ALT_19'] = alt_return19
              
                
                comparison_dic['r_value'] = value
                comparison_dic['sample_size'] = n_bootstraps
                comparison_dic['portfolio_size'] = size
                ### Add to the dataframe
                temp_df = pd.DataFrame(comparison_dic)
                experiment_df = pd.concat([experiment_df, temp_df])
        if sample_size % 500 == 0:
            print('{} complete'.format((value, size)))
            t1 = round((time()-t0)/60, 4)
            print('{} minutes have passed'.format(t1))

        
### CAPTURE THE VARIANCE OF THE RETURNS FOR EACH OF THE SYMBOLS        
        
experiment_df.head()

(0, 25) complete
3.8064 minutes have passed
(0.05, 25) complete
7.3555 minutes have passed
(0.1, 25) complete
11.3399 minutes have passed
(0.15, 25) complete
16.1197 minutes have passed
(0.2, 25) complete
20.6392 minutes have passed


Unnamed: 0,alt_symbols,control_symbols1,control_symbols2,CTL1_19,CTL2_19,ALT_19,r_value,sample_size,portfolio_size
0,"[LLY, WEC, PGR, BKNG, BIIB, UAL, ICE, HSY, ABM...","[XEC, IDXX, PKG, NBL, SYK, LNC, BA, PLD, ZBH, ...","[INTU, K, PKI, SIVB, CHD, HCP, ADBE, SNA, D, C...",0.293906,0.218222,0.153516,0.0,10000,25
0,"[HCP, AGN, NVR, DGX, WMT, CTAS, MMM, VRTX, SHW...","[IEX, WMB, RL, PPL, KLAC, ALB, IPG, WYNN, FFIV...","[MMM, NKTR, MGM, AOS, GPS, PRU, VRTX, ABC, IFF...",0.242354,0.194175,0.497954,0.0,10000,25
0,"[RE, ABT, DRI, MO, LH, NDAQ, CHRW, KEY, AOS, A...","[ATVI, EXPE, AIG, M, DOV, COG, HST, XRX, DE, L...","[HAL, LH, UNP, CAT, FIS, TIF, HD, BAC, ECL, CS...",0.279445,0.290022,0.248544,0.0,10000,25
0,"[PRGO, K, LH, SNPS, AFL, PG, KO, JNJ, BK, AON,...","[DRE, ILMN, AIG, INTC, FE, HP, AMD, APD, ANSS,...","[JWN, INTC, MXIM, MAA, LMT, WU, MLM, COO, AJG,...",0.340542,0.377969,0.264863,0.0,10000,25
0,"[LOW, ROST, NFLX, DLTR, USB, BLL, DVA, GWW, XO...","[APH, IVZ, UNP, HIG, DE, PH, WHR, SCHW, CMI, D...","[PGR, SNPS, KEY, KMB, AEE, CMA, AAPL, CI, PVH,...",0.318161,0.306024,0.30052,0.0,10000,25


In [31]:
comparison_df = experiment_df[['CTL1_19', 'CTL2_19', 'ALT_19', 'r_value', 'sample_size', 'portfolio_size']]
comparison_df.head()

Unnamed: 0,CTL1_19,CTL2_19,ALT_19,r_value,sample_size,portfolio_size
0,0.293906,0.218222,0.153516,0.0,10000,25
0,0.242354,0.194175,0.497954,0.0,10000,25
0,0.279445,0.290022,0.248544,0.0,10000,25
0,0.340542,0.377969,0.264863,0.0,10000,25
0,0.318161,0.306024,0.30052,0.0,10000,25


In [39]:
alpha=0.05
z_critical = 1.645
hypothesis_df = pd.DataFrame([])

for r in r_thresholds:
    temp_dic = {}
    temp_df = comparison_df[comparison_df['r_value']==r]
    temp_ctl1 = temp_df['CTL1_19'] 
    temp_ctl2 = temp_df['CTL2_19'] 
    temp_alt = temp_df['ALT_19']

    ### Z - Test
    x_bar_alt = temp_alt.mean()
    x_bar_ctl1 = temp_ctl1.mean()
    x_bar_ctl2 = temp_ctl2.mean()
    std_error1 = np.sqrt(temp_alt.var()+temp_ctl1.var()-2*np.cov(temp_alt, temp_ctl1)[0][1])
    std_error2 = np.sqrt(temp_alt.var()+temp_ctl2.var()-2*np.cov(temp_alt, temp_ctl2)[0][1])    
    z_hat1 = (x_bar_alt-x_bar_ctl1-0)/std_error1
    z_hat2 = (x_bar_alt-x_bar_ctl2-0)/std_error2    
    ## Generate P-values
    p1 = 1-stats.norm.cdf(z_hat1)
    p2 = 1-stats.norm.cdf(z_hat2)
    
    ### Generate Upper/Lower Bounds
    t_lower1 = x_bar_alt-std_error1*z_critical
    t_lower2 = x_bar_alt-std_error2*z_critical    
    
    t_upper1 = x_bar_alt+std_error1*z_critical
    t_upper2 = x_bar_alt+std_error2*z_critical    

    ### Append Dataframe
    temp_dic['R-Threshold'] = r    
    temp_dic['x_bar_alt'] = x_bar_alt 
    temp_dic['lower1'] = t_lower1
    temp_dic['upper1'] = t_upper1
    temp_dic['P1A'] = p1
    temp_dic['H1A'] = ['Reject' if p1 < alpha else 'Accept']    
    temp_dic['lower2'] = t_lower2
    temp_dic['upper2'] = t_upper2
    temp_dic['P2A'] = p2
    temp_dic['H2A'] = ['Reject' if p2 < alpha else 'Accept']
    
    temp_df = pd.DataFrame(temp_dic)
    hypothesis_df = pd.concat([hypothesis_df, temp_df])

hypothesis_df.to_csv('plots\\h5a\\hypothesis_test_s10000_2.csv') 
hypothesis_df        

Unnamed: 0,R-Threshold,x_bar_alt,lower1,upper1,P1A,H1A,lower2,upper2,P2A,H2A
0,0.0,0.271457,0.1163,0.426613,0.650469,Accept,0.098538,0.444375,0.560774,Accept
0,0.05,0.272173,0.113749,0.430597,0.621812,Accept,0.096414,0.447933,0.561629,Accept
0,0.1,0.256119,0.097353,0.414886,0.677422,Accept,0.090635,0.421604,0.621661,Accept
0,0.15,0.25665,0.09905,0.41425,0.668998,Accept,0.090687,0.422614,0.62134,Accept
0,0.2,0.209276,0.067387,0.351165,0.859842,Accept,0.062263,0.356289,0.810406,Accept


In [33]:
plot_boxplots(comparison_df)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
def plot_boxplots(df):
    fig, ax = plt.subplots(figsize=(12,6))

    temp_df = df
    melted_df = pd.melt(temp_df,id_vars=['r_value', 'sample_size', 'portfolio_size'], value_vars=['CTL1_19', 'CTL2_19', 'ALT_19'],
                var_name = 'portfolio', value_name='returns')

    g = sns.catplot(ax=ax, data=melted_df, kind='box', x='r_value', y='returns', hue='portfolio', legend=True, legend_out=True)


    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')        

    #ax.set_title('Samples={}, P_Size={}'.format(sample_size, p_size), fontsize=20)
    ax.tick_params(axis="x", rotation=20)
    ax.axhline(return_sp19, color='red', alpha=0.7, label='SP500 2019')
    ax.axhline(0, color='black', alpha=0.85, linewidth=3)
#     ax.set_ylim(-0.05, 0.05)

    legend = ax.legend(loc='upper right', bbox_to_anchor=(1.1,1), title='Portfolio', fontsize=10)
    legend.get_title().set_fontsize('12')


    fig.suptitle('Alternative vs. Control Portfolios 2019\n{:,} Iterations'.format(n_samples[0]), fontsize=25)
    fig.text(0.5, 0.04, 'R Threshold', ha='center', va='center', fontsize=15)
    fig.text(0.06, 0.5, 'Portfolio Return', ha='center', va='center', rotation=90, fontsize=15)
    fig.subplots_adjust(hspace=0.5)

    #plt.tight_layout()
    plt.close()
    plt.show()