# Healthcare RSI Statistical significance
## Null Hypothesis 1 - The RSI of the heathcare fund, XLV, is the same as the RSI for the S&P 500
## Alternative Hypothesis 1 - The RSI of the heathcare fund, XLV, is different from the RSI for the S&P 500

In [3]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from statsmodels.stats.power import zt_ind_solve_power

import 'Will_key'
key_alphavantage = key
# '9HXBL1MHLXSUW3XP'
function = 'RSI'
interval = 'daily'


NameError: name 'key_alphavantage' is not defined

In [None]:
symbol1 = 'XLV'

response = requests.get(f'https://www.alphavantage.co/query?function={function}&symbol={symbol1}&interval={interval}&time_period=10&series_type=open&apikey={key_alphavantage}')
response = response.json()

In [None]:
symbol2 = '.INX'

response2 = requests.get(f'https://www.alphavantage.co/query?function={function}&symbol={symbol2}&interval=daily&time_period=10&series_type=open&apikey={key_alphavantage}')
response2 = response2.json()

In [None]:
xlv_dates= list(response[f'Technical Analysis: {function}'].keys())
xlv_dates = xlv_dates[1:]

In [None]:
sp500_dates = list(response2[f'Technical Analysis: {function}'].keys())
sp500_dates = sp500_dates[1:]

In [None]:
xlv_dates_rsi = {}

count = 0
for i in xlv_dates:
    rsi = response[f'Technical Analysis: {function}'][i][f'{function}']
    count += 1
    xlv_dates_rsi[f'{i}'] = float(rsi)

In [None]:
sp500_dates_rsi = {}

count2 = 0 
for i in sp500_dates:
    rsi2 = response2[f'Technical Analysis: {function}'][i][f'{function}']
    count2 += 1
    sp500_dates_rsi[f'{i}'] = float(rsi2)

In [None]:
xlvdf = pd.DataFrame(xlv_dates_rsi.items(), columns = ['date', f'{function}'])

In [None]:
sp500df = pd.DataFrame(sp500_dates_rsi.items(), columns = ['date', f'{function}'])

In [None]:
year = [int(xlv_dates[i].split('-')[0]) for i in range(len(xlv_dates))]
month = [int(xlv_dates[i].split('-')[1]) for i in range(len(xlv_dates))]
day = [int(xlv_dates[i].split('-')[2]) for i in range(len(xlv_dates))]

In [None]:
year2 = [int(sp500_dates[i].split('-')[0]) for i in range(len(sp500_dates))]
month2 = [int(sp500_dates[i].split('-')[1]) for i in range(len(sp500_dates))]
day2 = [int(sp500_dates[i].split('-')[2]) for i in range(len(sp500_dates))]

In [None]:
xlvdf['year'] = year
xlvdf['month'] = month
xlvdf['day'] = day

In [None]:
sp500df['year'] = year2
sp500df['month'] = month2
sp500df['day'] = day2

In [None]:
xlvdf = xlvdf.loc[xlvdf['year'] >= 2017]
sp500df = sp500df.loc[sp500df['year'] >= 2017]

## Comparing the normalized RSI data for XLV and S&P500 

### 2017 Through 2019 YTD
Taking samples and calculating the mean

In [None]:
xlv_sample_means = []
sp500_sample_means = []
for i in range(10000):
    xlv_sample = xlvdf.sample(n=50, random_state=i) 
    xlv_sample_means.append(xlv_sample.RSI.mean()) 
        
    sp500_sample = sp500df.sample(n=50, random_state=i) 
    sp500_sample_means.append(sp500_sample.RSI.mean()) 

### Evaluating z - score and statistical significance

In [None]:
xlv_bar = xlvdf.RSI.mean()  ### xlv population mean (subset)
sp500_bar = sp500df.RSI.mean() ### S&P500 population mean (universal population)

sp500_stan_dev = np.std(sp500df.RSI) ### S&P500 population standard dev
sp500_stan_err = sp500_stan_dev/np.sqrt(len(sp500df))
xlv_z_score = (xlv_bar - sp500_bar) / (sp500_stan_dev/np.sqrt(len(sp500df)))

print(f'The z_score is: {xlv_z_score}')
xlv_p_val = stats.norm.cdf(xlv_z_score)

print(f'The p-value is: {xlv_p_val}')
print(f'The standard deviation is {sp500_stan_dev}')
print(f'The standard error is {sp500_stan_err}')   ### Z_score * Standard error = real difference between sample and pop means

The p-value is extremely low >> .05, so we can reject our null-hypothesis that the RSI for the S&P500 and XLV are the same.

### Plotting Histogram Visual 

In [None]:
plt.hist(xlv_sample_means, bins=500, color = 'blue', label='XLV 2017-19');
plt.hist(sp500_sample_means, bins=500, color = 'red', label='S&P500 2017-19');
plt.title('RSI Histogram')
plt.legend();

## Cohen's D and Power - How significant is the Difference?

In [None]:
def cohen_d(xlv, sp500):

    diff_mean = xlv.mean() - sp500.mean()

    n_xlv = len(xlv)
    n_sp500 = len(sp500)
    var_xlv = xlv.var()
    var_sp500 = sp500.var()

    pooled_var = (n_xlv * var_xlv + n_sp500 * var_sp500) / (n_xlv + n_sp500)

    cod = diff_mean / np.sqrt(pooled_var)

    return cod

In [None]:
xlv_cod = cohen_d(xlvdf['RSI'], sp500df['RSI'])

In [None]:
n1_xlv = len(xlvdf['RSI'])
n2_sp500 = len(sp500df['RSI'])
ratio = n1_xlv/n2_sp500
xlv_power = zt_ind_solve_power(effect_size = xlv_cod, nobs1=n1_xlv, power = None, alpha = .05) 
print(f'Power= {xlv_power}\nCohen D= {xlv_cod}')

In [None]:
xlvdf['RSI'].mean() - sp500df['RSI'].mean()

It can be inferred that there is a difference, but since power is less than .8, it is less significant. This can be attributed to the small Cohen's D value, meaning there is a small effect on different populations

### Null Hypothesis 2 - The RSI of Johnson & Johnson, the largest holdings in the XLV, mirrors the RSI of the XLV
### Alternative Hypothesis 2 - The RSI of Johnson & Johnson, the largest holdings in the XLV, is different from the RSI of the XLV

In [None]:
symbol_list = ['JNJ'] 

In [None]:
all_list = []
for symbol in symbol_list:
    response = requests.get(f'https://www.alphavantage.co/query?function={function}&symbol={symbol}&interval={interval}&time_period=10&series_type=open&apikey={key_alphavantage}')
    jason = response.json()
    all_list.append(jason)

In [None]:
dictjnj = {}

len_jnj = len(all_list[0]['Technical Analysis: RSI'])
for i in range(len_jnj):
    date = list(all_list[0]['Technical Analysis: RSI'].keys())[i]
    rsi = float(list(all_list[0]['Technical Analysis: RSI'].values())[i]['RSI'])
    dictjnj[f'{date}'] = rsi
df_jnj = pd.DataFrame(dictjnj.items(), columns = ['date', 'rsi'])

year = []
for i in range(len_jnj):
    year.append(int(df_jnj['date'][i].split('-')[0]))

In [None]:
df_jnj['year'] = year
df_jnj.drop(0, inplace = True)

In [None]:
df_jnj = df_jnj.loc[df_jnj['year'] >= 2017] #2017 and newer data

### Taking samples, and evaluating the test statistics.

In [None]:
jnj_sample_means = []

for i in range(10000):
    jnj_sample = df_jnj.sample(n=50, random_state=i) 
    jnj_sample_means.append(jnj_sample.rsi.mean()) 

In [None]:
jnj_bar= df_jnj.rsi.mean()  ### johnson and johnson population mean (subset)

xlv_stan_dev = np.std(xlvdf.RSI)
xlv_stan_err = xlv_stan_dev/np.sqrt(len(xlvdf))

jnj_z_score = (jnj_bar - xlv_bar) / (xlv_stan_dev/np.sqrt(len(xlvdf)))
jnj_p_val = stats.norm.cdf(jnj_z_score)
p_valjnj = stats.norm.cdf(jnj_z_score)

print(f'The z_score is: {jnj_z_score}')
print(f'The p-value is: {jnj_p_val}')
print(f'The standard deviation is {xlv_stan_dev}')
print(f'The standard error is {xlv_stan_err}')

The p-value is much less than .05, so we can reject our null-hypothesis that Johnson & Johnson and XLV have the same RSI 

In [None]:
plt.hist(jnj_sample_means, bins=500, color='blue', label='JNJ 2017-2019');
plt.hist(xlv_sample_means, bins=500, color='red', label='XLV 2017-2019');
plt.title('RSI 2017 - 2019')
plt.legend();

In [None]:
print(jnj_z_score * xlv_stan_err)
print(jnj_bar - xlv_bar)

### Cohen's D for Johnson & Johnson and XLV - 2017-2019

In [None]:
codjnj = cohen_d(df_jnj['rsi'], xlvdf['RSI'])

In [None]:
n1_jnj = len(df_jnj['rsi'])
n2_xlv = len(xlvdf['RSI'])
ratio = n1_jnj/n2_xlv
jnj_power = zt_ind_solve_power(effect_size = codjnj, nobs1=n1_jnj, power = None, alpha = .05) 
print(f'Power= {jnj_power}\nCohen D= {codjnj}')

In contrast to the prior power analysis between XLV and S&P500, the power is greater than .8, in spite of the relativly small Cohen's D. Meaning that there is a decisive difference in the two groups, although the effect size is small.