In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import scipy.stats as sts
import psycopg2 as pg2

from sqlalchemy import create_engine

plt.style.use('ggplot')
%matplotlib inline

In [2]:
# Set db variables
user = 'anthony'
pw = 'pw'
host = 'localhost:5432'
db_name = 'indicator_tests'

# Create engine for interacting with db
engine = create_engine(f'postgresql+psycopg2://{user}:{pw}@{host}/{db_name}')

### Get NASDAQ ETF list :


In [3]:
nas_etf = pd.read_csv('../data/NAS_ETF_List.csv')

In [4]:
nas_etf = nas_etf['Symbol'].to_list()

In [5]:
# nas_etf[:10]

### SMA Entries

In [6]:
sma_nas = pd.read_sql_query(f'''SELECT ticker, sma_win
                                FROM sma_nasdaq
                                WHERE sma_buy = 1''',
                            con=engine)

In [14]:
# Note: some tickers are None
# sma_nas.info()

In [8]:
sma_nas.dropna(inplace=True)

In [9]:
# sma_nas.info()

In [10]:
# Remove ETFs if exist
sma_nas = sma_nas[~sma_nas.ticker.isin(nas_etf)]

In [11]:
# sma_nas.info()

In [16]:
obs = sma_nas.sma_win.count()
print(f'Total number of observations of SMA entries: {obs}')

Total number of observations of SMA entries: 2976


In [17]:
wins = sma_nas.sma_win.value_counts()[1]
losses = sma_nas.sma_win.value_counts()[0]
sma_mean = sma_nas.loc[:,"sma_win"].mean()
std = sma_nas.loc[:,"sma_win"].std()
sma_se = std / np.sqrt(wins+losses)
print(f'wins: {wins}\nlosses: {losses}\nmean: {sma_mean}\nstd: {std}\nse: {sma_se}')

wins: 859
losses: 2117
mean: 0.28864247311827956
std: 0.4532074730674296
se: 0.008307696053932029


In [15]:
# x_min = 0.26
# x_max = 0.325

# mean = sma_mean 
# std = sma_se

# x = np.linspace(x_min, x_max, wins+losses)

# y = sts.norm.pdf(x,mean,std)

# plt.plot(x,y, color='coral')

# plt.xlim(x_min,x_max)
# plt.ylim(0,50)

# plt.title('SMA Win Distribution',fontsize=22)

# plt.xlabel('Win Percentages')
# plt.ylabel('Observations')

### Random Entries

In [18]:
rnd_nas = pd.read_sql_query(f'''SELECT ticker, rnd_win
                                FROM sma_nasdaq
                                WHERE rnd_buy = 1''',
                            con=engine)

In [19]:
# rnd_nas.info()

In [20]:
# Remove ETFs if exist
rnd_nas = rnd_nas[~rnd_nas.ticker.isin(nas_etf)]

In [21]:
# rnd_nas.info()

In [23]:
obs = rnd_nas.rnd_win.count()
print(f'Total number of observations of Random entries: {obs}')

Total number of observations of Random entries: 2930


In [24]:
wins = rnd_nas.rnd_win.value_counts()[1]
losses = rnd_nas.rnd_win.value_counts()[0]
rnd_mean = rnd_nas.loc[:,"rnd_win"].mean()
std = rnd_nas.loc[:,"rnd_win"].std()
rnd_se = std / np.sqrt(wins+losses)
print(f'wins: {wins}\nlosses: {losses}\nmean: {rnd_mean}\nstd: {std}\nse: {rnd_se}')

wins: 827
losses: 2103
mean: 0.28225255972696245
std: 0.45017243125041223
se: 0.008316586040225485


In [22]:
# x_min = 0.25
# x_max = 0.315

# mean = rnd_mean 
# std = rnd_se

# x = np.linspace(x_min, x_max, wins+losses)

# y = sts.norm.pdf(x,mean,std)

# plt.plot(x,y, color='coral')

# plt.xlim(x_min,x_max)
# plt.ylim(0,50)

# plt.title('Random Win Distribution',fontsize=22)

# plt.xlabel('Win Percentages')
# plt.ylabel('Observations')

# Calculate Statistics

In [27]:
rnd_data = rnd_nas.rnd_win
sma_data = sma_nas.sma_win

t_stat, p_value = sts.ttest_ind(rnd_data, sma_data, equal_var=False)

print(f't_stat: {t_stat}\np_value: {p_value}')

t_stat: -0.543584428915917
p_value: 0.5867480026815965


In [24]:
# #Studnt, n=999, p<0.05%, Single tail
num = 3000
alpha = 0.05
t_val = sts.t.ppf(1-alpha, num)
print(f't_val: {t_val}')

t_val: 1.6453617078374079


In [28]:
alpha, beta = (.05, .05)
mu_a = rnd_data.mean()
mu_b = sma_data.mean()
std = np.sqrt(mu_a*(1-mu_a))

#ppf - percent point function, inverse of cdf
z_alpha = sts.norm.ppf(1-alpha)
z_beta = -1*sts.norm.ppf(beta)
print(f'z_alpha: {z_alpha}\nz_beta: {z_beta}')

n = np.power(((z_alpha+z_beta)*std)/(mu_a - mu_b),2)
print(f'The number of samples needed to reject the null hypothesis with \nalpha={alpha} and beta={beta} is: \n{int(n)} samples')


z_alpha: 1.6448536269514722
z_beta: 1.6448536269514729
The number of samples needed to reject the null hypothesis with 
alpha=0.05 and beta=0.05 is: 
53695 samples


In [25]:
def compute_power(null, alt, alpha):
    mu0 = null.mean()
    se0 = null.std() / np.sqrt(len(null))
    
    mua = alt.mean()
    sea = alt.std() / np.sqrt(len(alt))
    
    h0 = sts.norm(mu0, se0)
    ha = sts.norm(mua, sea)
    
    critical_value = h0.ppf(1 - alpha)
    power = 1 - ha.cdf(critical_value)
    
    return power

In [30]:
compute_power(rnd_data, sma_data, 0.05)

0.19011899503025287

In [34]:
def standard_deviation_difference_in_proportions(n1, p1, n2, p2):
    p = (n1*p1 + n2*p2) / (n1 + n2)
    return np.sqrt(p*(1-p))

In [35]:
def calc_minimum_sample_size(control_data, treatment_data, alpha, effect_size, power):
    """Calculate the minimum sample size for a z-test needed to achieve a given power.
    
    Parameters
    ----------
    
    control_data: array
      One dimensional array containing data from control group.
    
    treatment_data: array
      One dimensional array containing data from treatment group.
    
    alpha: float
      Desired significance level.  Must be between zero and one.
    
    effect_size: float
      Desired effect size to detect.
    
    power: float
      Desired power.  Must be between zero and one.
    
    Returns
    -------
    
    sample_size: int
      Minimum sample size to achieve a desired power.
    """
    standard_normal = sts.norm(0, 1)
    beta = 1 - power
    
    mu0 = control_data.mean()
    mua = treatment_data.mean()
    sigma = standard_deviation_difference_in_proportions(
        len(control_data), mu0, len(treatment_data), mua)
    
    numerator = sigma * (standard_normal.ppf(1 - alpha) - standard_normal.ppf(beta))
    denominator = mua - mu0
    return math.ceil((numerator / denominator) ** 2)

In [38]:
mu_diff = abs(rnd_data.mean()-sma_data.mean())
power = 0.95
alpha = 0.05

minsize = calc_minimum_sample_size(rnd_data, sma_data, alpha, mu_diff, power)

print(f'Minimum sample size needed to achieve {power} power and {mu_diff:.3f} effect size, with {alpha} alpha: {minsize}')

Minimum sample size needed to achieve 0.95 power and 0.006 effect size, with 0.05 alpha: 54064
