Assumptions correction

In [None]:
# Count the observations lying strictly above the mean.
# The boolean mask is built once and reused by both variants below.
above_mean = data > mean
m = np.sum(above_mean).values[0]
print('Number of points above the mean, m = %d' % m) 

# Same count through count(): entries outside the mask become NaN and
# count() only tallies the non-NaN ones.
# NOTE(review): unlike the first variant, m is left as the object returned
# by count() (no .values[0]); a later cell extracts the scalar itself.
m = data[above_mean].count()
print('Number of points above the mean, m = %d' % m) 

In [None]:
# Deviations of every observation from the mean, flattened to a 1-D array
new_series = np.array(data - mean).flatten()

# Classify each point as "above the mean" or "not above the mean", using the
# same strict '>' that is used when counting m. Working on this two-level
# indicator (instead of np.sign, which returns 0 for points exactly equal to
# the mean) prevents such points from being counted as extra sign changes.
above = new_series > 0

# A new run starts whenever two consecutive points fall in different classes;
# the +1 accounts for the first run.
runs = np.sum(above[1:] != above[:-1]) + 1
print('Number of runs = %d' % runs) #number of runs

In [None]:
# Expected number of runs:  E[R] = 2*m*(n - m)/n + 1
# m is the count of points above the mean; n is defined outside this cell
# (presumably the total number of observations — confirm).
exp_runs = 1 + 2*m*(n - m)/n
print('Expected number of runs = %.3f' % exp_runs)

In [None]:
# Standard deviation of the number of runs:
#   Var[R] = 2m(n-m) * (2m(n-m) - n) / (n^2 * (n-1))
two_m_rest = 2*m*(n - m)
var_runs = two_m_rest*(two_m_rest - n)/((n**2)*(n - 1))
std_runs = np.sqrt(var_runs)
print('Standard deviation of runs = %.03f' % std_runs)

# 95% interval of a normal distribution centred on the expected number of
# runs with the standard deviation computed above
conf_int = stats.norm.interval(0.95, loc=exp_runs, scale=std_runs)
lower_bound, upper_bound = conf_int[0], conf_int[1]
print('Confidence interval: (%.3f, %.3f)' % (lower_bound, upper_bound))

In [None]:
# Runs test (normal approximation). H0: data is random

alpha = 0.05 # significance level

# Test statistic: standardized difference between observed and expected runs.
# The upstream quantities may be plain scalars or one-element pandas/NumPy
# objects depending on how they were computed; np.ravel accepts both, so the
# single value is extracted the same way in either case.
z0 = (runs-exp_runs)/std_runs
z0 = float(np.ravel(z0)[0])
print('z0 = %f' % z0)

# Two-sided critical value of the standard normal
z_alfa2 = stats.norm.ppf(1-alpha/2)
print('z_alfa2 = %f' % z_alfa2)

if abs(z0) > z_alfa2:
  print('The null hypothesis is rejected')
else:
  print('The null hypothesis is accepted')


# Two-tailed test: the p-value is twice the upper-tail probability of |z0|.
# sf(x) equals 1 - cdf(x) but avoids the cancellation that turns very small
# tail probabilities into exactly 0.
p_value = 2 * stats.norm.sf(abs(z0))
print('p-value = %.3f' % p_value)

In [None]:
# Same runs test through the library routine, applied to column 'Ex1'
# with correction=False. The call returns the pair
# (test statistic, p-value), unpacked below.
runs_test_result = runstest_1samp(data['Ex1'], correction=False)
stat, pval_runs = runs_test_result
print('Runs test statistic = {:.3f}'.format(stat))
print('Runs test p-value = {:.3f}'.format(pval_runs))

In [None]:
# Autocorrelation function (ACF) plot for column 'Ex2'.
# The number of lags shown is one third of the number of rows (truncated);
# zero=False is passed so that lag 0 is left out of the plot.
max_lag = int(len(data)/3)
sgt.plot_acf(data['Ex2'], lags=max_lag, zero=False)
plt.show()

In [None]:
# Box-Cox transformation
# NOTE(review): the column name '' is an empty placeholder — replace it with
# the column to transform before running this cell.

# Called without lmbda, boxcox returns the transformed values together with
# the lambda it selected.
data_norm, lmbda = stats.boxcox(data[''])
# To impose a chosen lambda instead, pass it explicitly:
#data_norm = stats.boxcox(data[''], lmbda = )

print('Lambda = %.3f' % lmbda)

# Histogram of the transformed values as a quick visual check
plt.hist(data_norm)
plt.title('Histogram of Box-Cox transformed data')
plt.show()

In [None]:
#Bartlett's test at lag 1
alpha = 0.05
lag_test = 1

# Sample autocorrelation at the tested lag (acf_values is computed outside
# this cell; index 0 is presumably lag 0 — confirm).
rk = acf_values[lag_test]

# Two-sided critical value: reject when |rk| exceeds z_{alpha/2} / sqrt(n)
z_alpha2 = stats.norm.ppf(1-alpha/2)
critical_rk = z_alpha2/np.sqrt(n)
print('Test statistic rk = %f' % rk)
print('Rejection region starts at %f' % critical_rk)

# The quantile above is two-sided, so the comparison must use the absolute
# value; otherwise a strong negative autocorrelation would never be rejected.
if abs(rk) > critical_rk:
    print('The null hypothesis is rejected')
else:
    print('The null hypothesis is accepted')

In [None]:
lag_test = 6  # example value only

# How many lags to test? Rule of thumb: L < sqrt(n)

# Ljung-Box statistic for the chosen lag.
# NOTE(review): lbq is built outside this cell; position lag_test-1 presumably
# holds the statistic for lag `lag_test` (first entry = lag 1) — confirm.
Q0_LBQ = lbq[lag_test - 1]
print('Q0_LBQ = %f' % Q0_LBQ)

# Critical value: quantile of order 1-alpha of a chi-square distribution whose
# degrees of freedom equal the number of lags tested
dof = lag_test
chi2_alfa = stats.chi2.ppf(1 - alpha, dof)
print('Rejection region starts at %f' % chi2_alfa)

reject_h0 = Q0_LBQ > chi2_alfa
print('The null hypothesis is rejected' if reject_h0 else 'The null hypothesis is accepted')

# p-value: probability of a chi-square value larger than the observed statistic
pval = 1 - stats.chi2.cdf(Q0_LBQ, dof)
print('p-value = %f' % pval)

In [None]:
# Ljung-Box (LBQ) test through statsmodels
from statsmodels.stats.diagnostic import acorr_ljungbox

# Only the lag of interest is requested; the result is a DataFrame that is
# looked up below by lag, with columns 'lb_stat' and 'lb_pvalue'.
lbq_test = acorr_ljungbox(data_norm, lags=[lag_test], return_df=True)
lbq_row = lbq_test.loc[lag_test]
print('LBQ test statistic at lag %d = %f' % (lag_test, lbq_row['lb_stat']))
print('LBQ test p-value at lag %d = %f' % (lag_test, lbq_row['lb_pvalue']))

Gapping

In [None]:
#Gapping
gap_size = 6 # this is just an example, you can try different gapping intervals

# One point is kept at positions 0, gap_size, 2*gap_size, ...; rounding up
# keeps the last such point when len(data) is not a multiple of gap_size,
# which is what the slice data['Ex3'][::gap_size] returns as well.
gap_num = int(np.ceil(len(data)/gap_size))

gap_data = np.zeros(gap_num)
for i in range(gap_num):
    # Step by gap_size (not a hard-coded 6) and index by position, so the
    # loop follows the chosen interval and does not depend on index labels.
    gap_data[i] = data['Ex3'].iloc[i*gap_size]

In [None]:
# Built-in alternative: a positional slice with step gap_size keeps
# rows 0, gap_size, 2*gap_size, ... of column 'Ex3'
gap_data = data['Ex3'].iloc[::gap_size]

Batching

In [None]:
# Batching: replace each group of batch_size consecutive observations
# of column 'Ex3' with its average
batch_size = 6
batch_num = len(data) // batch_size  # complete batches only

batch_data = np.zeros(batch_num)
j = 0  # position of the first observation of the current batch
for i in range(batch_num):
    window = data['Ex3'][j:j + batch_size]
    batch_data[i] = window.sum() / batch_size
    j += batch_size

In [None]:
# Alternative method
# Create a new column in the dataframe with the corresponding batch number
# (1, 2, ...). The label is derived from the row position, so the assignment
# always has exactly len(data) entries, even when len(data) is not a multiple
# of batch_size (np.repeat would raise a length mismatch in that case).
data['Batch'] = np.arange(len(data)) // batch_size + 1

# Store the batch means in a new dataframe. Rows belonging to a trailing,
# incomplete batch (label > batch_num) are excluded so that only complete
# batches are averaged, as in the loop-based version.
batch_data = data[data['Batch'] <= batch_num].groupby('Batch').mean()

Prediction interval

In [None]:
# Normal data, want interval on a single future observation
alpha = 0.05
df = len(data) - 1          # degrees of freedom of the sample standard deviation
Xbar = data.mean()
s = data.std()
t_alpha = stats.t.ppf(1 - alpha/2, df)

# A future observation differs from the sample mean with variance
# s^2 * (1 + 1/n): the "1" is the variability of the new point and the "1/n"
# the uncertainty of the estimated mean. Leaving the factor out makes the
# interval too narrow, especially for small samples.
half_width = t_alpha*s*np.sqrt(1 + 1/len(data))

[pred_lo, pred_up] = [Xbar - half_width, Xbar + half_width]
print('Two-sided prediction interval for transformed data: [%.3f %.3f]' % (pred_lo, pred_up))

In [None]:
# If the data were transformed, the interval limits must be mapped back to
# the original scale.
# Inverse Box-Cox: x = (y*lambda + 1)**(1/lambda) for lambda != 0, and
# x = exp(y) for lambda == 0 (the transformation is then the natural log).
# The lambda == 0 case is handled explicitly to avoid dividing by zero.
if lmbda == 0:
    [pred_lo_ORIG, pred_up_ORIG] = [np.exp(pred_lo), np.exp(pred_up)]
else:
    [pred_lo_ORIG, pred_up_ORIG] = [(pred_lo*lmbda+1)**(1/lmbda),(pred_up*lmbda+1)**(1/lmbda)]
print('Two-sided prediction interval for original data: [%.3f %.3f]' % (pred_lo_ORIG, pred_up_ORIG))

### Models

Confidence intervals on regression coefficients

In [None]:
# Confidence Intervals on regression coefficients
# Point estimate of the coefficient of regressor 'lag1'
beta1 = model.params['lag1']
print('The estimated coefficient beta1 is %.3f' % beta1)

# Its standard error
se_beta1 = model.bse['lag1']
print('The standard error of the estimated coefficient beta1 is %.3f' % se_beta1)

alpha = 0.05
n = len(data)  # kept because other cells read n

# The t quantile must use the residual degrees of freedom of the fitted model
# (observations actually used minus estimated parameters). Taking them from
# the model, instead of len(data) - 2, stays correct when rows were dropped
# (e.g. NaN created by lagging) or when the model has more than one regressor,
# and agrees with model.conf_int().
t_alpha2 = stats.t.ppf(1-alpha/2, model.df_resid)

CI_beta1 = [beta1 - t_alpha2*se_beta1, beta1 + t_alpha2*se_beta1]

print('The confidence interval for beta1 is [%.3f, %.3f]' % (CI_beta1[0], CI_beta1[1]))

In [None]:
# Same interval straight from the fitted model: conf_int() gives a table of
# bounds that is looked up by coefficient name; the 'lag1' row is kept.
ci_table = model.conf_int(alpha=0.05)
CI_beta1 = ci_table.loc['lag1']
print('The confidence interval for beta1 is [%.3f, %.3f]' % (CI_beta1[0], CI_beta1[1]))

Prediction intervals

In [None]:
# Quantities shared by the confidence / prediction interval formulas
Xbar = data['lag1'].mean()          # sample mean of the regressor
S2_X = data['lag1'].var()           # sample variance of the regressor

p = len(model.model.exog_names)     # number of model terms listed in exog_names
S2_Y = np.var(model.resid, ddof=p)  # residual variance, divisor = (number of residuals - p)

alpha = 0.05
# Degrees of freedom are read from the fitted model rather than computed as
# n - 2: this removes the dependency on an n defined in another cell and is
# consistent with the generic p used for the residual variance above.
t_alpha = stats.t.ppf(1-alpha/2, model.df_resid)

In [None]:
# Predict the next outcome with the regression model.
# The most recent observation of 'Ex4' is the regressor value for the next step.
last_lag = data['Ex4'].iloc[-1]
print('X_35 = %.3f' % last_lag)

# Predict the next value; the input row is [constant, regressor].
# predict() returns an array-like with a single element: it is reduced to a
# plain float so that '%' formatting (here and in later cells) does not rely
# on NumPy's deprecated implicit conversion of size-1 arrays to scalars.
Yhat = float(np.ravel(model.predict([1,last_lag]))[0])
print('Next process outcome = %.3f' % Yhat)

In [None]:
# Confidence interval on the mean response at x0 = last_lag.
# Standard error of the fitted mean: sqrt(S2_Y * (1/n + (x0 - Xbar)^2 / Sxx)),
# with Sxx written as (n-1)*S2_X.
se_mean = np.sqrt(S2_Y*(1/n + ((last_lag - Xbar)**2)/((n-1)*S2_X)))
CI = [Yhat - t_alpha*se_mean, Yhat + t_alpha*se_mean]
print('The confidence interval for the mean response is [%.3f, %.3f]' % (CI[0], CI[1]))

In [None]:
# PREDICTION interval for the next observation at x0 = last_lag.
# Compared with the interval on the mean response, the extra "1 +" term adds
# the variability of the new observation itself.
se_pred = np.sqrt(S2_Y*(1 + 1/n + ((last_lag - Xbar)**2)/((n-1)*S2_X)))
PI = [Yhat - t_alpha*se_pred, Yhat + t_alpha*se_pred]

print('The prediction interval for the next value is [%.5f, %.5f]' % (PI[0], PI[1]))

In [None]:
# Scatter plot with fitted regression line (red, dashed) and its 95% band.
# regplot returns the Axes it drew on, which is then decorated directly.
ax = sns.regplot(
    x='lag1',
    y='value',
    data=data,
    fit_reg=True,
    ci=95,
    line_kws={'color': 'red', 'lw': 2, 'ls': '--'},
)
ax.set_title('Scatter plot of X(t) vs X(t-1)')
ax.set_xlabel('X(t-1)')
ax.set_ylabel('X(t)')
ax.grid()

In [None]:
# Same prediction through the model: summary_frame(alpha=0.05) is requested
# for a single new point and the resulting table is printed.
new_point = [1, last_lag]  # [constant, regressor value]
prediction_df = model.get_prediction(new_point).summary_frame(alpha=0.05)
print(prediction_df)

In [None]:
# Evaluation grid: 100 equally spaced values spanning the observed range of
# the regressor
x_range = np.linspace(data['lag1'].min(), data['lag1'].max(), 100)

# Add the constant column expected by the model; the regressor ends up in
# column 1 (used for the x axis below)
x_range = sm.add_constant(x_range)

# Fitted values and intervals (alpha = 0.05) for each grid point
prediction_df = model.get_prediction(x_range).summary_frame(alpha=0.05)

# Plot the data, the fitted line and both bands. Every element gets a label
# and the legend is drawn, so the two shaded regions can be told apart.
plt.plot(data['lag1'], data['Ex4'], 'o', color='blue', label='Original data')
plt.plot(x_range[:,1], prediction_df['mean'], '--', color='red', label='Fitted values')
plt.fill_between(x_range[:,1], prediction_df['obs_ci_lower'], prediction_df['obs_ci_upper'],
                 color='green', alpha=0.2, label='95% prediction interval')
plt.fill_between(x_range[:,1], prediction_df['mean_ci_lower'], prediction_df['mean_ci_upper'],
                 color='red', alpha=0.2, label='95% confidence interval (mean response)')
plt.title('Scatter plot of X(t) vs X(t-1)')
plt.xlabel('X(t-1)')
plt.ylabel('X(t)')
plt.legend()
plt.show()

Model

In [None]:
# Stepwise regression: terms may be both added and removed
# (direction='both'), with a 0.15 threshold for entering and for removal;
# a constant term is included.
stepwise = qda.StepwiseRegression(
    add_constant=True,
    direction='both',
    alpha_to_enter=0.15,
    alpha_to_remove=0.15,
)
# Fit the model on response y and candidate regressors X
model = stepwise.fit(y, X)