# Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from tabulate import tabulate

from statsmodels.sandbox.stats.runs import runstest_1samp
import statsmodels.graphics.tsaplots as sgt
from statsmodels.tsa.stattools import acf
import statsmodels.api as sm
import statsmodels.tsa.api as tsa

import qda

import os
import re
from scipy import ndimage
import math
import random
import cv2
import skimage
from scipy import stats
import pylab

from PIL import Image, ImageEnhance
from skimage.measure import label, regionprops
from skimage.io import imsave
from skimage.filters import threshold_otsu

from sklearn.decomposition import PCA
from statsmodels.tsa.stattools import adfuller, kpss

def check_gaussianity(data_points, title="", plotto=1):
    """Report a Shapiro-Wilk normality check, optionally with diagnostic plots.

    Args:
        data_points: Sample to test. A pandas Series (or subclass) has its
            missing values dropped and is converted to a NumPy array first;
            any other input is passed to the plotting/test calls unchanged.
        title: Label used as the figure title and as the prefix of the
            printed report.
        plotto: When equal to 1, draw a three-panel figure (normal
            probability plot, histogram with KDE, run-order plot).

    Returns:
        None. The p-value and the 5% verdict are printed.
    """
    # isinstance instead of comparing str(type(...)): the string comparison
    # silently skipped the NaN drop for Series subclasses.
    if isinstance(data_points, pd.Series):
        data_points = data_points.dropna().values
    if plotto == 1:
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 3, 1)
        stats.probplot(data_points, plot=plt)
        plt.subplot(1, 3, 2)
        sns.histplot(data_points, kde=True, stat="density", linewidth=0)
        plt.subplot(1, 3, 3)
        plt.plot(data_points, 'o-')
        plt.suptitle(f"{title}")
        plt.show()
    _, shapiro_pvalue = stats.shapiro(data_points)
    print(f"({title}) Normality p-value = {shapiro_pvalue} - Normality at alpha 5%? {shapiro_pvalue>0.05}")
    print('H0 is "data are gaussian" => conclusion is good if pvalue is high')

def check_randomness(data_points, title="", plotto=1, layout="h"):
    """Report a runs test and an ADF test, optionally with ACF/PACF plots.

    Args:
        data_points: Ordered sample to analyse.
        title: Prefix used in the printed report lines.
        plotto: When equal to 1, draw three panels (ACF, PACF, run-order plot).
        layout: "v" stacks the panels vertically; any other value places
            them side by side.

    Returns:
        None. Both p-values and their 5% verdicts are printed.
    """
    if plotto == 1:
        # Pick the subplot grid and the figure size from the layout.
        if layout == "v":
            grid, size = (3, 1), (8, 12)
        else:
            grid, size = (1, 3), (12, 4)
        fig, ax = plt.subplots(*grid, figsize=size)
        acf_ax, pacf_ax, series_ax = ax

        # Both correlograms show lags up to one third of the sample length.
        sgt.plot_acf(data_points, lags=int(len(data_points) / 3), zero=False, ax=acf_ax)
        fig.subplots_adjust(hspace=0.3)
        sgt.plot_pacf(data_points, lags=int(len(data_points) / 3), zero=False, ax=pacf_ax, method='ywm')
        series_ax.plot(data_points, 'o-')
        plt.show()

    # Runs test: a high p-value means randomness is not rejected.
    runs_outcome = runstest_1samp(data_points, correction=False)
    pval_runs = runs_outcome[1]
    print(f'({title}) Runs test p-value = {pval_runs} - Random data at alpha 5%? {pval_runs>0.05}')
    print('H0 is "data are random" => conclusion is good if pvalue is high')

    # Augmented Dickey-Fuller test: here a LOW p-value is the good outcome.
    adf_pvalue = adfuller(data_points)[1]
    print(f"({title}) ADF test p-value = {adf_pvalue} - Stationarity at alpha 5%? {adf_pvalue <= 0.05}")
    print('H0 is "data has a unit root (non-stationary)" => conclusion is good if p-value is low')

def check_assumptions(data_points, title="", plotto=1):
    """Run the normality check followed by the randomness/stationarity check.

    Args:
        data_points: Sample forwarded to both checks.
        title: Label forwarded to both checks.
        plotto: Plot switch forwarded to both checks (1 enables the figures).
    """
    for run_check in (check_gaussianity, check_randomness):
        run_check(data_points, title=title, plotto=plotto)

def check_residuals(model):
    """Draw a 2x2 residual diagnostic figure and print the assumption tests.

    Panels: normal probability plot, residuals vs fitted values, histogram
    with KDE, and residuals in observation order. Afterwards the randomness
    checks (with their own plots) and the normality test (text only) are run
    on the residuals.

    Args:
        model: Fitted model object exposing ``resid`` and ``fittedvalues``
            (presumably a statsmodels results object; verify against callers).
    """
    residuals = model.resid
    fig, axs = plt.subplots(2, 2, figsize=(12, 7))
    fig.suptitle('Residual Plots')

    stats.probplot(residuals, dist="norm", plot=axs[0, 0])
    axs[0, 0].set_title('Normal Probability Plot')

    axs[0, 1].scatter(model.fittedvalues, residuals)
    axs[0, 1].set_xlabel('Fitted Values')
    axs[0, 1].set_ylabel('Residuals')
    axs[0, 1].set_title('Residuals vs Fitted Values')

    # Draw on the explicit axes like the other panels instead of going
    # through the stateful plt.subplot(2, 2, 3) call.
    sns.histplot(residuals, kde=True, stat="density", linewidth=0, ax=axs[1, 0])
    axs[1, 0].set_title('Histogram with KDE')

    axs[1, 1].plot(np.arange(1, len(residuals) + 1), residuals, 'o-')
    axs[1, 1].set_title('Residuals Over Time')

    fig.tight_layout()
    # Leave room for the suptitle after tight_layout has packed the panels.
    fig.subplots_adjust(top=0.88, hspace=0.3)

    # The Shapiro-Wilk test is performed (and reported) by check_gaussianity,
    # so it is not computed separately here.
    check_randomness(residuals, title="Residuals")
    check_gaussianity(residuals, title="Residuals", plotto=0)

def get_ooc(df):
    """Print the out-of-control (OOC) samples flagged in the "TEST" columns.

    A row counts as flagged by a column when that column's entry is non-null.
    Only columns whose label is a string containing "TEST" are inspected.

    Args:
        df: DataFrame whose "TEST" columns hold a non-null value at flagged
            rows. The printed sample number is ``index + 1``, so index labels
            are assumed to be numeric.

    Returns:
        None. One line is printed per flagged index, listing the columns
        that flagged it, in order of first appearance.
    """
    # Non-string labels (e.g. integer column names) cannot contain "TEST";
    # skipping them avoids a TypeError from the `in` check.
    test_columns = [
        col for col in df.columns if isinstance(col, str) and "TEST" in col
    ]

    # Map each flagged index label to the list of columns flagging it.
    flagged = {}
    for col in test_columns:
        column = df[col]
        for idx in column.index[column.notnull()].tolist():
            flagged.setdefault(idx, []).append(col)

    for idx, cols in flagged.items():
        print(f'Index {idx} (ie sample {idx+1}) is ooc from column {cols}')

# Function for batching
def batch_data(data, k):
    """Average consecutive, non-overlapping batches of ``k`` observations.

    A trailing group with fewer than ``k`` observations is discarded.

    Args:
        data: Sliceable sequence whose slices provide ``.mean()``.
        k: Batch size.

    Returns:
        DataFrame with a single column ``'x'`` holding one mean per batch.
    """
    # The last admissible start is len(data) - k, so only full batches remain.
    batch_starts = range(0, len(data) - k + 1, k)
    batch_means = [data[start:start + k].mean() for start in batch_starts]
    return pd.DataFrame(batch_means, columns=['x'])

# Function for gapping
def gap_data(data, k):
    """Keep every ``k``-th observation, starting from the first one.

    Args:
        data: Sliceable sequence of observations.
        k: Step between retained observations.

    Returns:
        DataFrame with a single column ``'x'``. For a pandas Series input the
        original index labels of the retained observations are preserved.
    """
    data_gapped = data[::k]
    if isinstance(data_gapped, pd.Series):
        # DataFrame(series, columns=['x']) selects by column label when the
        # Series has a name, so a Series named anything but 'x' would lose
        # all its values. to_frame relabels the column explicitly instead.
        return data_gapped.to_frame(name='x')
    return pd.DataFrame(data_gapped, columns=['x'])

# Ex1

## A

## B

## C

## D

## E

# Ex2

## A

## B

## C

## D

## E