In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import shapiro, levene, ttest_ind
from statsmodels.graphics.tsaplots import plot_acf

In [None]:
# This function uses Shapiro-Wilk Test:
def check_normality_of_samples(samples, accepted_p_value=0.05):
    """Run the Shapiro-Wilk normality test on each sample.

    Parameters
    ----------
    samples : iterable of array-like
        The samples to test, one Shapiro-Wilk test per element.
    accepted_p_value : float, default 0.05
        Significance level used to decide whether normality is rejected.

    Returns
    -------
    list of dict
        One dict per sample, in input order, with the keys ``"stat"``
        (test statistic), ``"p"`` (p-value) and ``"passes_the_test"``
        (``True`` when normality is NOT rejected at ``accepted_p_value``).
    """
    results = []
    for sample in samples:
        stat, p = shapiro(sample)
        results.append({
            "stat": stat,
            "p": p,
            # The null hypothesis of Shapiro-Wilk is "the data are normally
            # distributed", so a small p-value is evidence AGAINST normality.
            # The sample passes only when the null is not rejected.
            "passes_the_test": bool(p > accepted_p_value),
        })
    return results

In [None]:
def check_equality_of_variances(samples, accepted_p_value=0.05):
    """Run Levene's test for equality of variances across the samples.

    Parameters
    ----------
    samples : sequence of array-like
        Two or more samples; all of them are passed to ``levene``.
    accepted_p_value : float, default 0.05
        Significance level used to decide whether equal variances is rejected.

    Returns
    -------
    dict
        ``"stat"`` (test statistic), ``"p"`` (p-value) and
        ``"passes_the_test"`` (``True`` when equality of variances is NOT
        rejected at ``accepted_p_value``).
    """
    # Unpack every sample so groups beyond the first two are not silently ignored.
    stat, p = levene(*samples)
    return {
        "stat": stat,
        "p": p,
        # The null hypothesis of Levene's test is "all variances are equal",
        # so a small p-value means the variances differ. The check passes
        # only when the null is not rejected.
        "passes_the_test": bool(p > accepted_p_value),
    }

In [None]:
def check_autocorrelation_between_samples(sample):
    """Draw the autocorrelation (ACF) plot of ``sample`` and display it.

    The series is handed to ``plot_acf`` and the current figure is then
    shown with ``plt.show()``. Nothing is returned; the plot is meant to be
    inspected visually.
    """
    plot_acf(sample)
    plt.show()

In [None]:
def ttest_ind_and_conditions(sample_1, sample_2, alternative='two-sided'):
    """Report the t-test assumption checks, then run an independent t-test.

    Runs the normality check on both samples and the equality-of-variances
    check on the pair, shows an autocorrelation plot, prints the
    ``"passes_the_test"`` flag reported by each check, and finally returns
    the result of ``scipy.stats.ttest_ind``.

    Parameters
    ----------
    sample_1, sample_2 : array-like
        The two independent samples to compare.
    alternative : str, default 'two-sided'
        Alternative hypothesis forwarded to ``ttest_ind``
        (for example 'two-sided', 'less' or 'greater').

    Returns
    -------
    The result object returned by ``ttest_ind(a=sample_1, b=sample_2, ...)``.
    """
    normality_of_samples_check = check_normality_of_samples([sample_1, sample_2])
    equality_of_variances_check = check_equality_of_variances([sample_1, sample_2])
    # NOTE(review): only sample_2 gets an autocorrelation plot; confirm
    # whether sample_1 should be inspected as well.
    check_autocorrelation_between_samples(sample_2)

    for index, sample_check in enumerate(normality_of_samples_check):
        print("Normality of sample {0}? {1}".format(index, sample_check["passes_the_test"]))
    print("Equality of variances? {0}".format(equality_of_variances_check["passes_the_test"]))

    # Forward the caller's choice; it was previously overridden by a
    # hard-coded "two-sided".
    return ttest_ind(
        a=sample_1,
        b=sample_2,
        alternative=alternative
    )


In [None]:
def t_interval(data, column_1, column_2, confidence=0.95):
    """Compute per-column Student-t confidence intervals for the mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    column_1, column_2 : label
        The two numeric columns to compute intervals for.
    confidence : float, default 0.95
        Confidence level of the interval.

    Returns
    -------
    tuple
        ``(lower, upper)`` as returned by ``st.t.interval``; each element
        holds one value per column, in the order ``column_1, column_2``.
    """
    values = data[[column_1, column_2]].to_numpy(dtype=float)
    # Reduce explicitly along axis 0. np.mean on a DataFrame is ambiguous
    # across pandas versions (per-column in older releases, one scalar over
    # the whole frame in newer ones), which would mismatch the per-column
    # standard error below.
    column_means = np.mean(values, axis=0)
    column_sems = st.sem(values, axis=0)
    return st.t.interval(
        confidence=confidence,
        df=len(values) - 1,
        loc=column_means,
        scale=column_sems,
    )

In [None]:
def mean_by_bootstrapping(data, quantity_of_samples=40, observations_by_sample=5, random_state=None):
    """Estimate the mean of ``data`` as the mean of bootstrap sample means.

    Parameters
    ----------
    data : array-like exposing ``.tolist()``
        Observations to resample (for example a numpy array or pandas Series).
    quantity_of_samples : int, default 40
        Number of bootstrap samples to draw.
    observations_by_sample : int, default 5
        Number of observations in each bootstrap sample.
    random_state : int or None, default None
        Seed for a dedicated generator. When ``None`` the module-level
        ``random`` generator is used, so ``random.seed(...)`` set by the
        caller is still honoured.

    Returns
    -------
    float
        The mean of the per-sample means.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    # Convert once; the conversion does not depend on the loop.
    population = data.tolist()
    if not population:
        raise ValueError("Cannot bootstrap from empty data")

    rng = random if random_state is None else random.Random(random_state)

    # Bootstrap resampling draws WITH replacement, so each sample may repeat
    # observations and may be larger than the original data.
    samples_mean = [
        np.mean(rng.choices(population, k=observations_by_sample))
        for _ in range(quantity_of_samples)
    ]
    return np.mean(samples_mean)

In [None]:
def systematic_sampling(data, step=3):
    """Return every ``step``-th row of ``data``, starting from the first row.

    Row positions ``0, step, 2*step, ...`` below ``len(data)`` are selected
    positionally with ``.iloc``.
    """
    row_count = len(data)
    positions = np.arange(0, row_count, step)
    return data.iloc[positions]

In [None]:
def stratified_sampling(data, column, stratified_column_values, stratified_column_values_proportions, random_state=None):
    """Resample ``data`` so each stratum has a chosen share of the rows.

    For every value in ``stratified_column_values`` the rows where
    ``data[column]`` equals that value are sampled with replacement. The
    result has the same number of rows as ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows.
    column : label
        Column holding the stratum of each row.
    stratified_column_values : sequence
        The stratum values to sample, in output order.
    stratified_column_values_proportions : sequence of float
        Desired share of ``len(data)`` for each stratum. The entry for the
        last stratum is not read: the last stratum receives whatever rows
        are needed to reach ``len(data)``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every stratum.

    Returns
    -------
    pandas.DataFrame
        The per-stratum samples concatenated in stratum order, keeping the
        column order and dtypes of ``data``. An empty frame with the columns
        of ``data`` is returned when no stratum values are given.
    """
    total_rows = len(data)
    last_position = len(stratified_column_values) - 1
    stratum_samples = []
    rows_allocated = 0

    for position, stratum_value in enumerate(stratified_column_values):
        if position == last_position:
            # The final stratum absorbs the rounding remainder so the result
            # has exactly as many rows as the input.
            stratum_size = total_rows - rows_allocated
        else:
            # Number of rows for this stratum according to its proportion.
            stratum_size = int(total_rows * stratified_column_values_proportions[position])

        stratum_rows = data[data[column] == stratum_value]
        stratum_samples.append(
            stratum_rows.sample(replace=True, n=stratum_size, random_state=random_state)
        )
        rows_allocated += stratum_size

    if not stratum_samples:
        return data.iloc[0:0].copy()

    # Concatenate once. Growing a frame inside the loop from an empty seed
    # frame is quadratic and degrades column order and dtypes.
    return pd.concat(stratum_samples)