# Simulação do intervalo de confiança do coeficiente de correlação ( $r_{pearson}$ )



[![](https://markdown-videos.deta.dev/youtube/gc4f-Wrx4zw)](https://youtu.be/gc4f-Wrx4zw)


***O vídeo está acelerado em 16 vezes.***

O vídeo acima demonstra o comportamento do coeficiente de correlação em relação ao tamanho da amostra e considerando o intervalo de confiança. Ele foi desenvolvido para ser utilizado em aulas para exemplificar o comportamento do coeficiente.


## Considerações

- Admite-se que os dados sejam normais, mesmo as amostras com tamanho amostral pequeno;
- Os dados são amostrados de um único conjunto de dados;
- Cada amostragem é aleatória e é sempre obtida do conjunto de dados original;
- Não é possível repetir exatamente os mesmos resultados devido à randomização;


> Com a configuração padrão, são gerados 4100 gráficos e 1 vídeo, que juntos ocupam quase 1 GB de espaço em disco


Para mais detalhes sobre os cálculos e o conjunto de dados, [loading...]().

## Imports

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import os
import cv2
from sklearn.model_selection import train_test_split

## Dataset

In [None]:
# Paired observations: nasal_length[k] and nasal_width[k] belong to the same
# individual and are passed together as (x, y) to the correlation routine.
# Each array holds 45 values.
# NOTE(review): measurement units and data source are not stated in this file — confirm.
nasal_length = np.array([609, 629, 620, 564, 645, 493, 606, 660, 630, 672, 778, 616, 727, 810, 778, 823, 755, 710, 701,
                         803, 855, 838, 830, 864, 635, 565, 562, 580, 596, 597, 636, 559, 615, 740, 677, 675, 629, 692,
                         710, 730, 763, 686, 717, 737, 816])

# Second variable of each pair, in the same order as nasal_length.
nasal_width = np.array([241, 222, 233, 207, 247, 189, 226, 240, 215, 231, 263, 220, 271, 284, 279, 272, 268, 278, 238,
                        255, 308, 281, 288, 306, 236, 204, 216, 225, 220, 219, 201, 213, 228, 234, 237, 217, 211, 238,
                        221, 281, 292, 251, 231, 275, 275])

## Options

### Plot

In [None]:
# Global Matplotlib defaults applied to every generated frame
plt.rcParams.update({"font.family": "Arial", "font.size": 11})

# Text shown inside the counter box drawn on each frame
box_text = "Aleatorização"

# Axis titles (mathtext between the $ signs)
x_label = "Tamanho amostral ($n$)"
y_label = "Coeficiente de correlação ($r_{pearson}$)"

### Graph and Video

In [None]:
# Name of the video file that will be generated (the extension is appended when the video is written)
video_name = "video-interval"

# Folder, relative to the working directory, where the figures are saved
directory = "r-interval-images"

### Other

In [None]:
# Level of significance used for the confidence interval
alpha = 0.05

# Number of iterations performed by varying the sample size --> must be int
N_FULL_ITERATIONS = 100

## Functions

In [None]:
def pearson_interval(x, y, alpha=0.05):
    """Return Pearson's r for ``x`` and ``y`` together with its confidence interval.

    The interval is built with the Fisher transformation: r is mapped to the
    z scale with ``arctanh``, a symmetric normal interval with standard error
    ``1 / sqrt(n - 3)`` is taken around it, and both bounds are mapped back
    with ``tanh``.

    Parameters
    ----------
    x, y : array_like
        Paired samples of equal length, with at least 4 observations.
    alpha : float, optional
        Significance level; the interval has confidence ``1 - alpha``.

    Returns
    -------
    ic_lower : float
        Lower bound of the confidence interval.
    r_pearson : float
        Sample Pearson correlation coefficient.
    ic_upper : float
        Upper bound of the confidence interval.
    alpha : float
        The significance level that was used (echoed back unchanged).

    Raises
    ------
    ValueError
        If fewer than 4 observations are supplied.
    """
    n_obs = len(x)
    # The standard error 1 / sqrt(n - 3) is undefined for n <= 3; fail loudly
    # instead of silently returning inf / nan bounds.
    if n_obs < 4:
        raise ValueError(
            f"pearson_interval needs at least 4 observations, got {n_obs}"
        )

    # two-sided critical value of the standard normal distribution
    z_critical = stats.norm.ppf(1 - alpha/2)
    r_pearson, _ = stats.pearsonr(x, y)

    # Fisher transformation of r and half-width of the interval on the z scale
    r_pearson_z_scale = np.arctanh(r_pearson)
    ic_z_scale = z_critical/np.sqrt(n_obs - 3)

    # back-transform the bounds to the correlation scale
    ic_lower = np.tanh(r_pearson_z_scale - ic_z_scale)
    ic_upper = np.tanh(r_pearson_z_scale + ic_z_scale)

    return ic_lower, r_pearson, ic_upper, alpha

## Setting up

In [None]:
# Absolute path of the folder that receives the generated figures.
current_directory = os.getcwd()
# os.path.join picks the separator of the running OS; a hard-coded "\\" only
# yields a real sub-folder on Windows.
path_images = os.path.join(current_directory, directory)
# exist_ok=True lets this cell be re-run without failing when the folder is already there.
os.makedirs(path_images, exist_ok=True)

## Creating the charts

In [None]:
# lists to append every lower, r and upper value (one inner list per finished iteration)
lower_full = []
upper_full = []
center_full = []

# running frame counter; it is also used as the file name of each figure,
# so the frames can later be assembled in creation order
aux = 0

# defining the minimum number of points to be used
n_min = 5

# creating a list with size varying between n_min and the size of the dataset
data = list(range(n_min, nasal_length.size + 1))

# iterating through j times the entire range of data
for j in range(N_FULL_ITERATIONS):
    # lists to append lower, r, and upper values for each j
    lower = []
    center = []
    upper = []

    # iterating on every n
    for i in data:
        aux = aux + 1

        # if i is equal to the total number of samples, do not sample and use all data;
        # otherwise separate a random sample with i observations
        if i == nasal_length.size:
            x_data, y_data = nasal_length, nasal_width
        else:
            x_data, _, y_data, _ = train_test_split(nasal_length, nasal_width, train_size=i)

        # the configured significance level is passed explicitly; the echoed
        # alpha is discarded so the global option is never overwritten
        ic_lower, r_pearson, ic_upper, _ = pearson_interval(x=x_data, y=y_data, alpha=alpha)

        # appending the results to the intermediate list
        lower.append(ic_lower)
        center.append(r_pearson)
        upper.append(ic_upper)

        ## plotting the graph
        # creating the canvas
        plt.figure(figsize=(16, 6))

        # plot the traces of the iterations already completed
        # (nothing is drawn while lower_full is still empty)
        for l, c, u in zip(lower_full, center_full, upper_full):
            x_done = range(n_min, n_min + len(l))
            # line
            plt.plot(x_done, c, color='k', alpha=0.05)
            # area
            plt.fill_between(x_done, l, u, alpha=0.05, color='r', edgecolor='none')

        first_frame = i == data[0]
        if first_frame:
            # a single point cannot be drawn as a line/area, so use scatter plots
            plt.scatter(i, center, marker=".", color='k', alpha=0.05, label="$r_{pearson}$")
            plt.scatter(i, upper, marker=".", color='r', alpha=0.05, label="Intervalo de confiança")
            plt.scatter(i, lower, marker=".", color='r', alpha=0.05)
        else:
            # sample sizes processed so far in the current iteration
            x_current = data[:len(center)]
            plt.fill_between(x_current, lower, upper, alpha=0.05, color='r', edgecolor='none', label="Intervalo de confiança")
            plt.plot(x_current, center, color='k', alpha=0.05, label="$r_{pearson}$")

        # some adjustments and export the plot
        plt.xticks(np.linspace(0, 50, 11))
        plt.xlim(0, 50)
        plt.ylim(-1.1, 1.1)
        plt.ylabel(y_label)
        plt.xlabel(x_label)
        plt.annotate(f"{box_text}: {aux}", xy=(30, -0.85), bbox=dict(boxstyle='square,pad=1', fc="w"), size=20)
        # save into the folder that was created in the setup step
        plt.savefig(os.path.join(path_images, f"{aux}.png"), dpi=100)

        # the first frame of every iteration is still displayed, as before
        if first_frame:
            plt.show()
        # always release the figure; otherwise open figures pile up in memory
        plt.close()

    # appending to the outer list
    lower_full.append(lower)
    upper_full.append(upper)
    center_full.append(center)

## Creating the video with the generated graphics

In [None]:
# creating an ordered list with file names: 1.png, 2.png, ..., <aux>.png
img_names = [f"{i}.png" for i in range(1, aux + 1)]

# reading the first image to get the frame size of the video
first_path = os.path.join(path_images, img_names[0])
frame = cv2.imread(first_path)
# cv2.imread returns None (it does not raise) when the file cannot be read
if frame is None:
    raise FileNotFoundError(f"Could not read image: {first_path}")
height, width = frame.shape[:2]

# creating the video instance (fourcc code 0, 2 frames per second)
video = cv2.VideoWriter(f'{video_name}.mp4', 0, 2, (width, height))

# making the video; every frame is read from the same folder as the first one
try:
    for image in img_names:
        image_path = os.path.join(path_images, image)
        frame = cv2.imread(image_path)
        if frame is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        video.write(frame)
finally:
    # Finishing: always release the writer so the file is closed even if a frame fails.
    # No cv2.destroyAllWindows() call is needed because no window is ever opened.
    video.release()