<a href="https://colab.research.google.com/github/andersonmdcanteli/STAT-101-page/blob/main/STAT101_data_maker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
!pip install pycafee

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from pycafee.functions.functions import multimode
from pycafee.normalitycheck import ShapiroWilk

## Dataset

In [4]:
# Load the 'iris' example dataset through seaborn; the cells below filter this frame by species.
iris = sns.load_dataset('iris')

In [5]:
# Preview the first two rows to check the column names used in the cells below.
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [6]:
# Save the unmodified dataset as CSV; index=False leaves the row index out of the file.
iris.to_csv("iris_dataset.csv", index=False)

## Computing central tendency and dispersion statistics for each sample size n

In [7]:
def get_ic(n, std, alfa):
  """Return the half-width of a two-sided Student-t confidence interval.

  Computes ``t * std / sqrt(n)``, where ``t`` is the Student-t quantile at
  probability ``1 - alfa/2`` with ``n - 1`` degrees of freedom.

  Parameters
  ----------
  n : sample size (the callers in this notebook only pass n >= 2).
  std : sample standard deviation.
  alfa : significance level, e.g. 0.05 for a 95% interval.
  """
  degrees_of_freedom = n - 1
  upper_quantile = 1 - alfa/2
  critical_t = stats.t.ppf(upper_quantile, degrees_of_freedom)
  return critical_t*std/np.sqrt(n)

In [8]:
def make_data(df, name, name_list, species_name, species_list, n_list, media_list, median_list, mode_list, std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list):
  """Append summary statistics of every leading sub-sample of ``df`` to the accumulator lists.

  For each n from 1 to ``df.shape[0]`` the first n observations are summarised
  and exactly one entry is appended to every ``*_list`` argument (the lists are
  mutated in place). Statistics are stored as strings; "-" marks a statistic
  that is not computed for that n.

  Parameters
  ----------
  df : one-dimensional pandas object (the callers pass one DataFrame column).
  name : label appended to ``name_list`` for every row.
  species_name : label appended to ``species_list`` for every row.
  n_list : receives the sub-sample size n.
  media_list, median_list : mean and median, rounded to 3 decimals.
  mode_list : for n == 1 the rounded mean; otherwise the keys returned by
    ``multimode`` joined with " & " (empty string when there are no keys).
  std_list : sample standard deviation (ddof=1), 3 decimals; "-" for n == 1.
  ic_90_list, ic_95_list, ic_99_list : ``get_ic`` results for alfa = 0.1, 0.05
    and 0.01, 3 decimals; "-" for n == 1.
  cv_list : ``100 * std / mean``, 2 decimals; "-" for n == 1.
  norm_list : "Sim" or "Não" from the Shapiro-Wilk check; "-" for n <= 2.

  Returns
  -------
  tuple
    ``(name_list, species_list, n_list, media_list, median_list, mode_list,
    std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list)`` —
    the same list objects that were passed in.
  """
  df_filtered = df.copy()
  for n in range(1, df_filtered.shape[0] + 1):
    species_list.append(species_name)
    name_list.append(name)
    n_list.append(n)
    data = df_filtered[:n].to_numpy()

    media = data.mean()
    media_list.append(str(round(media, 3)))
    median_list.append(str(round(np.median(data), 3)))

    if n == 1:
      # With a single observation the value itself is reported as the mode.
      mode_list.append(str(round(media, 3)))
    else:
      # multimode returns a mapping whose keys are the modal values. Joining
      # avoids the trailing space that trimming the " & " separator with
      # [:-2] used to leave behind.
      mode = multimode(data)
      mode_list.append(" & ".join(str(key) for key in mode.keys()))

    if n > 1:
      std = data.std(ddof=1)
      std_list.append(str(round(std, 3)))
      ic_90_list.append(str(round(get_ic(n, std, 0.1), 3)))
      ic_95_list.append(str(round(get_ic(n, std, 0.05), 3)))
      ic_99_list.append(str(round(get_ic(n, std, 0.01), 3)))
      cv_list.append(str(round(100*std/media, 2)))
    else:
      # Dispersion measures are not computed for one observation.
      std_list.append("-")
      ic_90_list.append("-")
      ic_95_list.append("-")
      ic_99_list.append("-")
      cv_list.append("-")

    if n > 2:
      norm_test = ShapiroWilk()
      _, conclusion = norm_test.fit(data, details='binary')
      # NOTE(review): a conclusion of 0 is reported as "Sim" (yes); presumably 0
      # means normality was not rejected — confirm against the pycafee docs.
      norm_list.append("Sim" if conclusion == 0 else "Não")
    else:
      # The normality check is only run from three observations onwards.
      norm_list.append("-")

  return name_list, species_list, n_list, media_list, median_list, mode_list, std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list




In [9]:
# Accumulators filled by every make_data call below: one entry per
# (species, feature, sample size) in each list.
species_list, name_list, n_list = [], [], []
media_list, median_list, mode_list = [], [], []
std_list, cv_list, norm_list = [], [], []
ic_90_list, ic_95_list, ic_99_list = [], [], []

### Setosa

In [10]:
# Select the setosa rows; df_filter and species_name are reused by the next three cells.
species_name = "setosa"
name = 'sepal_length'
df_filter = iris.loc[iris['species'] == species_name].copy()
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [11]:
# Append sepal_width statistics for the species currently held in df_filter / species_name.
name = 'sepal_width'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [12]:
# Append petal_length statistics for the species currently held in df_filter / species_name.
name = 'petal_length'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [13]:
# Append petal_width statistics for the species currently held in df_filter / species_name.
name = 'petal_width'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)



### Virginica

In [14]:
# Select the virginica rows; df_filter and species_name are reused by the next three cells.
species_name = "virginica"
name = 'sepal_length'
df_filter = iris.loc[iris['species'] == species_name].copy()
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [15]:
# Append sepal_width statistics for the species currently held in df_filter / species_name.
name = 'sepal_width'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [16]:
# Append petal_length statistics for the species currently held in df_filter / species_name.
name = 'petal_length'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [17]:
# Append petal_width statistics for the species currently held in df_filter / species_name.
name = 'petal_width'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

### Versicolor

In [18]:
# Select the versicolor rows; df_filter and species_name are reused by the next three cells.
species_name = "versicolor"
name = 'sepal_length'
df_filter = iris.loc[iris['species'] == species_name].copy()
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [19]:
# Append sepal_width statistics for the species currently held in df_filter / species_name.
name = 'sepal_width'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [20]:
# Append petal_length statistics for the species currently held in df_filter / species_name.
name = 'petal_length'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [21]:
# Append petal_width statistics for the species currently held in df_filter / species_name.
name = 'petal_width'
df_data = df_filter.loc[:, name].copy()
(name_list, species_list, n_list, media_list, median_list, mode_list,
 std_list, ic_90_list, ic_95_list, ic_99_list, cv_list, norm_list) = make_data(
    df=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list, n_list=n_list,
    media_list=media_list, median_list=median_list, mode_list=mode_list,
    std_list=std_list, ic_90_list=ic_90_list, ic_95_list=ic_95_list,
    ic_99_list=ic_99_list, cv_list=cv_list, norm_list=norm_list,
)

In [22]:
# Assemble the accumulators into one table: one row per (species, feature, sample size).
# The scalar "iris" is broadcast to every row of the data_set column.
df = pd.DataFrame(
    data={
        "species": species_list,
        "size": n_list,
        "Média": media_list,
        "Mediana": median_list,
        "Moda": mode_list,
        "Std": std_list,
        "IC (90%)": ic_90_list,
        "IC (95%)": ic_95_list,
        "IC (99%)": ic_99_list,
        "CV (%)": cv_list,
        "Normalidade": norm_list,
        "kind": name_list,
        "data_set": "iris",
    }
)
df.head(10)

Unnamed: 0,species,size,Média,Mediana,Moda,Std,IC (90%),IC (95%),IC (99%),CV (%),Normalidade,kind,data_set
0,setosa,1,5.1,5.1,5.1,-,-,-,-,-,-,sepal_length,iris
1,setosa,2,5.0,5.0,,0.141,0.631,1.271,6.366,2.83,-,sepal_length,iris
2,setosa,3,4.9,4.9,,0.2,0.337,0.497,1.146,4.08,Sim,sepal_length,iris
3,setosa,4,4.825,4.8,,0.222,0.261,0.353,0.648,4.6,Sim,sepal_length,iris
4,setosa,5,4.86,4.9,,0.207,0.198,0.257,0.427,4.27,Sim,sepal_length,iris
5,setosa,6,4.95,4.95,,0.288,0.237,0.302,0.474,5.82,Sim,sepal_length,iris
6,setosa,7,4.9,4.9,4.6,0.294,0.216,0.272,0.413,6.01,Sim,sepal_length,iris
7,setosa,8,4.912,4.95,4.6 & 5.0,0.275,0.184,0.23,0.34,5.59,Sim,sepal_length,iris
8,setosa,9,4.856,4.9,4.6 & 5.0,0.309,0.191,0.237,0.345,6.36,Sim,sepal_length,iris
9,setosa,10,4.86,4.9,4.9 & 4.6 & 5.0,0.291,0.169,0.208,0.299,5.99,Sim,sepal_length,iris


In [23]:
# Save the summary table as CSV; index=False leaves the row index out of the file.
df.to_csv("iris_dataset_summary.csv", index=False)

## Animation dataset

In [24]:
def make_animation_dataset(df_data, name, name_list, species_name, species_list, size_list, valores_list, medida_list, type_list):
  """Append long-format rows (one per measure) for every leading sub-sample of ``df_data``.

  For each n from 1 to ``df_data.shape[0]`` the first n observations are
  summarised and seven rows are appended, always in this order:
  "Média", "Mediana", "Desvio Padrão", "IC (90%)", "IC (95%)", "IC (99%)",
  "CV (%)". A row is one entry appended to each of the six accumulator lists,
  which are mutated in place.

  Parameters
  ----------
  df_data : one-dimensional pandas object (the callers pass one DataFrame column).
  name : label appended to ``name_list`` for every row.
  species_name : label appended to ``species_list`` for every row.
  size_list : receives the sub-sample size n.
  valores_list : receives the numeric value of the measure (unrounded).
  medida_list : receives the measure label.
  type_list : receives "central" (mean, median), "dispersao" (standard
    deviation and the three IC rows) or "cv".

  Returns
  -------
  tuple
    ``(name_list, species_list, size_list, valores_list, medida_list,
    type_list)`` — the same list objects that were passed in.
  """
  df = df_data.copy()

  def add_row(n, medida, valor, tipo):
    # One output row = one entry appended to each of the six accumulators.
    valores_list.append(valor)
    medida_list.append(medida)
    size_list.append(n)
    name_list.append(name)
    species_list.append(species_name)
    type_list.append(tipo)

  for n in range(1, df.shape[0]+1):
    data = df[:n].to_numpy()

    media = data.mean()
    add_row(n, "Média", media, "central")
    add_row(n, "Mediana", np.median(data), "central")

    if n == 1:
      # Measures that have no value for n = 1 are recorded as 0 so that every
      # sample size contributes the same seven rows.
      std = ic_90 = ic_95 = ic_99 = cv = 0
    else:
      std = data.std(ddof=1)
      ic_90 = get_ic(n, std, 0.1)
      ic_95 = get_ic(n, std, 0.05)
      ic_99 = get_ic(n, std, 0.01)
      cv = 100*std/media

    add_row(n, "Desvio Padrão", std, "dispersao")
    add_row(n, "IC (90%)", ic_90, "dispersao")
    add_row(n, "IC (95%)", ic_95, "dispersao")
    add_row(n, "IC (99%)", ic_99, "dispersao")
    add_row(n, "CV (%)", cv, "cv")

  return name_list, species_list, size_list, valores_list, medida_list, type_list


In [25]:
# Fresh accumulators for the animation dataset. name_list and species_list are
# rebound to new empty lists here, so the summary lists built earlier are no
# longer reachable under those names.
name_list, species_list, size_list = [], [], []
valores_list, medida_list, type_list = [], [], []

### Setosa

In [26]:
# Select the setosa rows; df_aux and species_name are reused by the next three cells.
species_name = 'setosa'
name = 'sepal_length'
df_aux = iris.loc[iris['species'] == species_name].copy()
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [27]:
# Append sepal_width rows for the species currently held in df_aux / species_name.
name = 'sepal_width'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [28]:
# Append petal_length rows for the species currently held in df_aux / species_name.
name = 'petal_length'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [29]:
# Append petal_width rows for the species currently held in df_aux / species_name.
name = 'petal_width'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

### Virginica

In [30]:
# Select the virginica rows; df_aux and species_name are reused by the next three cells.
species_name = 'virginica'
name = 'sepal_length'
df_aux = iris.loc[iris['species'] == species_name].copy()
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [31]:
# Append sepal_width rows for the species currently held in df_aux / species_name.
name = 'sepal_width'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [32]:
# Append petal_length rows for the species currently held in df_aux / species_name.
name = 'petal_length'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [33]:
# Append petal_width rows for the species currently held in df_aux / species_name.
name = 'petal_width'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

### Versicolor

In [34]:
# Select the versicolor rows; df_aux and species_name are reused by the next three cells.
species_name = 'versicolor'
name = 'sepal_length'
df_aux = iris.loc[iris['species'] == species_name].copy()
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [35]:
# Append sepal_width rows for the species currently held in df_aux / species_name.
name = 'sepal_width'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [36]:
# Append petal_length rows for the species currently held in df_aux / species_name.
name = 'petal_length'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [37]:
# Append petal_width rows for the species currently held in df_aux / species_name.
name = 'petal_width'
df_data = df_aux.loc[:, name].copy()
(name_list, species_list, size_list,
 valores_list, medida_list, type_list) = make_animation_dataset(
    df_data=df_data, name=name, name_list=name_list,
    species_name=species_name, species_list=species_list,
    size_list=size_list, valores_list=valores_list,
    medida_list=medida_list, type_list=type_list,
)

In [39]:
# Assemble the animation accumulators into a long-format table: one row per
# (species, feature, sample size, measure). The scalar "iris" is broadcast to every row.
new_df = pd.DataFrame(
    data={
        'Tamanho amostral': size_list,
        'kind': name_list,
        'Medidas': medida_list,
        "Valores": valores_list,
        "data_set": "iris",
        'species': species_list,
        "type": type_list,
    }
)
new_df.head()


Unnamed: 0,Tamanho amostral,kind,Medidas,Valores,data_set,species,type
0,1,sepal_length,Média,5.1,iris,setosa,central
1,1,sepal_length,Mediana,5.1,iris,setosa,central
2,1,sepal_length,Desvio Padrão,0.0,iris,setosa,dispersao
3,1,sepal_length,IC (90%),0.0,iris,setosa,dispersao
4,1,sepal_length,IC (95%),0.0,iris,setosa,dispersao


In [40]:
# Save the animation table as CSV; index=False leaves the row index out of the file.
new_df.to_csv("iris_animation_dataset.csv", index=False)