# 1. Entrada de dados

In [125]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, datetime

from glob import glob
import math
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

#from sklearn.ensemble import IsolationForest

#import matplotlib as mpl
#import seaborn as sns
#import plotly.express as px
#import matplotlib.image as mpimg

#import statsmodels.api as sm
#from scipy.stats import norm
import pylab

In [126]:
# Display configuration for pandas output in this notebook.
# Option names are spelled out in full: pandas resolves abbreviated names by
# pattern matching, which can break if a later release adds a similarly named option.
pd.set_option('display.float_format', '{:.2f}'.format)  # render floats with 2 decimals
pd.set_option('display.max_columns', 30)  # show up to 30 columns before truncating
# Optional tweaks, currently disabled:
#pd.set_option('display.max_rows', 30)
#pd.set_option('display.width', 100)

In [127]:
# Read every station CSV and stack them into a single DataFrame.
#
# INMET stations:
#   1 - Paraná - TO (A010)
#   2 - Imperatriz - MA (A225)
#   3 - Bom Jesus do Piauí - PI (A326)
#   4 - Luíz Eduardo Magalhães - BA (A404)
#   5 - Bauru - SP (A705)
#   6 - Casa Branca - SP (A738)
#
# Period: 01/01/2008 to 31/12/2022 (15 years).
# Tmax:  degrees Celsius (°C).
# Tmin:  degrees Celsius (°C).
# URmax: relative humidity (%).
# URmin: relative humidity (%).
# u:     wind speed in metres per second (m/s), measured at 10 m height.
# Rad:   global solar radiation.
#        NOTE(review): the original note read "Kilojoules por hora (MJ/h)", which
#        names two different units — confirm the real unit against the INMET files.
#        NOTE(review): the rendered output shows Rad values such as -3.54 next to
#        -3539.00; possibly a decimal/thousands-separator issue in the CSVs — TODO confirm.

files = sorted(glob(r'./Dados_Estacoes/*.csv'))
if not files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" when the folder is missing or empty.
    raise FileNotFoundError("No CSV files found matching './Dados_Estacoes/*.csv'")

# ignore_index=True produces one continuous index across all concatenated files.
full_files = pd.concat(
    (pd.read_csv(csv_path, delimiter=';') for csv_path in files),
    ignore_index=True,
)
# Alternatives for a shorter preview: full_files.head(5) / full_files.tail(5)
full_files

Unnamed: 0,Data,Hora,Prec,Rad,Tmax,Tmin,Urmax,Urmin,u
0,2008-01-01,0,0.00,-3.54,27.00,26.10,79.00,74.00,1.80
1,2008-01-01,100,0.00,-3539.00,26.40,25.50,82.00,78.00,1.60
2,2008-01-01,200,0.00,-3.54,25.70,25.30,82.00,82.00,1.40
3,2008-01-01,300,0.00,-3.54,25.40,25.20,83.00,82.00,1.30
4,2008-01-01,400,0.00,-3.54,25.30,24.80,84.00,82.00,0.60
...,...,...,...,...,...,...,...,...,...
1969483,2022-12-31,1900,0.20,577756.00,26.90,25.50,65.00,59.00,1.60
1969484,2022-12-31,2000,0.40,524284.00,26.80,26.40,65.00,60.00,0.50
1969485,2022-12-31,2100,0.20,122064.00,26.40,24.50,76.00,65.00,0.60
1969486,2022-12-31,2200,0.20,26847.00,24.90,23.10,81.00,71.00,0.10


# 2. Medidas descritivas dos dados

In [128]:
#### Getting to know the data ####

# Other quick inspections that can be swapped in for the last line:
# full_files.info()
# full_files.shape # rows x columns (1,969,488, 9)
# type(full_files) # object type
# full_files.index # (start=0, stop=1,969,488, step=1)
full_files.columns # column names

Index(['Data', 'Hora', 'Prec', 'Rad', 'Tmax', 'Tmin', 'Urmax', 'Urmin', 'u'], dtype='object')

In [129]:
#### Missing values per column (absolute count) ####

# isna() marks each cell True (counts as 1) when missing and False (0) otherwise,
# so summing down the rows gives the number of missing entries in each column.
full_files.isna().sum(axis=0)

Data          0
Hora          0
Prec     341002
Rad      253062
Tmax     242336
Tmin     242334
Urmax    248941
Urmin    249148
u        272799
dtype: int64

In [130]:
#### A column showing "True" contains at least one missing value ####

# Reduce the missing-value mask down the rows: one boolean per column.
full_files.isna().any(axis=0)

Data     False
Hora     False
Prec      True
Rad       True
Tmax      True
Tmin      True
Urmax     True
Urmin     True
u         True
dtype: bool

In [131]:
#### Percentage of missing values per column, largest first ####

# missing count / number of rows * 100, then ordered from most to least incomplete.
(
    full_files.isna()
    .sum()
    .div(len(full_files))
    .mul(100)
    .sort_values(ascending=False)
)

Prec    17.31
u       13.85
Rad     12.85
Urmin   12.65
Urmax   12.64
Tmax    12.30
Tmin    12.30
Data     0.00
Hora     0.00
dtype: float64

In [132]:
#### Number of distinct values in each column ####

# E.g. Hora = 24, one value per hour of the day.
# sort_values() sorts in ascending order by default.
full_files.nunique().sort_values()

Hora         24
Urmax        94
Urmin        94
u           118
Prec        280
Tmin        393
Tmax        399
Data       5479
Rad      774440
dtype: int64

In [133]:
#### Summary statistics ####

# 25%: first quartile
# 50%: second quartile (median)
# 75%: third quartile
# .T puts one variable per row and one statistic per column.
full_files.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Hora,1969488.0,1150.0,692.22,0.0,575.0,1150.0,1725.0,2300.0
Prec,1628486.0,0.13,1.23,0.0,0.0,0.0,0.0,83.2
Rad,1716426.0,726622.68,1083173.66,-9809.0,-3.54,1536.03,1363076.25,6462592.0
Tmax,1727152.0,26.12,5.26,1.0,22.6,25.5,30.1,42.0
Tmin,1727154.0,24.69,4.96,0.4,21.6,24.2,28.1,40.7
Urmax,1720547.0,68.64,21.93,7.0,52.0,73.0,88.0,100.0
Urmin,1720340.0,62.71,22.77,7.0,44.0,65.0,83.0,100.0
u,1696689.0,1.38,1.35,0.0,0.1,1.1,2.2,16.1


In [134]:
# Reference (Gantt chart in Python): https://medium.com/geekculture/create-an-advanced-gantt-chart-in-python-f2608a1fd6cc

# 3. Pré-processamento (Limpeza dos dados)

In [135]:
full_files.head(3) # first rows
# full_files.tail(3) # last rows

Unnamed: 0,Data,Hora,Prec,Rad,Tmax,Tmin,Urmax,Urmin,u
0,2008-01-01,0,0.0,-3.54,27.0,26.1,79.0,74.0,1.8
1,2008-01-01,100,0.0,-3539.0,26.4,25.5,82.0,78.0,1.6
2,2008-01-01,200,0.0,-3.54,25.7,25.3,82.0,82.0,1.4


In [136]:
# Convert 'Data' from ISO text (YYYY-MM-DD) to day-first text (DD/MM/YYYY).
# strftime returns strings, so the column remains text rather than datetime64.
#
# The guard keeps the cell safe to re-run: once the column is already in
# DD/MM/YYYY form, parsing it again with format='%Y-%m-%d' would raise ValueError.
# na=False makes missing entries count as "not converted", so they still go
# through the strict parse below exactly as before.
already_converted = full_files['Data'].str.fullmatch(r'\d{2}/\d{2}/\d{4}', na=False).all()
if not already_converted:
    full_files['Data'] = (
        pd.to_datetime(full_files['Data'], format='%Y-%m-%d')
        .dt.strftime('%d/%m/%Y')
    )
full_files.head(3)

Unnamed: 0,Data,Hora,Prec,Rad,Tmax,Tmin,Urmax,Urmin,u
0,01/01/2008,0,0.0,-3.54,27.0,26.1,79.0,74.0,1.8
1,01/01/2008,100,0.0,-3539.0,26.4,25.5,82.0,78.0,1.6
2,01/01/2008,200,0.0,-3.54,25.7,25.3,82.0,82.0,1.4
