## Etapa: 04. Balanceamento

#### Código criado por: Eduardo Arthur Bitencourt  | Data: 02/08/2023
##### Github: https://github.com/bitencourt-eduardo  | Linkedin: https://www.linkedin.com/in/bitencourt-eduardo/

In [47]:
#!pip install matplotlib
#!pip install imbalanced-learn

In [48]:
#Importar Bibliotecas

import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import BorderlineSMOTE

#from imblearn.under_sampling import TomekLinks
#from imblearn.over_sampling import ADASYN
#from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

#### Carregando arquivos FieldPRO

Sensor_FieldPRO, dados transmitidos pelo sensor

In [49]:
# Read the normalized sensor dataset from the shared datasets folder
name_file = 'df_normalized.csv'
diretorio = '../datasets/{name_file}'

df = pd.read_csv(
    diretorio.format(name_file=name_file),
    decimal='.',
    sep=';',
)
df.head()

Unnamed: 0,air_humidity_100,atm_pressure_main,num_of_resets,piezo_charge,piezo_temperature,chuva
0,0.290698,0.394904,0.0,0.579894,0.527778,0.0
1,0.302326,0.43949,0.0,0.575778,0.555556,0.0
2,0.302326,0.43949,0.0,0.57376,0.555556,0.0
3,0.290698,0.420382,0.0,0.571764,0.555556,0.0
4,0.302326,0.388535,0.0,0.569788,0.527778,0.0


In [50]:
# Summarize the column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1525 entries, 0 to 1524
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   air_humidity_100   1525 non-null   float64
 1   atm_pressure_main  1525 non-null   float64
 2   num_of_resets      1525 non-null   float64
 3   piezo_charge       1525 non-null   float64
 4   piezo_temperature  1525 non-null   float64
 5   chuva              1525 non-null   float64
dtypes: float64(6)
memory usage: 71.6 KB


___________________________________________________
Estatísticas descritivas dos dados (o arquivo carregado não possui campo de data/time-stamp a corrigir)

In [51]:
# Descriptive statistics for every numeric column of the loaded frame
df.describe()

Unnamed: 0,air_humidity_100,atm_pressure_main,num_of_resets,piezo_charge,piezo_temperature,chuva
count,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0
mean,0.531445,0.525342,0.498361,0.560748,0.389672,0.005558
std,0.255045,0.173707,0.346967,0.260398,0.236744,0.049362
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.313953,0.414013,0.333333,0.365375,0.194444,0.0
50%,0.534884,0.528662,0.333333,0.55612,0.305556,0.0
75%,0.755814,0.643312,0.666667,0.783175,0.583333,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


_____________________________________________________________________________________________________________________________

# Verificando Balanceamento

In [52]:
# Tally each distinct "chuva" value and express it as a percentage of all rows
contagem_classes = df["chuva"].value_counts()
proporcao_classes = contagem_classes.div(len(df)).mul(100)
pd.DataFrame(
    {
        "Contagem": contagem_classes,
        "Proporção": proporcao_classes,
    }
)

Unnamed: 0_level_0,Contagem,Proporção
chuva,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1470,96.393443
0.011905,14,0.918033
0.035714,5,0.327869
0.02381,4,0.262295
0.047619,4,0.262295
0.071429,3,0.196721
0.547619,2,0.131148
0.095238,2,0.131148
0.130952,2,0.131148
0.404762,2,0.131148


In [53]:
# Number of distinct "chuva" values (this count includes the 0.0 value)
len(contagem_classes)

27

Observado:
* 96% dos dados são de Não chuva
* < 4% dos dados estão distribuídos nas outras 26 classes (27 valores distintos no total, incluindo 0.0)

Conclui-se que os dados estão desbalanceados; sugere-se aplicar as alternativas abaixo e testar qual delas produz o melhor modelo.

1. Oversampling
2. Undersampling
3. Geração de Dados Sintéticos


# Realizando Oversampling

In [54]:
# Split the feature columns from the target variable ("chuva")
X = df.drop(columns=["chuva"])
y = df["chuva"]

# Discretize "chuva" into 100 equal-width bins; labels=False returns the integer bin index.
# NOTE(review): the recorded outputs show only 27 of these bins are occupied,
# which matches the 27 distinct "chuva" values counted earlier.
num_classes_chuva = 100
y_discretized = pd.cut(y, bins=num_classes_chuva, labels=False)

# Build the random oversampler with a fixed seed
oversampler = RandomOverSampler(sampling_strategy="auto", random_state=42)

# Oversample the whole frame (no train/test split is performed in the visible cells)
X_oversampled, y_oversampled = oversampler.fit_resample(X, y_discretized)

# Assemble the resampled features and target into a single frame.
# NOTE(review): "chuva" holds the integer bin index here; it is NOT mapped back to a continuous value.
df_oversampling = pd.DataFrame(X_oversampled, columns=X.columns)
df_oversampling["chuva"] = y_oversampled

In [55]:
# Preview the first rows of the oversampled frame ("chuva" is now an integer bin index)
df_oversampling.head()

Unnamed: 0,air_humidity_100,atm_pressure_main,num_of_resets,piezo_charge,piezo_temperature,chuva
0,0.290698,0.394904,0.0,0.579894,0.527778,0
1,0.302326,0.43949,0.0,0.575778,0.555556,0
2,0.302326,0.43949,0.0,0.57376,0.555556,0
3,0.290698,0.420382,0.0,0.571764,0.555556,0
4,0.302326,0.388535,0.0,0.569788,0.527778,0


In [56]:
# Summarize the row count and column dtypes of the oversampled frame
df_oversampling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39690 entries, 0 to 39689
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   air_humidity_100   39690 non-null  float64
 1   atm_pressure_main  39690 non-null  float64
 2   num_of_resets      39690 non-null  float64
 3   piezo_charge       39690 non-null  float64
 4   piezo_temperature  39690 non-null  float64
 5   chuva              39690 non-null  int64  
dtypes: float64(5), int64(1)
memory usage: 1.8 MB


In [57]:
# Tally each "chuva" bin in the oversampled frame and its percentage of all rows
contagem_classes = df_oversampling["chuva"].value_counts()
proporcao_classes = contagem_classes.div(len(df_oversampling)).mul(100)
pd.DataFrame(
    {
        "Contagem": contagem_classes,
        "Proporção": proporcao_classes,
    }
)

Unnamed: 0_level_0,Contagem,Proporção
chuva,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1470,3.703704
16,1470,3.703704
90,1470,3.703704
5,1470,3.703704
27,1470,3.703704
22,1470,3.703704
36,1470,3.703704
40,1470,3.703704
11,1470,3.703704
20,1470,3.703704


In [58]:
# Number of distinct "chuva" bins present after oversampling
len(contagem_classes)

27

In [59]:
# Persist the oversampled frame as a semicolon-separated CSV, without the index column
df_oversampling.to_csv('../datasets/df_oversampling.csv', sep = ';', index=False)

# Realizando Undersampling

In [60]:
# Build the random undersampler with a fixed seed
undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=42)

# Undersample the original features and discretized target (X, y_discretized),
# not the output of the oversampling step.
# NOTE(review): the recorded output shows only 27 rows remain (one per occupied bin) —
# confirm this is enough data for the downstream model.
X_undersampled, y_undersampled = undersampler.fit_resample(X, y_discretized)

# Assemble the undersampled features and target into a single frame.
# NOTE(review): "chuva" holds the integer bin index here; it is NOT mapped back to a continuous value.
df_undersampling = pd.DataFrame(X_undersampled, columns=X.columns)
df_undersampling["chuva"] = y_undersampled

In [61]:
# Show up to 30 rows of the undersampled frame
df_undersampling.head(30)

Unnamed: 0,air_humidity_100,atm_pressure_main,num_of_resets,piezo_charge,piezo_temperature,chuva
1068,0.616279,0.414013,0.666667,0.563304,0.305556,0
1157,0.94186,0.210191,0.666667,0.37721,0.222222,1
332,0.906977,0.649682,0.333333,0.916535,0.166667,2
1518,0.895349,0.375796,1.0,0.374246,0.222222,3
1155,0.906977,0.22293,0.666667,0.385485,0.25,4
1383,0.918605,0.719745,1.0,0.629109,0.194444,5
539,0.802326,0.433121,0.333333,0.572937,0.277778,7
1515,0.895349,0.44586,1.0,0.378836,0.194444,9
674,0.918605,0.43949,0.333333,0.348311,0.194444,10
1153,0.918605,0.33758,0.666667,0.389046,0.25,11


In [62]:
# Summarize the row count and column dtypes of the undersampled frame
df_undersampling.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, 1068 to 1149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   air_humidity_100   27 non-null     float64
 1   atm_pressure_main  27 non-null     float64
 2   num_of_resets      27 non-null     float64
 3   piezo_charge       27 non-null     float64
 4   piezo_temperature  27 non-null     float64
 5   chuva              27 non-null     int64  
dtypes: float64(5), int64(1)
memory usage: 1.5 KB


In [63]:
# Tally each "chuva" bin in the undersampled frame and its percentage of all rows
contagem_classes = df_undersampling["chuva"].value_counts()
proporcao_classes = contagem_classes.div(len(df_undersampling)).mul(100)
pd.DataFrame(
    {
        "Contagem": contagem_classes,
        "Proporção": proporcao_classes,
    }
)

Unnamed: 0_level_0,Contagem,Proporção
chuva,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,3.703704
20,1,3.703704
90,1,3.703704
54,1,3.703704
48,1,3.703704
40,1,3.703704
36,1,3.703704
33,1,3.703704
27,1,3.703704
24,1,3.703704


In [64]:
# Persist the undersampled frame as a semicolon-separated CSV, without the index column
df_undersampling.to_csv('../datasets/df_undersampling.csv', sep = ';', index=False)

# Geração de dados sintéticos

In [65]:
# Build the BorderlineSMOTE sampler with a fixed seed
borderline_smote = BorderlineSMOTE(sampling_strategy="auto", random_state=42)

# Apply Borderline-SMOTE to the original features and discretized target (X, y_discretized).
# NOTE(review): the recorded class counts show only bin 1 was raised to 1470 samples, while the
# other minority bins keep their original counts — the result is not balanced; confirm this is intended.
X_bsmote, y_bsmote = borderline_smote.fit_resample(X, y_discretized)

# Assemble the resampled features and target into a single frame.
# NOTE(review): "chuva" holds the integer bin index here; it is NOT mapped back to a continuous value.
df_bsmote = pd.DataFrame(X_bsmote, columns=X.columns)
df_bsmote["chuva"] = y_bsmote

In [66]:
# Tally each "chuva" bin in the Borderline-SMOTE frame and its percentage of all rows
contagem_classes = df_bsmote["chuva"].value_counts()
proporcao_classes = contagem_classes.div(len(df_bsmote)).mul(100)
pd.DataFrame(
    {
        "Contagem": contagem_classes,
        "Proporção": proporcao_classes,
    }
)

Unnamed: 0_level_0,Contagem,Proporção
chuva,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1470,49.312311
1,1470,49.312311
3,5,0.167729
2,4,0.134183
4,4,0.134183
7,3,0.100637
54,2,0.067092
9,2,0.067092
13,2,0.067092
40,2,0.067092


In [67]:
# Persist the Borderline-SMOTE frame as a semicolon-separated CSV, without the index column
df_bsmote.to_csv('../datasets/df_bsmote.csv', sep = ';', index=False)