# **Preparação dos Dados (Data Preparation)**

Equipe:
* Beatriz Andrade de Miranda - bam2@cin.ufpe.br
* Camila Siqueira Lins - csl2@cin.ufpe.br
* Luisa Cavalcante - lncc@cin.ufpe.br
* Nicolly Lira Albuquerque - nla@cin.ufpe.br


# Importações e configurações

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

#SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

#Centoides
from sklearn.cluster import KMeans
from imblearn.under_sampling import RandomUnderSampler

#Base de dados

In [3]:
statlog_shuttle = fetch_ucirepo(id=148)
X = statlog_shuttle.data.features
y = statlog_shuttle.data.targets
df = pd.DataFrame(X)
df['class'] = y

In [4]:
print(f"Número de amostras: {df.shape[0]}")
print(f"Número de atributos (incluindo o alvo): {df.shape[1]}")

Número de amostras: 58000
Número de atributos (incluindo o alvo): 8


In [5]:
original_df = df.copy()

In [6]:
df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,45586
4,8903
5,3267
3,171
2,50
7,13
6,10


# 1. Remoção de Duplicatas

Existem 16.354 registros duplicados que podem distorcer a previsão de classificadores por reduzir a variância dos dados de suas respectivas classes.

In [7]:
def remove_duplicates(df):
    # 1. Remoção de Duplicatas
  return  df.drop_duplicates()


In [8]:
print("Registros antes de remover duplicados ")
print(len(df))
df = remove_duplicates(df)
print("Registros depois de remover duplicados ")
print(len(df))
df['class'].value_counts()

Registros antes de remover duplicados 
58000
Registros depois de remover duplicados 
41646


Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,31535
4,7394
5,2524
3,132
2,40
7,11
6,10



## 2. Tratamento de Outliers da classe majoritária

Variáveis como Fpv Close, High, e Bpv Open apresentam um número significativo de outliers que podem ser ruídos ou indicativos de anomalias importantes. Os intervalos interquartís (IQR) foram observados para identificar valores extremos.


In [9]:
df['class'] = df['class'].replace({1:'Rad Flow',2:'Fpv Close',3:'Fpv Open', 4:'High',5:'Bypass',6:'Bpv Close',7:'Bpv Open'} )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class'] = df['class'].replace({1:'Rad Flow',2:'Fpv Close',3:'Fpv Open', 4:'High',5:'Bypass',6:'Bpv Close',7:'Bpv Open'} )


In [10]:

def remove_outliers_iqr(data, column, multiplier=1.5):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

In [11]:
def treat_outliers(df):
    numeric_columns = df.select_dtypes(include=np.number).columns
    df_rad_flow = df[df['class'] == 'Rad Flow']
    df_fpv_close = df[df['class'] == 'Fpv Close']
    df_fpv_open = df[df['class'] == 'Fpv Open']
    df_high = df[df['class'] == 'High']
    df_bypass = df[df['class'] == 'Bypass']
    df_bpv_close = df[df['class'] == 'Bpv Close']
    df_bpv_open = df[df['class'] == 'Bpv Open']
    print(f"Tamanho inicial do DataFrame radflow: {len(df_rad_flow)}")
    print(f"Tamanho inicial do DataFrame high: {len(df_high)}")
    print(f"Tamanho inicial do DataFrame bypass: {len(df_bypass)}")
    initial_size = len(df_rad_flow)

    method = 'iqr'
    multiplier = 2.5
    percentile_threshold = 2.5

    for col in numeric_columns:
        if method == 'iqr':
            df_rad_flow = remove_outliers_iqr(df_rad_flow, col, multiplier=multiplier)
            df_high = remove_outliers_iqr(df_high, col, multiplier=multiplier)
            df_bypass = remove_outliers_iqr(df_bypass, col, multiplier=multiplier)

        else:
            raise ValueError("Método inválido. Escolha entre 'iqr' ou 'percentile'.")

    print(f"Tamanho final do DataFrame radflow após tratamento de outliers: {len(df_rad_flow)}")
    print(f"Tamanho final do DataFrame high após tratamento de outliers: {len(df_high)}")
    print(f"Tamanho final do DataFrame bypass após tratamento de outliers: {len(df_bypass)}")

    final_size = len(df_rad_flow)
    removed_percentage = ((initial_size - final_size) / initial_size) * 100
    print(f"Percentual de registros removidos: {removed_percentage}%")

    dfs = [df_rad_flow, df_high, df_bypass, df_fpv_open, df_fpv_close, df_bpv_open, df_bpv_close]

    # Unindo todos os DataFrames em um único
    df_combined = pd.concat(dfs, axis=0, ignore_index=True)
    print(len(df_combined))

    return df_combined






In [12]:
df_wo_outliers = treat_outliers(df)



Tamanho inicial do DataFrame radflow: 31535
Tamanho inicial do DataFrame high: 7394
Tamanho inicial do DataFrame bypass: 2524
Tamanho final do DataFrame radflow após tratamento de outliers: 17491
Tamanho final do DataFrame high após tratamento de outliers: 4702
Tamanho final do DataFrame bypass após tratamento de outliers: 1835
Percentual de registros removidos: 44.534644046297764%
24221


In [13]:
df_wo_outliers['class'].value_counts()


Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
Rad Flow,17491
High,4702
Bypass,1835
Fpv Open,132
Fpv Close,40
Bpv Open,11
Bpv Close,10


##3. Separação de conjuntos de treino e teste

In [14]:
len(df_wo_outliers)

24221

In [15]:
df_wo_outliers['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
Rad Flow,17491
High,4702
Bypass,1835
Fpv Open,132
Fpv Close,40
Bpv Open,11
Bpv Close,10


In [16]:
X = df_wo_outliers.drop('class', axis=1)
y = df_wo_outliers['class']

In [17]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [18]:
print("Tamanho base de treino:"+ str(len(X_train)))
print("Tamanho base de validação:"+ str(len(X_val)))
print("Tamanho base de teste:"+ str(len(X_test)))

Tamanho base de treino:14532
Tamanho base de validação:4844
Tamanho base de teste:4845


In [19]:
df_train = pd.concat([X_train, y_train], axis=1)

In [20]:
df_val = pd.concat([X_val, y_val], axis=1)

In [21]:
df_test = pd.concat([X_test, y_test], axis=1)

## 4. Balancear dataset

In [22]:

def balance_dataset(df, target_samples=1800, smote_neighbors=5, random_state=42):
    """
    Balanceia um dataset com uma combinação de oversampling (SMOTE) e undersampling.

    Parâmetros:
    - X: Dados de entrada (numpy array ou DataFrame)
    - y: Rótulos correspondentes (numpy array ou Series)
    - target_samples: Número de amostras desejadas por classe
    - smote_neighbors: Número de vizinhos usados pelo SMOTE
    - random_state: Semente para reprodução de resultados

    Retorna:
    - X_balanced: Dados balanceados
    - y_balanced: Rótulos balanceados
    """
    X = df.drop('class', axis=1)
    y = df['class']
    # Contar as instâncias de cada classe
    class_counts = Counter(y)

    # Definir estratégias de SMOTE e undersampling
    smote_strategy = {cls: target_samples for cls, count in class_counts.items() if count < target_samples}
    undersample_strategy = {cls: target_samples for cls, count in class_counts.items() if count > target_samples}

    # Inicializar SMOTE e RandomUnderSampler com as estratégias calculadas
    smote = SMOTE(sampling_strategy=smote_strategy, k_neighbors=smote_neighbors, random_state=random_state)
    undersample = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=random_state)

    # Pipeline para aplicar SMOTE e depois undersampling
    pipeline = Pipeline(steps=[('smote', smote), ('undersample', undersample)])

    # Aplicar o pipeline
    X_balanced, y_balanced = pipeline.fit_resample(X, y)

    return X_balanced, y_balanced




In [23]:
df_train['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
Rad Flow,10492
High,2819
Bypass,1102
Fpv Open,77
Fpv Close,29
Bpv Open,7
Bpv Close,6


In [24]:
# Exemplo de uso
# Suponha que X_train e y_train sejam arrays numpy com os dados e rótulos
X_balanced_train, y_balanced_train = balance_dataset(df_train, target_samples=1102, smote_neighbors=5)
df_train_balanced = pd.concat([X_balanced_train, y_balanced_train], axis=1)
# Mostrar a nova distribuição
print("Distribuição após o balanceamento:", Counter(y_balanced_train))

Distribuição após o balanceamento: Counter({'Bpv Close': 1102, 'Bpv Open': 1102, 'Bypass': 1102, 'Fpv Close': 1102, 'Fpv Open': 1102, 'High': 1102, 'Rad Flow': 1102})


In [25]:
df_train_balanced['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
Bpv Close,1102
Bpv Open,1102
Bypass,1102
Fpv Close,1102
Fpv Open,1102
High,1102
Rad Flow,1102


### 5 Normalização

In [26]:
def norm(df):
  X = df.drop(columns = ['class'])
  y = df['class']
  numeric_columns = X.select_dtypes(include=np.number).columns
  scaler = MinMaxScaler()
  X[numeric_columns] = scaler.fit_transform(X[numeric_columns])
  df = pd.concat([X, y], axis=1)
  return df

In [27]:
df_train_final = norm(df_train_balanced)
df_test_final = norm(df_test)
df_val_final = norm(df_val)

In [28]:
df_train_final.head()

Unnamed: 0,Rad Flow,Fpv Close,Fpv Open,High,Bypass,Bpv Close,Bpv Open,class
2068,0.358974,0.018817,0.762712,0.409524,0.108108,0.201835,0.342857,Bpv Close
4414,0.076923,0.018817,0.559322,0.409524,0.175676,0.321101,0.414286,Bpv Close
9012,0.153846,0.018817,0.677966,0.409524,0.297297,0.220183,0.271429,Bpv Close
9662,0.025641,0.010753,0.627119,0.657143,0.256757,0.220183,0.285714,Bpv Close
11905,0.820513,0.008065,0.694915,0.419048,0.189189,0.431193,0.485714,Bpv Close


In [29]:
df_test_final.head()

Unnamed: 0,Rad Flow,Fpv Close,Fpv Open,High,Bypass,Bpv Close,Bpv Open,class
13514,0.128205,0.918782,0.745763,0.621951,0.558824,0.154545,0.0,Rad Flow
6409,0.128205,0.918782,0.457627,0.512195,0.602941,0.454545,0.222222,Rad Flow
5096,0.076923,0.918782,0.525424,0.341463,0.573529,0.363636,0.174603,Rad Flow
11361,0.051282,0.918782,0.644068,0.219512,0.573529,0.218182,0.047619,Rad Flow
5053,0.128205,0.918782,0.389831,0.365854,0.602941,0.527273,0.285714,Rad Flow


In [30]:
df_val_final.head()

Unnamed: 0,Rad Flow,Fpv Close,Fpv Open,High,Bypass,Bpv Close,Bpv Open,class
13908,0.179487,0.928571,0.559322,0.367347,0.632353,0.351852,0.126984,Rad Flow
9492,0.128205,0.928571,0.762712,0.244898,0.485294,0.12037,0.015873,Rad Flow
16126,0.051282,0.928571,0.627119,0.316327,0.573529,0.240741,0.063492,Rad Flow
17572,0.589744,0.928571,0.830508,0.183673,0.588235,0.212963,0.031746,High
11767,0.153846,0.928571,0.491525,0.683673,0.617647,0.416667,0.190476,Rad Flow


#Armazenando o dataset pré-processado

In [31]:
df_train_final.to_csv('df_train_final.csv', index=False)
df_test_final.to_csv('df_test_final.csv', index=False)
df_val_final.to_csv('df_val_final.csv', index=False)