In [1]:
import pandas as pd
import os
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from tpot import TPOTClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Train a TPOT classifier on the bank dataset, report hold-out accuracy,
# write test-set predictions to 'predictions.csv' and export the best
# pipeline found to 'tpot_pipeline.py'.

# Input CSVs live in ../data relative to the notebook's working directory.
train_file_path = os.path.join('..', 'data', 'bank_train.csv')
test_file_path = os.path.join('..', 'data', 'bank_test.csv')

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

# Split the feature columns by dtype: object columns are one-hot encoded,
# everything else is passed through unchanged.
categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()
categorical_features.remove('y')  # the target is not a feature
numerical_features = train_df.select_dtypes(exclude=['object']).columns.tolist()
numerical_features.remove('id')  # the row identifier is not a feature

# Preprocessor: numeric columns untouched, categorical columns one-hot encoded.
# - handle_unknown='ignore': a category level that is absent from the training
#   split (but present in validation/test) is encoded as all zeros instead of
#   raising at predict time, after the long TPOT search has already run.
# - sparse_threshold=0: always return a dense array. NOTE(review): TPOT's
#   default configuration appears to reject sparse input ("TPOT sparse" is the
#   dedicated config) -- confirm against the installed TPOT version.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    sparse_threshold=0)

# Separate features from the target; 'id' is dropped from both train and test.
X = train_df.drop(columns=['id', 'y'])
y = train_df['y']
X_test = test_df.drop(columns=['id'])

# Hold out 20% of the labelled data for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Full pipeline: preprocessing followed by the TPOT search.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('tpot', TPOTClassifier(verbosity=2, generations=10, population_size=40, random_state=42))
])

# Long-running step: with these settings TPOT evaluates 440 pipelines.
pipeline.fit(X_train, y_train)

# Accuracy on the held-out validation split.
y_val_pred = pipeline.predict(X_val)

val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Acurácia no conjunto de validação: {val_accuracy}')

# Predict the unlabelled test set.
predictions = pipeline.predict(X_test)

# Pair each prediction with its test 'id' and write the result to disk.
output = pd.DataFrame({'id': test_df['id'], 'y': predictions})

output.to_csv('predictions.csv', index=False)

# Save the best pipeline found by TPOT as standalone Python code.
pipeline.named_steps['tpot'].export('tpot_pipeline.py')


                                                                               
Generation 1 - Current best internal CV score: 0.9009667302834556
                                                                                
Generation 2 - Current best internal CV score: 0.9018169454660544
Optimization Progress:  28%|██▊       | 125/440 [31:57<2:12:42, 25.28s/pipeline]

In [None]:
# Re-encode the string labels in 'predictions.csv' as integers and save the
# result as 'predictions_fixed.csv'. Any label that is not a key of the
# mapping becomes NaN (pandas `Series.map` semantics).
label_to_int = {'yes': 1, 'no': 0}

df = pd.read_csv('predictions.csv')
df = df.assign(y=df['y'].map(label_to_int))

df.to_csv('predictions_fixed.csv', index=False)

In [None]:
# Fill gaps in the 'id' sequence of 'predictions_fixed.csv' with placeholder
# labels (about 90% zeros / 10% ones) and overwrite the file with the result.
df = pd.read_csv('predictions_fixed.csv')

# Every integer between the smallest and largest id present is expected;
# any id in that range that does not appear in the file counts as missing.
# Gaps before the first or after the last present id are not detected.
expected_ids = set(range(df['id'].min(), df['id'].max() + 1))
present_ids = set(df['id'])
missing_ids = sorted(expected_ids - present_ids)

# One row per missing id. The column is built with the same dtype as df['id']
# so that an empty list (nothing missing, e.g. when this cell is re-run on its
# own output) does not change the dtype of 'id' in the concatenation below.
missing_ids_df = pd.DataFrame({'id': pd.Series(missing_ids, dtype=df['id'].dtype)})

# Number of ids that need a placeholder label.
num_missing = len(missing_ids_df)

# Target share of each label among the placeholders.
proportion_zeros = 0.9
proportion_ones = 0.1  # informational: the count of ones is the remainder below

# The zero count is truncated; ones take whatever is left so that every
# missing id receives exactly one label.
num_zeros = int(num_missing * proportion_zeros)
num_ones = num_missing - num_zeros

# Build the label vector with the desired proportion.
y_values = np.concatenate([
    np.zeros(num_zeros, dtype=int),
    np.ones(num_ones, dtype=int)
])

# Shuffle so the ones are not all assigned to the highest missing ids. A seeded
# generator makes the written file identical across runs.
fill_rng = np.random.default_rng(42)
fill_rng.shuffle(y_values)

# Attach the placeholder labels to the missing ids.
missing_ids_df['y'] = y_values

# Merge original and placeholder rows, ordered by id.
combined_df = pd.concat([df, missing_ids_df], ignore_index=True).sort_values(by='id').reset_index(drop=True)

# Sanity check: recompute the gaps on the combined frame (expected to be empty).
expected_ids_combined = set(range(combined_df['id'].min(), combined_df['id'].max() + 1))
present_ids_combined = set(combined_df['id'])
missing_ids_combined = sorted(expected_ids_combined - present_ids_combined)

# Show the combined frame and any ids still missing.
print(combined_df)
print("IDs faltantes após combinação:", missing_ids_combined)

# Overwrites the file this cell read from.
combined_df.to_csv('predictions_fixed.csv', index=False)

         id  y
0     40000  0
1     40001  0
2     40002  1
3     40003  0
4     40004  0
...     ... ..
5206  45206  1
5207  45207  0
5208  45208  0
5209  45209  0
5210  45210  0

[5211 rows x 2 columns]
IDs faltantes após combinação: []
