In [38]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from keras_tuner import BayesianOptimization
import numpy as np

Reloading Tuner from my_dir/bank_nn/tuner0.json
[1m 35/162[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step  

  saveable.load_own_variables(weights_store.get(inner_path))


[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Predições salvas em 'predictions_nn.csv'.


In [None]:
# Load the training and test splits from the shared data directory.
train_data = pd.read_csv('../data/bank_train.csv')
test_data = pd.read_csv('../data/bank_test.csv')

# 'id' is dropped from the features on both splits; the test ids are kept
# separately so they can be written next to the predictions later on.
ids_test = test_data['id']
X_test = test_data.drop(columns=['id'])
y_train = train_data['y']
X_train = train_data.drop(columns=['y', 'id'])

# Split the feature columns by dtype: numeric columns are standardised,
# object (string) columns are one-hot encoded.
numerical_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

feature_transformers = [
    ('num', StandardScaler(), numerical_features),
    # handle_unknown='ignore': categories seen only at transform time do not raise.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

# Fit the preprocessing on the training features only, then reuse the fitted
# transformer on the test features.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Target pipeline: class labels -> integer codes -> one-hot rows, matching
# the softmax output layer and categorical cross-entropy loss used by build_model.
# label_encoder is kept so predictions can be mapped back to the original labels.
label_encoder = LabelEncoder()
y_train_encoded = to_categorical(label_encoder.fit_transform(y_train))

def build_model(hp):
    """Build and compile a dense softmax classifier from tuner hyperparameters.

    Hyperparameters requested from ``hp``:
      * ``num_layers``  - number of hidden Dense layers, 1 to 4.
      * ``units_<i>``   - width of hidden layer ``i``, 32 to 256 in steps of 32.
      * ``activation``  - 'relu', 'tanh' or 'sigmoid'; the same name is requested
        for every layer, so a single choice applies to all hidden layers.
      * ``optimizer``   - 'adam', 'rmsprop' or 'sgd', each built with its
        default constructor arguments.

    Every hidden layer is followed by ``Dropout(0.5)``. The output layer has
    2 softmax units (one per target class).

    Args:
        hp: hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss
        and reporting accuracy.
    """
    model = Sequential()

    hidden_layer_count = hp.Int('num_layers', 1, 4)
    for layer_idx in range(hidden_layer_count):
        # Units are requested before the activation so the order in which
        # hyperparameters are registered stays fixed.
        width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=256, step=32)
        activation_name = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])
        for layer in (Dense(units=width, activation=activation_name), Dropout(0.5)):
            model.add(layer)

    # Output layer: 2 classes for the target variable.
    model.add(Dense(2, activation='softmax'))

    # Look up the optimizer class by name; anything unrecognised falls back to SGD.
    optimizer_classes = {'adam': Adam, 'rmsprop': RMSprop, 'sgd': SGD}
    optimizer_name = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])
    optimizer_instance = optimizer_classes.get(optimizer_name, SGD)()

    model.compile(
        optimizer=optimizer_instance,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Bayesian hyperparameter search over the space defined in build_model,
# selecting on validation accuracy. Tuner state is kept under
# my_dir/bank_nn.
# NOTE(review): the captured output shows the tuner being reloaded from that
# directory, so earlier trials there appear to be reused - confirm this is intended.
tuner_settings = dict(
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='my_dir',
    project_name='bank_nn',
)
tuner = BayesianOptimization(build_model, **tuner_settings)

# Each trial trains for 60 epochs; 20% of the training data is held out
# for validation.
fit_settings = dict(epochs=60, batch_size=32, validation_split=0.2)
tuner.search(X_train_preprocessed, y_train_encoded, **fit_settings)

# Take the single best model found by the search.
best_models = tuner.get_best_models(num_models=1)
best_model = best_models[0]

# Class probabilities -> index of the most likely class -> original label.
y_pred_prob = best_model.predict(X_test_preprocessed)
y_pred = y_pred_prob.argmax(axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Write one row per test id with its predicted label.
output = pd.DataFrame({'id': ids_test, 'y': y_pred_labels})
output.to_csv('predictions_nn.csv', index=False)

print("Predições salvas em 'predictions_nn.csv'.")


In [46]:
# Convert the string labels in the saved predictions to integers
# ('yes' -> 1, 'no' -> 0) and store the result in a separate file.
# Any value other than 'yes'/'no' becomes NaN, because Series.map is used.
label_to_int = {'yes': 1, 'no': 0}

df = pd.read_csv('predictions_nn.csv')
df = df.assign(y=df['y'].map(label_to_int))

df.to_csv('predictions_nn_fixed.csv', index=False)

In [47]:
def _find_missing_ids(ids):
    """Return the sorted ids absent from the contiguous range spanned by ``ids``.

    The range runs from the smallest to the largest id present (inclusive),
    so ids missing before the first or after the last present id cannot be
    detected here.

    Args:
        ids: iterable of integer ids (e.g. a DataFrame column).

    Returns:
        Sorted list of the integers in ``[min(ids), max(ids)]`` that do not
        occur in ``ids``; an empty list when ``ids`` is empty.
    """
    present = set(ids)
    if not present:
        return []
    return sorted(set(range(min(present), max(present) + 1)) - present)


# Fixed seed for the random fill below, so that re-running the notebook
# from the same predictions writes the same file.
FILL_SEED = 42

df = pd.read_csv('predictions_nn_fixed.csv')

# Identify the ids that are missing from the predictions file.
missing_ids = _find_missing_ids(df['id'])

# One row per missing id. The explicit integer dtype keeps the 'id' column
# numeric even when there is nothing missing (an empty list would otherwise
# produce an object-dtype column that takes part in the concat below).
missing_ids_df = pd.DataFrame({'id': pd.Series(missing_ids, dtype='int64')})

# Number of ids that need a value.
num_missing = len(missing_ids_df)

# Share of 0s and 1s assigned to the missing ids.
# NOTE(review): these rows are random guesses drawn at a fixed 90/10 ratio,
# not model predictions - confirm this is acceptable for the final file.
proportion_zeros = 0.9
proportion_ones = 0.1  # informational only; the count of 1s is the remainder below

# Count of 0s is truncated; the 1s take whatever is left so every missing
# id receives a value.
num_zeros = int(num_missing * proportion_zeros)
num_ones = num_missing - num_zeros

# Build the values for the 'y' column in the requested proportion.
y_values = np.concatenate([
    np.zeros(num_zeros, dtype=int),
    np.ones(num_ones, dtype=int)
])

# Shuffle so the 1s are spread across the missing ids; a seeded generator
# makes the assignment reproducible.
np.random.default_rng(FILL_SEED).shuffle(y_values)

# Attach the values to the missing-id rows.
missing_ids_df['y'] = y_values

# Merge predicted and filled rows, ordered by id.
combined_df = pd.concat([df, missing_ids_df], ignore_index=True).sort_values(by='id').reset_index(drop=True)

# Re-check for gaps after the merge (expected to be empty).
missing_ids_combined = _find_missing_ids(combined_df['id'])

# Show the combined frame and any ids still missing.
print(combined_df)
print("IDs faltantes após combinação:", missing_ids_combined)

# Overwrites the input file with the gap-free version.
combined_df.to_csv('predictions_nn_fixed.csv', index=False)

         id  y
0     40000  0
1     40001  0
2     40002  0
3     40003  0
4     40004  0
...     ... ..
5206  45206  1
5207  45207  0
5208  45208  0
5209  45209  0
5210  45210  0

[5211 rows x 2 columns]
IDs faltantes após combinação: []
