In [65]:
import pandas as pd
import numpy as np
import statistics
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [66]:
import warnings
warnings.filterwarnings('ignore')

In [67]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [68]:
np.unique(train['Survived'], return_counts=True)

(array([0, 1]), array([549, 342]))

In [69]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [70]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [71]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [72]:
col = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train = train.loc[:, ~train.columns.isin(col)]

In [73]:
train.loc[:, 'Sex'] = train.loc[:, 'Sex'].replace({'male' : 1, 'female' : 0})
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


In [74]:
# Fazendo o OneHotEncoder da classe Embarked.

encoded_df = pd.get_dummies(train['Embarked'], columns=['Embarked']).astype(int)

print(encoded_df.value_counts())

C  Q  S
0  0  1    644
1  0  0    168
0  1  0     77
   0  0      2
Name: count, dtype: int64


In [75]:
train.drop(columns = ['Embarked'], inplace = True)
train = pd.concat([train, encoded_df], axis = 1)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [76]:
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
C             0
Q             0
S             0
dtype: int64

In [77]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [78]:
# Usando o KNN inputer para preencher os valores não nulos.

imputer = KNNImputer(n_neighbors = 5)
train = pd.DataFrame(imputer.fit_transform(train), columns = train.columns).astype(float)
train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
C           0
Q           0
S           0
dtype: int64

In [79]:
# Escalonando os dados com MinMaxScaler : 
#scaler = MinMaxScaler()
#x = scaler.fit_transform(train.loc[:, ~train.columns.isin(['Survived'])])
#y = train.loc[:, 'Survived']

In [80]:
# Escalonando os dados com o StandardScaler : 

scaler = StandardScaler()
train.loc[:, ['Age', 'Fare']] = scaler.fit_transform(train.loc[:, ['Age', 'Fare']])

x = train.loc[:, ~train.columns.isin(['Survived'])]
y = train.loc[:, 'Survived']

In [81]:
data_norm = pd.DataFrame(np.column_stack((y, x)), columns = train.columns)

data_norm.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0.0,3.0,1.0,-0.587949,1.0,0.0,-0.502445,0.0,0.0,1.0
1,1.0,1.0,0.0,0.58226,1.0,0.0,0.786845,1.0,0.0,0.0
2,1.0,3.0,0.0,-0.295397,0.0,0.0,-0.488854,0.0,0.0,1.0
3,1.0,1.0,0.0,0.362846,1.0,0.0,0.42073,0.0,0.0,1.0
4,0.0,3.0,1.0,0.362846,0.0,0.0,-0.486337,0.0,0.0,1.0


In [82]:
data_norm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    891 non-null    float64
 2   Sex       891 non-null    float64
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    float64
 5   Parch     891 non-null    float64
 6   Fare      891 non-null    float64
 7   C         891 non-null    float64
 8   Q         891 non-null    float64
 9   S         891 non-null    float64
dtypes: float64(10)
memory usage: 69.7 KB


In [83]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.8)

In [84]:
X_train.shape, X_test.shape

((712, 9), (179, 9))

In [None]:
%%time

# Criando o modelo classificador com o otimizador ADAM : 

mlp_with_adam = MLPClassifier(solver = 'adam', max_iter = 150)

n_neurons = [
    (i, j) for i in range(1, 10) for j in range(1, 5)
]+ [
    (i, j, k) for i in range(1, 10) for j in range(1, 10) for k in range(1, 5)
]

actv = ['logistic', 'tanh', 'relu', 'identity']

parameters = {'hidden_layer_sizes': n_neurons,
             'activation' : actv}

mlp_adam_clf = GridSearchCV(estimator = mlp_with_adam, param_grid = parameters, cv = 10, scoring='accuracy').fit(X_train, y_train)

bparam_adam_hidd = mlp_adam_clf.best_params_['hidden_layer_sizes']
bparam_adam_actv = mlp_adam_clf.best_params_['activation']

print(f"The best param is {bparam_adam_hidd}, {bparam_adam_actv} and the accuracy is : {mlp_adam_clf.best_score_}")

In [None]:
%%time
# Fazendo um loop para pegar a acurácia média do classificador : 

result_ml_adam = list()

mlp_adam = MLPClassifier(hidden_layer_sizes = bparam_adam_hidd, activation = bparam_adam_actv, 
                         solver = 'adam', max_iter = 150)


for i in range(10):
    x, y = shuffle(x, y, random_state = 42)
    result_ml_adam.append(np.mean(cross_val_score(mlp_adam, X_train, y_train, cv = 10)))

result_ml_adam = np.array(result_ml_adam)
print(f"The accuracy for mlp is {np.mean(result_ml_adam)} +- {statistics.stdev(result_ml_adam)}")

In [None]:
# Pegando os dados para a curva AUC gerada pelo modelo bayesiano : 

y_pred_proba_adam = mlp_adam.fit(X_train, y_train).predict_proba(X_test)[:, 1]

fpr_adam, tpr_adam, thresholds_adam = metrics.roc_curve(y_test,  y_pred_proba_adam)
auc_adam = metrics.roc_auc_score(y_test, y_pred_proba_adam)

print(f"The AUC value for the mlp using adam is : {auc_adam}")

In [None]:
(y_pred_proba_adam > 0.5).astype(int)

In [None]:
# Compute accuracy for each threshold
import sys
accuracies_adam = list()
#rint(mlp_adam.predict(test))
#y_pred = (y_pred_proba_adam >= 0.5).astype(int)

#print(y_pred)
#print(accuracy_score(y,y_pred))

for threshold in thresholds_adam:
    y_pred = (y_pred_proba_adam >= threshold).astype(int)
    accuracies_adam.append(accuracy_score(y_test, y_pred))
    #print(accuracy_score(y, y_pred), threshold)
    #print(y_pred)

# Find the threshold with the maximum accuracy
max_acc_adam_idx = np.argmax(accuracies_adam)
opt_thresh_adam = thresholds_adam[max_acc_adam_idx]
max_accuracy_adam = accuracies_adam[max_acc_adam_idx]


print(f'Optimal Threshold: {opt_thresh_adam}')
print(f'Maximum Accuracy: {max_accuracy_adam}')


plt.figure()
plt.plot(fpr_adam, tpr_adam, label='ROC curve')
plt.plot([0, 1], [0, 1], 'k--')
plt.scatter(fpr_adam[max_acc_adam_idx], tpr_adam[max_acc_adam_idx], color='red', label='Optimal Threshold ADAM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

In [None]:
confusion_matrix_2 = metrics.confusion_matrix(y_test, (y_pred_proba_adam > opt_thresh_adam).astype(int))
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix_2, display_labels = [False, True])
#plt.title(' TEST DATA RESULT')
cm_display.plot()
print(accuracy_score(y_test, (y_pred_proba_adam > opt_thresh_adam).astype(int)))
plt.show()

In [None]:
test = pd.read_csv("test.csv")
test.head()

In [None]:
test.info()

In [None]:
test.describe()

In [None]:
test.info()

In [None]:
PassengerId = test['PassengerId']
col = ['PassengerId', 'Name', 'Ticket', 'Cabin']
test = test.loc[:, ~test.columns.isin(col)]

In [None]:
test.loc[:, 'Sex'] = test.loc[:, 'Sex'].replace({'male' : 1, 'female' : 0})
test.head()

In [None]:
# Realizando o Encoder das variáveis categóricas de teste : 

encoded_df = pd.get_dummies(test['Embarked'], columns=['Embarked']).astype(int)

print(encoded_df.value_counts())

In [None]:
test.drop(columns = ['Embarked'], inplace = True)
test = pd.concat([test, encoded_df], axis = 1)
test.head()

In [None]:
# O KNN já foi definido no código acima, apenas iremos reutilizá-lo : 

test = pd.DataFrame(imputer.fit_transform(test), columns = test.columns)
test = test.astype(float)
test.isna().sum()

In [None]:
#scaler = MinMaxScaler()

#test_norm = scaler.fit_transform(test)
#test_norm = pd.DataFrame(test_norm, columns=test.columns)
#test_norm.head()

In [None]:
test

In [None]:
# Escalonando os dados com o StandardScaler : 

scaler = StandardScaler()

test.loc[:, ['Age', 'Fare']] = scaler.fit_transform(test.loc[:, ['Age', 'Fare']])
test_norm = pd.DataFrame(test, columns=test.columns)
test_norm.head()

In [None]:
# Escalonando os dados com o StandardScaler : 

#scaler = StandardScaler()
#test_norm = scaler.fit_transform(test)
#test_norm = pd.DataFrame(test_norm, columns=test.columns)
#test_norm.head()

#test = scaler.fit(

In [None]:
pred = mlp_adam.predict(test)
np.unique(pred, return_counts = True)

In [None]:
pred

In [None]:
pred = mlp_adam.predict_proba(test)[:, 1]
pred = (pred >= opt_thresh_adam).astype(int)


pred_arr = np.column_stack((PassengerId, pred))

# Create the DataFrame
predictions_df = pd.DataFrame(pred_arr, columns=['PassengerId', 'Survived']).astype(int)
predictions_df.to_csv('predictions.csv', index=False)

In [None]:
np.unique(pred, return_counts=True)

In [None]:
pred