In [None]:
import numpy as np
import pandas  as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, accuracy_score
import warnings as ww
ww.filterwarnings('ignore')
sns.set_style('darkgrid')

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
train.head(3)

In [None]:
test.head(3)

In [None]:
print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

In [None]:
train['Df'] = 'Train'
test['Df'] = 'Test'

In [None]:
# Stack both frames so cleaning and encoding are applied to them together.
# ignore_index=True gives every row a unique label; a plain concat keeps each
# frame's own labels, so the same label would refer to two different rows.
df = pd.concat([train, test], ignore_index=True)
df.sample(5, random_state=88)

In [None]:
# checking duplicated
df.duplicated().sum()

In [None]:
# checking empty values
df.isna().mean()

In [None]:
# Missing-value map: one row per column of df, highlighted where a value is NaN.
fig, ax = plt.subplots(figsize=(10,7))
sns.heatmap(df.isna().T, ax=ax)
ax.set_title('ISNA MAP', loc='left', weight='bold', size=14)
plt.show()

In [None]:
df.drop(columns=['Cabin','Name'], inplace=True)

**INSIGHT**
Because of the high proportion of missing values in the Cabin variable (77%) and because the Name variable is not useful for this analysis, both are removed from the dataset.

#### EDA - Categorical Variables
Analysis of categorical variables

In [None]:
# Categorical columns analysed below. The list is written out explicitly: a
# dtype-based lookup was computed here before but immediately overwritten, and it
# would also have picked up the helper column 'Df' (a string flag, not a feature).
categorical = ['Sex', 'Ticket', 'Embarked']
print(categorical)

In [None]:
df[categorical].describe()

In [None]:
print(f'{df["Sex"].value_counts()}\n')
print(25*'=')
print(f'{df["Embarked"].value_counts()}')
print(25*'=')

#### EDA Numerical Variables
Analysis of numerical variables

In [None]:
# Numerical columns analysed below. The list is written out explicitly: the
# dtype-based lookup that used to precede it was immediately overwritten, so
# only this assignment ever had an effect.
numerical = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
print(numerical)

In [None]:
df[numerical].describe().T.round(1)

In [None]:
# distribution of numerical variables

sns.set_style('darkgrid')
plt.figure(figsize=(16,6))
# one panel per numerical column, laid out on a 2 x 3 grid
for panel, column in enumerate(numerical, start=1):
    plt.subplot(2,3,panel)
    sns.histplot(df[column], kde=True, edgecolor='white')
    plt.title(column, loc='left', weight='bold', size=14)
    plt.xticks(size=8)
    plt.yticks(size=8)
    plt.ylabel('frequency', size=9)
    plt.xlabel(column, size=9)
plt.tight_layout()

**INSIGHT**
The variables Age, SibSp, Parch and Fare appear to have outliers; I will check them below using z-scores.

In [None]:
# checking zscore outliers
# A row counts as regular when every numerical column lies within 3 standard
# deviations of that column's mean. Columns such as Age and Fare still contain
# missing values at this point (they are imputed further down), so the z-scores
# are computed with nan_policy='omit'; with the default policy a single NaN turns
# the whole column's z-scores into NaN and every row fails the `< 3` test.
# A missing value itself is not treated as an outlier.

regs = np.ones(len(df), dtype=bool)
for col in numerical:
    values = df[col].to_numpy(dtype=float)
    zscores = np.abs(stats.zscore(values, nan_policy='omit'))
    regs &= np.isnan(zscores) | (zscores < 3)

# Wrap the mask in a Series so that value_counts() is always available.
regs = pd.Series(regs, index=df.index, name='within_3_zscores')
regs.value_counts()

**INSIGHT**
As there are no outliers with an absolute z-score above three, I choose not to change the data in these columns.

In [None]:
# checking correlations
df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(7,4))
sns.heatmap(corr_matrix, annot=True, cmap='Blues', linewidths=0.5, ax=ax)
ax.set_title('CORRELATION MAP', loc='left', weight='bold', size=14)
plt.show()

**INSIGHT**
The variables themselves have a strong correlation with the target variable, I will keep them all for modeling.

In [None]:
df.groupby(['Survived','Embarked','Sex']).agg({'Age':'describe'}).reset_index()

In [None]:
# Impute missing ages from passenger attributes only: the median age of the
# passenger's (Embarked, Sex) group.
# The survival label is deliberately not used. It is the prediction target, so
# conditioning a feature on it leaks the answer into the training data, and it is
# not known for the rows that have to be predicted.
age_group_median = df.groupby(['Embarked', 'Sex'])['Age'].transform('median')
# np.where works positionally, so no index alignment is involved.
df['Age'] = np.where(df['Age'].isna(), age_group_median, df['Age'])
# Rows whose group has no usable median (for example a missing Embarked value)
# fall back to the overall mean. The result is assigned back rather than using
# fillna(inplace=True) on the column selection, which may act on a temporary copy.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
df.isna().sum()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form can operate on a temporary object and leave
# df unchanged (pandas deprecates it; under copy-on-write it has no effect).
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

In [None]:
df.describe().T.round(1)

### PRE-PROCESS
Data pre-processing to build models

In [None]:
df.sample(3)

In [None]:
cat_to_encoding = ['Sex','Embarked']

In [None]:
le = LabelEncoder()

# Encode each categorical column as integer codes. Missing values are replaced by
# the column's most frequent value first, so that NaN is never turned into a
# category of its own (or rejected by the encoder, depending on the version).
for col in cat_to_encoding:
    most_frequent = df[col].mode().iloc[0]
    df[col] = le.fit_transform(df[col].fillna(most_frequent))

In [None]:
df.sample(5, random_state=99)

In [None]:
df['Ticket'].nunique()

In [None]:
# Keep only the last whitespace-separated token of each ticket, i.e. the number
# without its optional prefix ("A/5 21171" -> "21171").
# Taking the second token instead breaks three-part tickets:
# "STON/O 2. 3101282" would become "2." rather than "3101282".
# Tickets without whitespace are returned unchanged.
df['Ticket'] = df['Ticket'].str.split().str[-1]

In [None]:
df['Ticket'] = pd.to_numeric(df['Ticket'], errors='coerce')

In [None]:
df.isna().sum()

In [None]:
# Tickets that could not be converted to a number are NaN after to_numeric;
# replace them with the mean. The result is assigned back because
# fillna(inplace=True) on a column selection is not guaranteed to modify df.
df['Ticket'] = df['Ticket'].fillna(df['Ticket'].mean())

In [None]:
df.corr(numeric_only=True)

In [None]:
df.columns

In [None]:
standardizer = StandardScaler()

In [None]:
# Columns that are standardised (identifier, origin flag and target are excluded).
scaled_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']
# Learn mean and standard deviation from the training rows only, then apply the
# same transformation to every row, so that statistics of the rows to be
# predicted do not leak into the features used for training.
standardizer.fit(df.loc[df['Df'] == 'Train', scaled_cols])
df[scaled_cols] = standardizer.transform(df[scaled_cols])

In [None]:
df = df[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked', 'Df', 'Survived']]

In [None]:
df.sample(5)

### TRAIN AND TEST DATA
Splitting the dataset into training and test sets for the models

In [None]:
# Feature matrix and target built from the rows flagged Df == 'Train'.
# NOTE(review): only 'Df' and 'Survived' are dropped, so the unscaled
# 'PassengerId' column remains in X and is used as a model feature — confirm this
# is intended. Removing it here also requires removing it from the frame passed
# to predict() in the submission section, which selects the same columns.
X = df[df['Df']=='Train'].drop(columns=['Df','Survived'])
Y = df[df['Df']=='Train']['Survived']

In [None]:
# Hold out 25% of the labelled rows for evaluation. stratify=Y keeps the class
# proportions of the target identical in both parts, so the evaluation metrics
# are not distorted by an uneven split of the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.75, random_state=88, stratify=Y)
print(x_train.shape, y_train.shape,  x_test.shape,  y_test.shape)

In [None]:
y_test.value_counts()

### CLASS BALANCING
The classes of the target variable are unbalanced, which would harm the training of the models and their predictions, so I will balance them using the oversampling technique (SMOTE).

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE generates synthetic samples at random; fixing random_state makes the
# resampled training set (and every metric computed from it) reproducible.
over_sampler = SMOTE(k_neighbors=2, random_state=88)

In [None]:
x_train, y_train = over_sampler.fit_resample(x_train, y_train)

In [None]:
y_train.value_counts()

### MODEL 1: LOGISTIC REGRESSION

In [None]:
# tuned params
tuned_params_v1 = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
				   'penalty': ['l1', 'l2']}

In [None]:
# The parameter grid contains both 'l1' and 'l2' penalties. The default solver
# (lbfgs) supports only 'l2', so every 'l1' candidate would fail to fit and be
# scored as NaN; liblinear supports both penalties.
model_v1 = GridSearchCV(LogisticRegression(solver='liblinear'),
                        tuned_params_v1,
                        scoring = 'roc_auc')

In [None]:
model_v1.fit(x_train, y_train)

In [None]:
model_v1.best_estimator_

In [None]:
# Read the selected regularisation strength directly from the search result.
# Parsing str(best_estimator_) is fragile: the repr omits parameters that equal
# their default and may list other parameters, so the split can raise or return
# text that is not a number.
best_v1 = model_v1.best_params_['C']
print(best_v1)

In [None]:
# Build the final model with every hyperparameter of the estimator selected by
# the grid search (penalty and solver as well as C), not only C.
# The getattr fallback keeps this cell re-runnable once model_v1 has already been
# replaced by the final LogisticRegression.
tuned_v1 = getattr(model_v1, 'best_estimator_', model_v1)
model_v1 = LogisticRegression(**tuned_v1.get_params())

In [None]:
model_v1.fit(x_train, y_train)

In [None]:
y_pred_v1 = model_v1.predict(x_test)

In [None]:
y_pred_proba_v1 = model_v1.predict_proba(x_test)[:,1]

In [None]:
# ROC AUC
# The score is computed from the predicted probabilities of the positive class.
# Passing the thresholded 0/1 predictions would reduce the ROC curve to a single
# operating point and understate the model's ranking quality.
roc_auc_v1 = roc_auc_score(y_test, y_pred_proba_v1)
print(roc_auc_v1)

In [None]:
# ROC CURVE
# Built from the predicted probabilities, like the curves of the other models;
# hard 0/1 predictions would give only one threshold between the end points.
fpr_v1, tpr_v1, thresholds = roc_curve(y_test, y_pred_proba_v1)
print(fpr_v1, tpr_v1, thresholds)

In [None]:
# AUC
auc_v1 = auc(fpr_v1, tpr_v1)
print(auc_v1)

In [None]:
# ACCURACY
accuracy_v1 = accuracy_score(y_test, y_pred_v1)
print(accuracy_v1)

In [None]:
# Order the features by the absolute value of their logistic-regression
# coefficient, largest first, and print them as (rank, column) pairs.
indexes = np.argsort(-abs(model_v1.coef_[0,:]))
ranked_columns = X.columns[indexes]
separator = 25*'='

print('MOST FEATURE IMPORTANCE')
print(separator)
for rank, column in enumerate(ranked_columns, start=1):
    print((rank, column))
print(separator)

In [None]:
# Show the true class, predicted class and survival probability of one
# held-out passenger, addressed by position in the test split.
Passenger = 180
divider = 42*'='

report_lines = [
    '\nEVALUATION OF MODEL PREDICTIONS - MODEL_V1',
    divider,
    'Legend:\n0 - Not survived\n1 - Survivied ',
    divider,
    f'Chosen passenger: {Passenger}',
    f'Real class: {y_test.iloc[Passenger]}',
    f'Predict class {y_pred_v1[Passenger]}',
    f'Probability of surviving: {y_pred_proba_v1[Passenger] * 100:.2f}%',
    divider,
]
print('\n'.join(report_lines))

In [None]:
dict_v1 = {'Model':'Modelo_v1',
		   'Algorithm': 'LogisticRegression',
		   'ROC_AUC': roc_auc_v1,
		   'AUC': auc_v1,
		   'Accuracy': accuracy_v1}
df_resuls = pd.DataFrame([dict_v1])
df_resuls

### MODEL 2: RANDOM FOREST

In [None]:
tuned_params_v2 = {'n_estimators': [100,200,300,400,500,600],
				   'min_samples_split': [2,5,10],
				   'min_samples_leaf': [1,2,4]}

In [None]:
# Both the sampling of parameter candidates and the forest itself are random;
# seeding them makes the selected hyperparameters reproducible between runs.
model_v2 = RandomizedSearchCV(RandomForestClassifier(random_state=88),
                              tuned_params_v2,
                              n_iter = 15,
                              scoring = 'roc_auc',
                              n_jobs = -1,
                              random_state = 88)

In [None]:
model_v2.fit(x_train,  y_train)

In [None]:
model_v2.best_estimator_

In [None]:
# Take the selected number of trees from the search result. Parsing the last
# "name=value" pair of str(best_estimator_) is unreliable: the repr only shows
# parameters that differ from their defaults, so the last pair is not
# necessarily n_estimators.
best_v2 = model_v2.best_params_['n_estimators']
print(best_v2)

In [None]:
# Build the final forest with every hyperparameter of the estimator selected by
# the random search (min_samples_split and min_samples_leaf as well as
# n_estimators), not only the number of trees.
# The getattr fallback keeps this cell re-runnable once model_v2 has already been
# replaced by the final RandomForestClassifier.
tuned_v2 = getattr(model_v2, 'best_estimator_', model_v2)
model_v2 = RandomForestClassifier(**tuned_v2.get_params())

In [None]:
model_v2.fit(x_train, y_train)

In [None]:
y_pred_v2 = model_v2.predict(x_test)
y_pred_proba_v2 = model_v2.predict_proba(x_test)[:,1]

In [None]:
# ROC AUC from the predicted probabilities of the positive class; hard 0/1
# predictions would collapse the curve to a single operating point.
roc_auc_v2 = roc_auc_score(y_test, y_pred_proba_v2)
print(roc_auc_v2)

In [None]:
fpr_v2, tpr_v2, thresholds = roc_curve(y_test, y_pred_proba_v2)
auc_v2 = auc(fpr_v2, tpr_v2)
print(auc_v2)

In [None]:
accuracy_v2 = accuracy_score(y_test, y_pred_v2)
print(accuracy_v2)

In [None]:
# Order the features by the random forest's importance score, largest first,
# and print them as (rank, column) pairs.
indexes = np.argsort(-abs(model_v2.feature_importances_))
ranked_columns = X.columns[indexes]
separator = 25*'='

print('MOST FEATURE IMPORTANCE')
print(separator)
for rank, column in enumerate(ranked_columns, start=1):
    print((rank, column))
print(separator)

In [None]:
# Show the true class, predicted class and survival probability of one
# held-out passenger, addressed by position in the test split.
Passenger = 180
divider = 42*'='

report_lines = [
    '\nEVALUATION OF MODEL PREDICTIONS - MODEL_V2',
    divider,
    'Legend:\n0 - Not survived\n1 - Survivied ',
    divider,
    f'Chosen passenger: {Passenger}',
    f'Real class: {y_test.iloc[Passenger]}',
    f'Predict class {y_pred_v2[Passenger]}',
    f'Probability of surviving: {y_pred_proba_v2[Passenger] * 100:.2f}%',
    divider,
]
print('\n'.join(report_lines))

In [None]:
dict_v2 = {'Model':'Modelo_v2',
		   'Algorithm': 'RandomForestClassifier',
		   'ROC_AUC': roc_auc_v2,
		   'AUC': auc_v2,
		   'Accuracy': accuracy_v2}
df_resuls = pd.DataFrame([dict_v1,dict_v2])
df_resuls

### MODEL 3: KNN

In [None]:
neighbor = list(range(1,22,2))
neighbor

In [None]:
# Mean 5-fold cross-validated accuracy for every candidate number of neighbours,
# in the same order as `neighbor`.
cv_score = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train, y_train,
                    cv=5, scoring='accuracy').mean()
    for k in neighbor
]
cv_score

In [None]:
optimal_k = neighbor[cv_score.index(max(cv_score))]
print(f'The ideal k neighbor is: {optimal_k}')

In [None]:
model_v3 = KNeighborsClassifier(n_neighbors = optimal_k)

In [None]:
model_v3.fit(x_train, y_train)

In [None]:
y_pred_v3 = model_v3.predict(x_test)
y_pred_proba_v3 = model_v3.predict_proba(x_test)[:,1]

In [None]:
# ROC AUC from the predicted probabilities of the positive class; hard 0/1
# predictions would collapse the curve to a single operating point.
roc_auc_v3 = roc_auc_score(y_test, y_pred_proba_v3)
print(roc_auc_v3)

In [None]:
fpr_v3, tpr_v3, thresholds = roc_curve(y_test, y_pred_proba_v3)

In [None]:
auc_v3 = auc(fpr_v3, tpr_v3)
print(auc_v3)

In [None]:
accuracy_v3 = accuracy_score(y_test, y_pred_v3)
print(accuracy_v3)

In [None]:
# Show the true class, predicted class and survival probability of one
# held-out passenger, addressed by position in the test split.
Passenger = 180
divider = 42*'='

report_lines = [
    '\nEVALUATION OF MODEL PREDICTIONS - MODEL_V3',
    divider,
    'Legend:\n0 - Not survived\n1 - Survivied ',
    divider,
    f'Chosen passenger: {Passenger}',
    f'Real class: {y_test.iloc[Passenger]}',
    f'Predict class {y_pred_v3[Passenger]}',
    f'Probability of surviving: {y_pred_proba_v3[Passenger] * 100:.2f}%',
    divider,
]
print('\n'.join(report_lines))

In [None]:
dict_v3 = {'Model':'Modelo_v3',
		   'Algorithm': 'KNeighborsClassifier',
		   'ROC_AUC': roc_auc_v3,
		   'AUC': auc_v3,
		   'Accuracy': accuracy_v3}
df_resuls = pd.DataFrame([dict_v1,dict_v2,dict_v3])
df_resuls

### MODEL 4: DECISION TREE

In [None]:
tuned_params_v4 = {'min_samples_split': [2,3,4,5,7],
				   'min_samples_leaf': [1,2,3,4,6],
				   'max_depth': [2,3,4,5,6,7]}

In [None]:
# Seed both the candidate sampling and the tree so that the selected
# hyperparameters are reproducible between runs.
model_v4 = RandomizedSearchCV(DecisionTreeClassifier(random_state=88),
                              tuned_params_v4,
                              n_iter=15,
                              scoring='roc_auc',
                              n_jobs=-1,
                              random_state=88)

In [None]:
model_v4.fit(x_train, y_train)

In [None]:
model_v4.best_estimator_

In [None]:
# Build the final tree from the hyperparameters selected by the random search
# instead of a hard-coded max_depth, which ignores whatever the search found.
# The getattr fallback keeps this cell re-runnable once model_v4 has already been
# replaced by the final DecisionTreeClassifier.
tuned_v4 = getattr(model_v4, 'best_estimator_', model_v4)
model_v4 = DecisionTreeClassifier(**tuned_v4.get_params())

In [None]:
model_v4.fit(x_train, y_train)

In [None]:
y_pred_v4 = model_v4.predict(x_test)
y_pred_proba_v4 = model_v4.predict_proba(x_test)[:,1]

In [None]:
# ROC AUC from the predicted probabilities of the positive class; hard 0/1
# predictions would collapse the curve to a single operating point.
roc_auc_v4 = roc_auc_score(y_test, y_pred_proba_v4)
print(roc_auc_v4)

In [None]:
fpr_v4, tpr_v4, thresholds = roc_curve(y_test, y_pred_proba_v4)

In [None]:
auc_v4 = auc(fpr_v4, tpr_v4)
print(auc_v4)

In [None]:
accuracy_v4 = accuracy_score(y_test, y_pred_v4)
print(accuracy_v4)

In [None]:
# Order the features by the decision tree's importance score, largest first,
# and print them as (rank, column) pairs.
indexes = np.argsort(-model_v4.feature_importances_)
ranked_columns = X.columns[indexes]
separator = 25*'='

print('MOST FEATURE IMPORTANCE')
print(separator)
for rank, column in enumerate(ranked_columns, start=1):
    print((rank, column))
print(separator)

In [None]:
# Show the true class, predicted class and survival probability of one
# held-out passenger, addressed by position in the test split.
Passenger = 180
divider = 42*'='

report_lines = [
    '\nEVALUATION OF MODEL PREDICTIONS - MODEL_V4',
    divider,
    'Legend:\n0 - Not survived\n1 - Survivied ',
    divider,
    f'Chosen passenger: {Passenger}',
    f'Real class: {y_test.iloc[Passenger]}',
    f'Predict class {y_pred_v4[Passenger]}',
    f'Probability of surviving: {y_pred_proba_v4[Passenger] * 100:.2f}%',
    divider,
]
print('\n'.join(report_lines))

In [None]:
dict_v4 = {'Model':'Modelo_v4',
		   'Algorithm': 'DecisionTreeClassifier',
		   'ROC_AUC': roc_auc_v4,
		   'AUC': auc_v4,
		   'Accuracy': accuracy_v4}
df_resuls = pd.DataFrame([dict_v1,dict_v2,dict_v3,dict_v4])
df_resuls

### MODEL 5: SVM

In [None]:
def svc_param_selection(X, Y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC.

    Parameters
    ----------
    X : feature matrix the search is fitted on.
    Y : target labels aligned with X.
    nfolds : number of cross-validation folds (passed to GridSearchCV as ``cv``).

    Returns
    -------
    dict
        The best parameter combination found, with keys 'C' and 'gamma'.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds)
    # Fit on the data that was passed in; the arguments used to be ignored in
    # favour of the notebook-level x_train / y_train.
    grid_search.fit(X, Y)
    return grid_search.best_params_

In [None]:
best_v5 = svc_param_selection(x_train, y_train, 5)
best_v5

In [None]:
# Turn the best-parameter dict into a one-row frame and read the chosen C from it.
best_v5 = pd.DataFrame([best_v5])
C = best_v5['C'].max()
C

In [None]:
# Read the chosen gamma from the one-row best-parameter frame.
G = best_v5['gamma'].max()
G

In [None]:
model_v5 = SVC(C=C, gamma=G, probability=True)

In [None]:
model_v5.fit(x_train, y_train)

In [None]:
y_pred_v5 = model_v5.predict(x_test)
y_pred_proba_v5 = model_v5.predict_proba(x_test)[:,1]

In [None]:
# ROC AUC from the predicted probabilities of the positive class; hard 0/1
# predictions would collapse the curve to a single operating point.
roc_auc_v5 = roc_auc_score(y_test, y_pred_proba_v5)
print(roc_auc_v5)

In [None]:
fpr_v5, tpr_v5, thresholds = roc_curve(y_test, y_pred_proba_v5)

In [None]:
auc_v5 = auc(fpr_v5, tpr_v5)
print(auc_v5)

In [None]:
accuracy_v5 = accuracy_score(y_test, y_pred_v5)
print(accuracy_v5)

In [None]:
# Show the true class, predicted class and survival probability of one
# held-out passenger, addressed by position in the test split.
Passenger = 180
divider = 42*'='

report_lines = [
    '\nEVALUATION OF MODEL PREDICTIONS - MODEL_V5',
    divider,
    'Legend:\n0 - Not survived\n1 - Survivied ',
    divider,
    f'Chosen passenger: {Passenger}',
    f'Real class: {y_test.iloc[Passenger]}',
    f'Predict class {y_pred_v5[Passenger]}',
    f'Probability of surviving: {y_pred_proba_v5[Passenger] * 100:.2f}%',
    divider,
]
print('\n'.join(report_lines))

In [None]:
# Result row for the SVM. 'AUC' must hold the SVM's own value (auc_v5); it used to
# store auc_v4, the decision tree's AUC, which corrupted the model comparison.
dict_v5 = {'Model':'Modelo_v5',
           'Algorithm': 'SupportVectorMachines',
           'ROC_AUC': roc_auc_v5,
           'AUC': auc_v5,
           'Accuracy': accuracy_v5}
df_resuls = pd.DataFrame([dict_v1,dict_v2,dict_v3,dict_v4,dict_v5])
df_resuls

### ROC CURVE MODELS

In [None]:
models = {
	'LogisticRegression': model_v1,
	'RandomForestClassifier': model_v2,
	'KNeighborsClassifier': model_v3,
	'DecisionTreeClassifier': model_v4,
	'SupportVectorMachines': model_v5}

In [None]:
plt.figure(figsize=(10, 8))

# Draw one ROC curve per model on the held-out split.
# The models are used as already fitted in their own sections. Refitting them
# here would be redundant and, for estimators with random components, would
# replace the models whose metrics are reported in df_resuls with different ones.
for model_name, model in models.items():
    y_probs = model.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_probs)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.4f})')
# diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate (FPR)', size=11)
plt.ylabel('True Positive Rate (TPR)', size=11)
plt.title('ROC CURVE OF THE MODELS', size=18, weight='bold')
plt.legend(loc='lower right', facecolor='white')
plt.text(x=0.83, y=-0.15, s='By Claudio Sturaro Martinez Junior', color='purple')
plt.tight_layout()

### BEST MODEL BY ACCURACY FOR THE COMPETITION KAGGLE
The Kaggle metric is accuracy, so I will select the best model based on accuracy; if more than one algorithm achieves the same accuracy, the tiebreaker is the AUC metric.

In [None]:
# Highest accuracy wins; AUC only breaks ties. Sorting on (Accuracy, AUC) does
# exactly that and always returns one row. Requiring a single row to hold both
# the maximum accuracy and the maximum AUC returns nothing whenever those maxima
# belong to different models.
best_model = df_resuls.sort_values(['Accuracy', 'AUC'], ascending=False).head(1)
best_model

In [None]:
# output to kaggle competition
x_out_test = df[df['Df']=='Test'][['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']]

In [None]:
x_out_test.sample(5, random_state=88)

In [None]:
# Predict with the model named in `best_model` instead of a hard-coded one; the
# 'Algorithm' labels in the results table are the keys of the `models` dict.
# If the selection is empty, fall back to the random forest.
chosen_name = best_model['Algorithm'].iloc[0] if len(best_model) else 'RandomForestClassifier'
y_pred_out = models[chosen_name].predict(x_out_test)
y_pred_out[:50]

In [None]:
# Build the submission table. Predictions are cast to int so the file contains
# 0/1 labels rather than whatever numeric dtype the model returned.
submission = pd.DataFrame({'PassengerId': x_out_test['PassengerId'],
                           'Survived': y_pred_out.astype(int)})
# to_csv() without a path only returns the CSV text and writes nothing; give it a
# file name and drop the index so that only the two columns are saved.
submission.to_csv('submission.csv', index=False)
submission