# Titanic Survival Predictions (Kaggle)

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Importing Dataset

In [None]:
training_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")
training_set.name = 'Training Set'
test_set.name = 'Test Set'
training_set.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
45,46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S
475,476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52.0,A14,S
527,528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S


In [None]:
!pip install fasteda



## Dropping Irrelevant Columns

In [None]:
training_set = training_set.drop("PassengerId", axis='columns')
training_set = training_set.drop("Ticket", axis='columns')
test_set = test_set.drop("Ticket", axis='columns')

## Checking for missing values

In [None]:
def check_missing(df):
    for col in df.columns.tolist():
        print('{} column missing values: {}'.format(col, df[col].isnull().sum()))

for df in [training_set, test_set]:
    check_missing(df)
    print('\n')


Survived column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 177
SibSp column missing values: 0
Parch column missing values: 0
Fare column missing values: 0
Cabin column missing values: 687
Embarked column missing values: 2


PassengerId column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 86
SibSp column missing values: 0
Parch column missing values: 0
Fare column missing values: 1
Cabin column missing values: 327
Embarked column missing values: 0




In [None]:
training_set[training_set['Embarked'].isnull()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,B28,
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,B28,


## Dealing with Missing Values and Encoding

### Dealing with missing Embarked values

In [None]:
training_set['Embarked'].fillna('C', inplace=True)
training_set.sample(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
645,1,1,"Harper, Mr. Henry Sleeper",male,48.0,1,0,76.7292,D33,C
708,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,151.55,,S
652,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21.0,0,0,8.4333,,S


### Encoding Embarked Values

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('Embarked_Transformer', OneHotEncoder(), ['Embarked'])], remainder='passthrough')
training_set = pd.DataFrame(ct.fit_transform(training_set), columns=[0, 1, 2, 'Survived', 'PClass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin'])
training_set.sample(2)
test_set['Survived'] = 0
test_set = pd.DataFrame(ct.transform(test_set), columns=[0, 1, 2, 'Survived', 'PClass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin'])
test_set = test_set.drop('Survived', axis='columns')


### Dealing with Cabin Missing Values and Keeping Deck Letter Only

In [None]:
training_set['Cabin'].fillna('Z', inplace=True)
test_set['Cabin'].fillna('Z', inplace=True)

training_set['Cabin'] = [str(i)[0] for i in training_set['Cabin']]
test_set['Cabin'] = [str(i)[0] for i in test_set['Cabin']]
training_set.sample(10)

Unnamed: 0,0,1,2,Survived,PClass,Name,Sex,Age,SibSp,Parch,Fare,Cabin
706,0.0,0.0,1.0,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,13.5,Z
617,0.0,0.0,1.0,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26.0,1,0,16.1,Z
233,0.0,0.0,1.0,1,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,31.3875,Z
188,0.0,1.0,0.0,0,3,"Bourke, Mr. John",male,40.0,1,1,15.5,Z
257,0.0,0.0,1.0,1,1,"Cherry, Miss. Gladys",female,30.0,0,0,86.5,B
376,0.0,0.0,1.0,1,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,7.25,Z
632,1.0,0.0,0.0,1,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,30.5,B
801,0.0,0.0,1.0,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,26.25,Z
425,0.0,0.0,1.0,0,3,"Wiseman, Mr. Phillippe",male,,0,0,7.25,Z
0,0.0,0.0,1.0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,Z


### Encoding Cabin Column

In [None]:
from sklearn.preprocessing import LabelEncoder
le_cabin = LabelEncoder()
training_set['Cabin'] = le_cabin.fit_transform(training_set['Cabin'])
test_set['Cabin'] = le_cabin.transform(test_set['Cabin'])

### Keeping Titles Only in Name Column

In [None]:
training_set['Initial']=0
for i in training_set:
    training_set['Initial']=training_set.Name.str.extract('([A-Za-z]+)\.')

training_set['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)

test_set['Initial']=0
for i in training_set:
    test_set['Initial']=training_set.Name.str.extract('([A-Za-z]+)\.')

test_set['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)

training_set['Initial'].value_counts()

training_set = training_set.drop('Name', axis='columns')
test_set = test_set.drop('Name', axis='columns')

training_set.sample(10)

Unnamed: 0,0,1,2,Survived,PClass,Sex,Age,SibSp,Parch,Fare,Cabin,Initial
95,0.0,0.0,1.0,0,3,male,,0,0,8.05,8,Mr
479,0.0,0.0,1.0,1,3,female,2.0,0,1,12.2875,8,Miss
475,0.0,0.0,1.0,0,1,male,,0,0,52.0,0,Mr
505,1.0,0.0,0.0,0,1,male,18.0,1,0,108.9,2,Mr
681,1.0,0.0,0.0,1,1,male,27.0,0,0,76.7292,3,Mr
886,0.0,0.0,1.0,0,2,male,27.0,0,0,13.0,8,Other
596,0.0,0.0,1.0,1,2,female,,0,0,33.0,8,Miss
651,0.0,0.0,1.0,1,2,female,18.0,0,1,23.0,8,Miss
534,0.0,0.0,1.0,0,3,female,30.0,0,0,8.6625,8,Miss
335,0.0,0.0,1.0,0,3,male,,0,0,7.8958,8,Mr


### Label Encoding Initial and Sex Columns

In [None]:
le_initial = LabelEncoder()
training_set['Initial'] = le_initial.fit_transform(training_set['Initial'])
test_set['Initial'] = le_initial.transform(test_set['Initial'])
le_sex = LabelEncoder()
training_set['Sex'] = le_sex.fit_transform(training_set['Sex'])
test_set['Sex'] = le_sex.transform(test_set['Sex'])

In [None]:
test_set.sample(2)

Unnamed: 0,0,1,2,PClass,Sex,Age,SibSp,Parch,Fare,Cabin,Initial
117,0.0,0.0,1.0,3,0,1.0,1,1,16.7,6,2
410,0.0,1.0,0.0,3,0,,0,0,7.75,8,2


### Dealing with Fare missing values

In [None]:
from sklearn.impute import KNNImputer
imputer_test = KNNImputer(missing_values=np.nan, n_neighbors=2, weights='uniform')
test_set[['Fare']] = imputer_test.fit_transform(test_set[['Fare']])
training_set.sample(3)

Unnamed: 0,0,1,2,Survived,PClass,Sex,Age,SibSp,Parch,Fare,Cabin,Initial
657,0.0,1.0,0.0,0,3,0,32.0,1,1,15.5,8,3
684,0.0,0.0,1.0,0,2,1,60.0,1,1,39.0,8,2
202,0.0,0.0,1.0,0,3,1,34.0,0,0,6.4958,8,2


### Dealing with Age missing values

In [None]:
training_set.groupby('Initial')['Age'].mean()

Initial
0     4.574167
1    21.860000
2    32.739609
3    35.981818
4    45.888889
Name: Age, dtype: float64

### Setting Age missing value as mean of initial's group

In [None]:
training_set.loc[(training_set['Age'].isnull()) & (training_set['Initial']==0), 'Age'] = 5
training_set.loc[(training_set['Age'].isnull()) & (training_set['Initial']==1), 'Age'] = 22
training_set.loc[(training_set['Age'].isnull()) & (training_set['Initial']==2), 'Age'] = 33
training_set.loc[(training_set['Age'].isnull()) & (training_set['Initial']==3), 'Age'] = 36
training_set.loc[(training_set['Age'].isnull()) & (training_set['Initial']==4), 'Age'] = 46

test_set.loc[(test_set['Age'].isnull()) & (test_set['Initial']==0), 'Age'] = 5
test_set.loc[(test_set['Age'].isnull()) & (test_set['Initial']==1), 'Age'] = 22
test_set.loc[(test_set['Age'].isnull()) & (test_set['Initial']==2), 'Age'] = 33
test_set.loc[(test_set['Age'].isnull()) & (test_set['Initial']==3), 'Age'] = 36
test_set.loc[(test_set['Age'].isnull()) & (test_set['Initial']==4), 'Age'] = 46

In [None]:
training_set.sample(3)

Unnamed: 0,0,1,2,Survived,PClass,Sex,Age,SibSp,Parch,Fare,Cabin,Initial
837,0.0,0.0,1.0,0,3,1,33.0,0,0,8.05,8,2
416,0.0,0.0,1.0,1,2,0,34.0,1,1,32.5,8,3
471,0.0,0.0,1.0,0,3,1,38.0,0,0,8.6625,8,2


## Feature Scaling & Train-Test Split



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X = training_set.drop('Survived', axis='columns')
X = X.drop('Cabin', axis='columns')
test_set = test_set.drop('Cabin', axis='columns')
y = training_set['Survived']
X.columns = X.columns.astype(str)
test_set.columns = test_set.columns.astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)
X_test = sc.transform(X_test)
test_set = sc.transform(test_set)

## Building ANN and Confusion Matrix

In [None]:
!pip install keras-tuner



In [None]:
import tensorflow as tf
import keras_tuner
from sklearn.metrics import confusion_matrix, accuracy_score

def build_model(hp):
  ann = tf.keras.models.Sequential()

  ann.add(tf.keras.layers.Dense(hp.Int('units_1', min_value=16, max_value=256, step=4), input_shape=(9,), activation='relu', kernel_initializer='lecun_uniform'))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_1', min_value=0.0, max_value=0.9, step=0.1)))
  ann.add(tf.keras.layers.Dense(hp.Int('units_2', min_value=16, max_value=256, step=4), activation='relu', kernel_initializer='lecun_uniform'))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_2', min_value=0.0, max_value=0.9, step=0.1)))
  ann.add(tf.keras.layers.Dense(hp.Int('units_3', min_value=16, max_value=256, step=4), activation='relu', kernel_initializer='lecun_uniform'))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_3', min_value=0.0, max_value=0.9, step=0.1)))
  ann.add(tf.keras.layers.Dense(hp.Int('units_4', min_value=0, max_value=256, step=4)))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_4', min_value=0.0, max_value=0.9, step=0.1)))

  ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
  ann.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Float('learning_rate', min_value=0.001, max_value=0.05, step=0.001)), loss='binary_crossentropy', metrics=['accuracy'])
  return ann

X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)
X_val = np.asarray(X_val).astype(np.float32)
y_val = np.asarray(y_val).astype(np.float32)

tuner = keras_tuner.BayesianOptimization(build_model,
                                 objective=['accuracy','val_accuracy'],
                                 max_trials=250)
tuner.search(X_train, y_train, validation_data=(X_val, y_val))
best_models = tuner.get_best_models(num_models=5)
tuner.results_summary()


Trial 250 Complete [00h 00m 04s]
multi_objective: -1.4738774299621582

Best multi_objective So Far: -1.661314606666565
Total elapsed time: 00h 17m 38s
Results summary
Results in my_dir/tuning_model
Showing 10 best trials
MultiObjective(name="multi_objective", direction="min"): [Objective(name="accuracy", direction="max"), Objective(name="val_accuracy", direction="max")]

Trial 060 summary
Hyperparameters:
units_1: 256
dropout_rate_1: 0.0
units_2: 256
dropout_rate_2: 0.0
units_3: 160
dropout_rate_3: 0.0
units_4: 168
dropout_rate_4: 0.0
learning_rate: 0.001
Score: -1.661314606666565

Trial 067 summary
Hyperparameters:
units_1: 256
dropout_rate_1: 0.0
units_2: 256
dropout_rate_2: 0.0
units_3: 88
dropout_rate_3: 0.0
units_4: 256
dropout_rate_4: 0.0
learning_rate: 0.001
Score: -1.6612566709518433

Trial 069 summary
Hyperparameters:
units_1: 184
dropout_rate_1: 0.0
units_2: 256
dropout_rate_2: 0.0
units_3: 152
dropout_rate_3: 0.0
units_4: 256
dropout_rate_4: 0.0
learning_rate: 0.001
Score: -

## Running Tuned ANN

In [None]:
for j in range(0,5):
  best_models[j].fit(X_train, y_train, epochs=30, batch_size=32)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

## Confusion Matrix & Accuracy Score for ANN

In [None]:
for j in range(0,5):
  y_init_pred = best_models[j].predict(X_test) > 0.5
  y_init_pred = np.array([int(i) for i in y_init_pred])
  cfm = confusion_matrix(y_test, y_init_pred)
  acc = (accuracy_score(y_test, y_init_pred))
  print(cfm)
  print(f"Accuracy score for model {j}", acc)

[[72 10]
 [17 35]]
Accuracy score for model 0 0.7985074626865671
[[78  4]
 [18 34]]
Accuracy score for model 1 0.835820895522388
[[72 10]
 [17 35]]
Accuracy score for model 2 0.7985074626865671
[[76  6]
 [17 35]]
Accuracy score for model 3 0.8283582089552238
[[75  7]
 [16 36]]
Accuracy score for model 4 0.8283582089552238


## Applying 10-Fold CV for ANN

In [None]:
!pip install tensorflow scikeras scikit-learn

In [None]:
from sklearn.model_selection import cross_val_score
from scikeras.wrappers import KerasClassifier
keras_clf = KerasClassifier(best_models[0])
cvs = cross_val_score(keras_clf, X=X, y=y, scoring='accuracy', cv=10, n_jobs=-1, error_score='raise')
print("Accuracy: {:.2f}%".format(cvs.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs.std()*100))

ValueError: ignored

## Applying RandomizedSearchCV to tune hyperparameters of ANN

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# parameters = {
#     'epochs': [20,40,60,80,100,120,140,160],
#     'batch_size': [16, 32, 48, 64, 128],
#     'optimizer__learning_rate': [0.001, 0.01, 0.1],
#     'optimizer': ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam'],
# }

# gscv = RandomizedSearchCV(keras_clf, param_distributions=parameters, scoring='accuracy', n_jobs=-1, cv=10, n_iter=50, error_score='raise')
# gscv.fit(X_train, y_train)
# best_parameters = gscv.best_params_
# best_score = gscv.best_score_
# print("Best Accuracy: {:.2f}%".format(best_score*100))
# print("Best Parameters: {}".format(best_parameters))

## Applying Gaussian NB Model

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

## Applying 10-fold CV for GNB

In [None]:
cvs_gnb = cross_val_score(gnb, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)
print("Accuracy: {:.2f}%".format(cvs_gnb.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs_gnb.std()*100))

Accuracy: 77.81%
Accuracy Standard Deviation: 3.39%


## Applying XGBoost model

In [None]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

## Applying 10-Fold CV for XGBoost

In [None]:
cvs_cb = cross_val_score(classifier, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)
print("Accuracy: {:.2f}%".format(cvs_cb.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs_cb.std()*100))

Accuracy: 81.32%
Accuracy Standard Deviation: 4.54%


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


## Applying CatBoost

In [None]:
from catboost import CatBoostClassifier
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV

catboost_clf = CatBoostClassifier()
catboost_hparams = {
    'n_estimators': [
        50, 100, 150, 200, 250, 300
    ],
    'depth': [
        4, 6, 8, 10, 12
    ],
    'learning_rate': stats.uniform(
        loc=0.001, scale=0.099
    ),
    'l2_leaf_reg': stats.uniform(
        loc=1, scale=9
    )
}

catboost_cv = RandomizedSearchCV(
    estimator=catboost_clf,
    param_distributions=catboost_hparams,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=10,
    verbose=0,
)

catboost_cv.fit(X=X, y=y)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


0:	learn: 0.6894563	total: 47.3ms	remaining: 4.68s
1:	learn: 0.6859420	total: 49ms	remaining: 2.4s
2:	learn: 0.6827966	total: 60.2ms	remaining: 1.95s
3:	learn: 0.6794376	total: 63.8ms	remaining: 1.53s
4:	learn: 0.6764414	total: 69ms	remaining: 1.31s
5:	learn: 0.6727997	total: 70.9ms	remaining: 1.11s
6:	learn: 0.6692590	total: 72.7ms	remaining: 966ms
7:	learn: 0.6659203	total: 78.9ms	remaining: 907ms
8:	learn: 0.6629546	total: 84.8ms	remaining: 858ms
9:	learn: 0.6595828	total: 88.7ms	remaining: 798ms
10:	learn: 0.6569336	total: 101ms	remaining: 814ms
11:	learn: 0.6537199	total: 102ms	remaining: 749ms
12:	learn: 0.6505416	total: 103ms	remaining: 692ms
13:	learn: 0.6481248	total: 114ms	remaining: 700ms
14:	learn: 0.6458176	total: 126ms	remaining: 712ms
15:	learn: 0.6426692	total: 128ms	remaining: 674ms
16:	learn: 0.6403553	total: 135ms	remaining: 660ms
17:	learn: 0.6370280	total: 138ms	remaining: 630ms
18:	learn: 0.6344620	total: 147ms	remaining: 627ms
19:	learn: 0.6315505	total: 150ms	re

In [None]:
from sklearn.model_selection import cross_val_score
cvs_cat = cross_val_score(catboost_cv, X=X, y=y, scoring='accuracy', cv=10, n_jobs=-1)

TerminatedWorkerError: ignored

In [None]:
print("Accuracy: {:.2f}%".format(cvs_cat.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs_cat.std()*100))

## Applying Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(criterion='entropy', max_depth=15, n_jobs=-1)
rfc.fit(X_train, y_train)

## Applying 10-Fold CV for RFC

In [None]:
cvs_rfc = cross_val_score(rfc, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)
print("Accuracy: {:.2f}%".format(cvs_rfc.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs_rfc.std()*100))

Accuracy: 81.75%
Accuracy Standard Deviation: 5.64%


## Implementing SVM Classification

In [None]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf', C=0.25, gamma=0.2)
svc.fit(X_train, y_train)


## Applying 10-Fold CV for SVC

In [None]:
cvs_svc = cross_val_score(svc, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)
print("Accuracy: {:.2f}%".format(cvs_svc.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs_svc.std()*100))

Accuracy: 83.43%
Accuracy Standard Deviation: 4.74%


## Applying Grid Search CV for Hyperparameter Tuning of SVC

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = [{
    'C': [0.25, 0.5, 0.75, 1],
    'kernel': ['linear'],
              },
              {
                'C': [0.25, 0.5, 0.75, 1],
                'kernel': ['rbf'],
                'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
              }]
gs = GridSearchCV(classifier, param_grid=parameters, scoring='accuracy', cv=10, n_jobs = -1)
gs.fit(X_train, y_train)
best_accuracy = gs.best_score_
best_params = gs.best_params_
print("Best Accuracy: {:.2f}%".format(best_accuracy*100))
print("Best Parameters: {}".format(best_params))

Best Accuracy: 83.84%
Best Parameters: {'C': 0.25, 'gamma': 0.2, 'kernel': 'rbf'}


## Applying KNN Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=6, weights='distance')
knc.fit(X_train, y_train)

## Applying 10-fold CV for KNN Classification

In [None]:
cvs_knc = cross_val_score(knc, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)
print("Accuracy: {:.2f}%".format(cvs_svc.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs_svc.std()*100))

Accuracy: 83.43%
Accuracy Standard Deviation: 4.74%


## Output

ANN Model is Chosen due to better accuracy average score and a lower standard deviation during CV. Now we can use ANN to predict test set

In [None]:
y_pred = best_models[3].predict(test_set) > 0.5
y_pred = [int(i) for i in y_pred]
print(y_pred)

[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 

In [None]:
test = pd.read_csv("test.csv")
passenid = test['PassengerId']
print(passenid)
print(y_pred)
output = pd.DataFrame({'PassengerId': passenid, 'Survived': y_pred})
output.to_csv('submission_without_fit2.csv', index=False)

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64
[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 