# Importing required libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Importing data from csv

In [6]:
# Read the training and test tables from CSV files located next to the notebook.
train_dataset, test_dataset = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

# Splitting independent and dependent variables

In [7]:
# Features and target are picked by column position, so this cell depends on
# the column order of train.csv. Position 1 is used as the target.
train_features = train_dataset.iloc[:, [2, 4, 5, 6, 7, 9, 11]]
train_target = train_dataset.iloc[:, 1]
X_train, y_train = train_features.values, train_target.values

# Converting test data into numpy array

In [4]:
# Each position is one less than the matching training position — presumably
# because test.csv lacks the target column (TODO confirm against the file).
test_feature_positions = [1, 3, 4, 5, 6, 8, 10]
X_test = test_dataset.iloc[:, test_feature_positions].values

# Preprocessing data

## Converting Gender from categorical to binary variable

In [3]:
# Learn the label -> integer mapping from the training column only, then apply
# that same mapping to both splits so train and test share one encoding.
label_encoder_gender = LabelEncoder().fit(X_train[:, 1])
X_train[:, 1] = label_encoder_gender.transform(X_train[:, 1])
X_test[:, 1] = label_encoder_gender.transform(X_test[:, 1])

NameError: name 'X_train' is not defined

## Filling missing values of Embarked with the mode

In [6]:
# Most frequent Embarked label in the training data. value_counts() ignores
# missing values and idxmax() returns the label with the highest count.
# (Calling max() on the counts dict would compare the *labels* and return the
# alphabetically greatest one, regardless of how often it occurs.)
most_frequent_embarked = train_dataset.Embarked.value_counts().idxmax()

# Any entry in the last feature column that is not one of these labels
# (e.g. a missing value) is replaced by the training-set mode.
known_embarked_labels = ('S', 'Q', 'C')

# Same fill rule for the training data and then the testing data; both arrays
# are modified in place.
for features in (X_train, X_test):
    filling_indices = [
        row for row in range(len(features))
        if features[row, -1] not in known_embarked_labels
    ]
    features[filling_indices, -1] = most_frequent_embarked

## Filling missing age values with mean age

In [7]:
# Column 2 is kept two-dimensional ([2], not 2) because the imputer expects a
# 2-D input. The mean is learned from the training split only and reused for
# the test split.
age_column = [2]
imputer_age = SimpleImputer(strategy='mean').fit(X_train[:, age_column])
X_train[:, age_column] = imputer_age.transform(X_train[:, age_column])
X_test[:, age_column] = imputer_age.transform(X_test[:, age_column])

## Filling missing fare values with mean

In [8]:
# Column 5 is kept two-dimensional ([5], not 5) because the imputer expects a
# 2-D input. The mean is learned from the training split only and reused for
# the test split.
fare_column = [5]
imputer_fare = SimpleImputer(strategy='mean').fit(X_train[:, fare_column])
X_train[:, fare_column] = imputer_fare.transform(X_train[:, fare_column])
X_test[:, fare_column] = imputer_fare.transform(X_test[:, fare_column])

## OneHot encoding passenger class

In [9]:
# One-hot encode column 0 and pass every other column through untouched.
# The encoded columns come first in the output; the passthrough columns
# follow on the right in their original order.
pclass_one_hot = OneHotEncoder(categories='auto')
ct_pclass = ColumnTransformer(
    transformers=[('one_hot_encoder', pclass_one_hot, [0])],
    remainder='passthrough',
)

# Categories are learned from the training split and reused for the test split.
X_train = ct_pclass.fit_transform(X_train)
X_test = ct_pclass.transform(X_test)

## Skipping dummy variable trap

In [10]:
# Drop the first column (the first one-hot indicator) from both splits.
# NOTE: this cell is not idempotent — re-running it removes another column.
X_train, X_test = X_train[:, 1:], X_test[:, 1:]

## Label-encoding embarked location as integer codes

In [11]:
# Replace the labels in the last column with integer codes. The mapping is
# learned from the training split and then applied unchanged to the test split.
embarked_encoder = LabelEncoder().fit(X_train[:, -1])
X_train[:, -1] = embarked_encoder.transform(X_train[:, -1])
X_test[:, -1] = embarked_encoder.transform(X_test[:, -1])

## Applying z-score normalization to age

In [12]:
# Age was column 2 of the original 7 feature columns, i.e. fifth from the end.
# The one-hot step only changes how many columns sit on the left; the
# passthrough columns keep their order on the right. Indexing from the end
# therefore still lands on age after encoding and the column drop, whereas
# positive index 5 would select a different feature.
age_column = [-5]

# Mean and standard deviation are learned from the training split only and
# reused for the test split.
sc_age = StandardScaler()
X_train[:, age_column] = sc_age.fit_transform(X_train[:, age_column])
X_test[:, age_column] = sc_age.transform(X_test[:, age_column])

## Applying z-score normalization to fare

In [13]:
# Fare was column 5 of the original 7 feature columns, i.e. second from the
# end. The last column (-1) holds the label-encoded embarked location, so
# scaling [-1] would standardize embarked codes instead of fare.
fare_column = [-2]

# Mean and standard deviation are learned from the training split only and
# reused for the test split.
sc_fare = StandardScaler()
X_train[:, fare_column] = sc_fare.fit_transform(X_train[:, fare_column])
X_test[:, fare_column] = sc_fare.transform(X_test[:, fare_column])

## Applying PCA for feature reduction

In [14]:
# Project both splits onto principal components fitted on the training split.
# NOTE(review): if the feature matrix has only eight columns at this point,
# keeping eight components rotates the data without reducing it — confirm.
n_principal_components = 8
pca = PCA(n_components=n_principal_components)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Applying NAIVE BAYES CLASSIFIER to predict survival

In [15]:
# Train on every row from index 10 onward; the first ten rows are left out of
# the fit. The trailing fit(...) expression is kept as the cell's last
# statement so the fitted estimator is still displayed as the cell output.
bayes_classifier = GaussianNB()
training_rows = slice(10, None)
bayes_classifier.fit(X_train[training_rows], y_train[training_rows])

GaussianNB()

# Model's predictions

In [21]:
# Predict the first ten training rows and print the predictions followed by
# the true labels for a side-by-side look.
first_ten_rows = slice(None, 10)
bayes_predictions = bayes_classifier.predict(X_train[first_ten_rows])

for labels in (bayes_predictions, y_train[first_ten_rows]):
    print(labels)

[0 1 1 1 0 0 0 0 0 1]
[0 1 1 1 0 0 0 0 1 1]


In [17]:
# Disabled tally of how many predictions fall in each class.
# NOTE(review): the counts recorded below this cell sum to 418, but
# bayes_predictions as computed in this notebook holds only 10 labels, so that
# output is presumably left over from an earlier run on a larger set — confirm
# before relying on it.
#print("Not survived : {}  ".format(sum(bayes_predictions==0)),
#      "Survived : {}".format(sum(bayes_predictions==1)))

Not survived : 249   Survived : 169
