In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv('Titanic-Dataset.csv')

In [3]:
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
# Check if each column has missing values
null_count_per_column = data.isnull().sum()
print(null_count_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
# Use the KNNImputer method to fill in the empty values of the Age column
imputer = KNNImputer(n_neighbors=5)
feature = data[['Age']]
feature_filled = imputer.fit_transform(feature)
data['Age'] = feature_filled

In [6]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Delete rows with null values in Embarked column
data.dropna(subset=['Embarked'],inplace=True)

In [10]:
# Check whether the Pclass column is highly correlated with the Fare column

# Use get_dummies to convert categorical data into numerical data
df_encoded = pd.get_dummies(data['Pclass'],prefix='Pclass')
df_combined = pd.concat([df_encoded, data['Fare']], axis=1)

# Calculate the correlation matrix
correlation_matrix = df_combined.corr()
print(correlation_matrix)

          Pclass_1  Pclass_2  Pclass_3      Fare
Pclass_1  1.000000 -0.287653 -0.625395  0.590576
Pclass_2 -0.287653  1.000000 -0.567432 -0.117609
Pclass_3 -0.625395 -0.567432  1.000000 -0.411932
Fare      0.590576 -0.117609 -0.411932  1.000000


In [11]:
# There is a moderate correlation, which is not suitable for use in the Naïve Bayes Classifier.
# It is decided not to use the Pclass column, but to categorize the Fare column and use it.

# Bin the Fare column using quartiles
data['Fare'], bins1 = pd.qcut(data['Fare'], q=[0, .25, .5, .75, 1.], labels=['Low', 'Medium', 'High', 'Very High'], retbins=True)

In [12]:
bins1

array([  0.    ,   7.8958,  14.4542,  31.    , 512.3292])

In [13]:
# Age is divided into four categories: 'Child', 'Young Adults', 'Middle Age', 'Old Age'
bins2 = [0, 15, 50, 70, np.inf]
labels = ['Child', 'Young Adults', 'Middle Age', 'Old Age']
data['Age'] = pd.cut(data['Age'], bins=bins2, labels=labels)

In [14]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,Young Adults,1,0,A/5 21171,Low,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,Young Adults,1,0,PC 17599,Very High,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,Young Adults,0,0,STON/O2. 3101282,Medium,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,Young Adults,1,0,113803,Very High,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,Young Adults,0,0,373450,Medium,,S


In [15]:
# Transform all numeric columns that will be used
data = data.astype({'Survived': 'category', 'SibSp': 'category', 'Parch': 'category'})

In [16]:
predictors = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
outcome = 'Survived'

In [17]:
X = pd.get_dummies(data[predictors])
y = data['Survived']

In [18]:
# Split the data into training (80%) and testing (20%)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# run naive Bayes
data_nb = MultinomialNB(alpha=0.1)
data_nb.fit(X_train, y_train)

In [20]:
y_valid_pred = data_nb.predict(X_valid)

In [21]:
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_valid, y_valid_pred))

Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       109
           1       0.73      0.74      0.73        69

    accuracy                           0.79       178
   macro avg       0.78      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178



The prediction performance of 'Not Survived' is high, with precision, recall and F1 score of 0.83, indicating that the model is very accurate in predicting 'Not Survived'.
The prediction performance of 'Survived' is relatively low, but still has relatively high precision, recall and F1 score (about 0.73-0.74).
The overall accuracy is 0.79, indicating that the model can correctly classify samples in most cases.
The macro average and weighted average indicators show that the model performs well overall, especially the weighted average indicator shows that the model handles the problem of class imbalance well.