In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the Titanic passenger table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='titanic_naive.csv')
# Preview the first ten rows to check the columns and spot missing values.
df.head(n=10)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0
5,6,"Moran, Mr. James",3,male,,0,0,330877,8.4583,,Q,0
6,7,"McCarthy, Mr. Timothy J",1,male,54.0,0,0,17463,51.8625,E46,S,0
7,8,"Palsson, Master. Gosta Leonard",3,male,2.0,3,1,349909,21.075,,S,0
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",3,female,27.0,0,2,347742,11.1333,,S,1
9,10,"Nasser, Mrs. Nicholas (Adele Achem)",2,female,14.0,1,0,237736,30.0708,,C,1


In [3]:
# Remove columns that this notebook never uses as model features.
# Rebinding `df` to the returned frame (instead of inplace=True) avoids
# mutating an object that an earlier cell already displayed.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Embarked'])

In [4]:
# Drop 'Cabin' as well; the preview above shows it is empty for many rows.
# Rebinding `df` (instead of inplace=True) keeps the step free of hidden mutation.
df = df.drop(columns=['Cabin'])
# Display the remaining columns.
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,22.0,1,0,7.2500,0
1,1,female,38.0,1,0,71.2833,1
2,3,female,26.0,0,0,7.9250,1
3,1,female,35.0,1,0,53.1000,1
4,3,male,35.0,0,0,8.0500,0
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,0
887,1,female,19.0,0,0,30.0000,1
888,3,female,,1,2,23.4500,0
889,1,male,26.0,0,0,30.0000,1


In [5]:
# Separate the label column from the feature columns.
target = df['Survived']
inputs = df.drop(columns=['Survived'])

In [6]:
# Preview the first five feature rows.
inputs.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


In [7]:
# One-hot encode the categorical 'Sex' column: one indicator column per category.
dummies = pd.get_dummies(inputs['Sex'])
# Preview the first three rows of the indicator columns.
dummies.head(n=3)

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False


In [8]:
# Append the indicator columns to the right of the feature frame (aligned on the row index).
inputs = pd.concat(objs=[inputs, dummies], axis=1)

In [9]:
# Drop the original text column and one of the two indicator columns:
# 'male' is the complement of 'female', so keeping both would be redundant.
# Rebinding `inputs` (instead of inplace=True) avoids hidden in-place mutation.
inputs = inputs.drop(columns=['Sex', 'male'])

In [10]:
# Display the feature frame after encoding ('female' is still boolean at this point).
inputs

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female
0,3,22.0,1,0,7.2500,False
1,1,38.0,1,0,71.2833,True
2,3,26.0,0,0,7.9250,True
3,1,35.0,1,0,53.1000,True
4,3,35.0,0,0,8.0500,False
...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,False
887,1,19.0,0,0,30.0000,True
888,3,,1,2,23.4500,True
889,1,26.0,0,0,30.0000,False


In [11]:
# Convert the boolean 'female' indicator into 0/1 integers.
bool_to_int = {False: 0, True: 1}
inputs['female'] = inputs['female'].map(bool_to_int)
# Display the frame to confirm the conversion.
inputs

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female
0,3,22.0,1,0,7.2500,0
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.9250,1
3,1,35.0,1,0,53.1000,1
4,3,35.0,0,0,8.0500,0
...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0
887,1,19.0,0,0,30.0000,1
888,3,,1,2,23.4500,1
889,1,26.0,0,0,30.0000,0


In [12]:
# List the feature columns that contain at least one missing value.
column_has_missing = inputs.isna().any()
inputs.columns[column_has_missing]

Index(['Age'], dtype='object')

In [13]:
# Fill missing ages with the mean age.
# NOTE(review): the mean is computed over all rows, before the train/test split
# performed later in the notebook, so test rows influence the imputed value.
mean_age = inputs['Age'].mean()
inputs['Age'] = inputs['Age'].fillna(mean_age)
# Preview the first five rows after imputation.
inputs.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female
0,3,22.0,1,0,7.25,0
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.925,1
3,1,35.0,1,0,53.1,1
4,3,35.0,0,0,8.05,0


In [14]:
# Hold out half of the rows for evaluation.
# random_state pins the shuffle so the split, and every score derived from it,
# is reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.5,
    random_state=42,
)

In [15]:
# Gaussian Naive Bayes classifier with default settings.
# GaussianNB is imported in the first cell together with the other dependencies.
model = GaussianNB()

In [16]:
# Fit the classifier on the training portion of the data.
model.fit(X=X_train, y=y_train)

In [17]:
# Mean accuracy on the held-out test rows.
model.score(X=X_test, y=y_test)

0.7869955156950673

In [18]:
# Predicted class labels for the first ten test rows (selected by position).
model.predict(X_test.iloc[:10])

array([0, 1, 0, 0, 0, 1, 1, 1, 0, 0], dtype=int64)

In [19]:
# Class probabilities for the same first ten test rows; one column per class.
model.predict_proba(X_test.iloc[:10])

array([[0.56561982, 0.43438018],
       [0.1343828 , 0.8656172 ],
       [0.97070681, 0.02929319],
       [0.94298492, 0.05701508],
       [0.9699865 , 0.0300135 ],
       [0.00675644, 0.99324356],
       [0.48889448, 0.51110552],
       [0.49455063, 0.50544937],
       [0.98092454, 0.01907546],
       [0.98822334, 0.01177666]])

In [20]:
# 5-fold cross-validation of a fresh GaussianNB on the training portion only.
# cross_val_score is imported in the first cell together with the other dependencies.
cross_val_score(GaussianNB(), X_train, y_train, cv=5)

array([0.75280899, 0.80898876, 0.79775281, 0.82022472, 0.75280899])