# Naive Bayes 

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score,cross_val_predict
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

In [10]:
#Creating Evaluation function
def Report(y_test,y_pred):
    print('Accuracy Score:',metrics.accuracy_score(y_test,y_pred))
    print('Precision Score:',metrics.precision_score(y_test,y_pred))
    print('Recall Score:',metrics.recall_score(y_test,y_pred))
    print('F1 Score:',metrics.f1_score(y_test,y_pred))
    print('Confusion Matrix:',metrics.confusion_matrix(y_test,y_pred))
    print('Classification Report:',metrics.classification_report(y_test,y_pred))

In [11]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [12]:
#Handling Training Missing Values
df.isna().sum()
df['Age'].fillna(df['Age'].mean(), axis=0,inplace=True)
df['Cabin'].fillna(df['Cabin'].value_counts().idxmax(),axis=0 ,inplace=True)
df.dropna(subset='Embarked',how='any',axis=0, inplace=True)
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [13]:
#Handling Test Missing Values
df_test.isna().sum()
df_test['Age'].fillna(df_test['Age'].mean(), axis=0,inplace=True)
df_test['Cabin'].fillna(df_test['Cabin'].value_counts().idxmax(),axis=0 ,inplace=True)
df_test['Fare'].fillna(df_test['Fare'].mean(), axis=0,inplace=True)
df_test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [18]:
#Convert categorical variable into indicator variables
label_encoder = LabelEncoder()
#train data 
df['Sex'] = label_encoder.fit_transform(df['Sex'])
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])
df['Cabin'] = label_encoder.fit_transform(df['Cabin'])
df['Ticket'] = label_encoder.fit_transform(df['Ticket'])

#test data 
df_test['Sex'] = label_encoder.fit_transform(df_test['Sex'])
df_test['Embarked'] = label_encoder.fit_transform(df_test['Embarked'])
df_test['Cabin'] = label_encoder.fit_transform(df_test['Cabin']) 
df_test['Ticket'] = label_encoder.fit_transform(df_test['Ticket']) 

In [19]:
#training data 

train_label = df['Survived']
train_data = df.drop(['Survived','Name','PassengerId'],axis=1)



#test data

test_data = df_test.drop(['Name','PassengerId'],axis=1)

In [20]:
gnb = GaussianNB()
gnb.fit(train_data,train_label)
gnb_pred = gnb.predict(test_data)

In [21]:
gnb_pred = pd.DataFrame(gnb_pred,index=df_test['PassengerId'],columns=['Survived'])

In [23]:
gnb_pred.value_counts()

Survived
0           231
1           187
Name: count, dtype: int64

## Evaluating Model by creating test split from train.csv

In [24]:
x_train,x_test,y_train,y_test = train_test_split(train_data,train_label,test_size=0.2,random_state=1)

In [25]:
gnbc = GaussianNB()
gnbc.fit(x_train,y_train)
y_pred = gnbc.predict(x_test)
Report(y_test,y_pred)

Accuracy Score: 0.7921348314606742
Precision Score: 0.7727272727272727
Recall Score: 0.6986301369863014
F1 Score: 0.7338129496402879
Confusion Matrix: [[90 15]
 [22 51]]
Classification Report:               precision    recall  f1-score   support

           0       0.80      0.86      0.83       105
           1       0.77      0.70      0.73        73

    accuracy                           0.79       178
   macro avg       0.79      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178

