In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline    

import random
from pprint import pprint
from time import time

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yasserh/titanic-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\HP\.cache\kagglehub\datasets\yasserh\titanic-dataset\versions\1


In [3]:
df=pd.read_csv("Titanic-Dataset.csv")

### Determining type of feature

In [4]:
def determine_type_of_feature(df):
    feature_types = []
    n_unique_values_threshold = 10
    
    for column in df.columns:
        unique_values = df[column].unique()
        example_value    = unique_values[0]
        
        if (isinstance(example_value,str)) or (len(unique_values) <= n_unique_values_threshold):
            feature_types.append("categorical")
            
        else:
            feature_types.append("continuous")
    
    return feature_types

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
feature_types = determine_type_of_feature(df)
i=0
for column in df.columns:
    print(column, "-", feature_types[i])
    i += 1

PassengerId - continuous
Survived - categorical
Pclass - categorical
Name - categorical
Sex - categorical
Age - continuous
SibSp - categorical
Parch - categorical
Ticket - categorical
Fare - continuous
Cabin - continuous
Embarked - categorical


## Train-Test_Split

In [7]:
def train_test_split(df,test_size):
    if isinstance(test_size,float):
        test_size = round(test_size * len(df))

    indicies = df.index.tolist()
    test_indicies = random.sample(population=indicies,k=test_size)

    test_df = df.loc[test_indicies]
    train_df =df.drop(test_indicies)
    
    return train_df, test_df

In [8]:
random.seed(0)    #so that our split remain same every time we run 
train_df,test_df =train_test_split(df,test_size=0.2)


# Naive Bayes

### For Titanic Data Lookup Table method

In [9]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [10]:
test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
864,865,0,2,"Gill, Mr. John William",male,24.0,0,0,233866,13.0,,S
394,395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...",female,24.0,0,2,PP 9549,16.7,G6,S
776,777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q
430,431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28.0,0,0,110564,26.55,C52,S
41,42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,,S


I am using lookup table method for this as we know for lookup table all data must have to be categorical but here "Age" and "Fare" column is numerical and "Embarked", "Sex" is character, string we have to deal with it for "Fare" column I am simply going to drop it but for remianing of these I am going to convert it in pure categorical form 

In [11]:
def create_age_groups(age):
    if age <= 12:
        return "Child"
    if 12 < age <= 19:
        return "Teenager"
    if 19 < age:
        return "Adult"
    else:
        return "Unknown"

In [12]:
def prepare_data(df_NB, train_set=True):
    
    # create new feature
    df_NB["Age_Group"] = df_NB.Age.apply(create_age_groups)
    
    # drop features that we are not going to use
    df_NB.drop(["Fare"], axis=1, inplace=True)
    
    # rename column "Parch" to "ParCh"
    df_NB.rename({"Parch": "ParCh"}, axis=1, inplace=True)
    
    # rearange order of columns
    if train_set:
        df_NB = df_NB[["Sex", "Pclass", "Age_Group", "Embarked", "SibSp", "ParCh", "Survived"]]
    else:
        df_NB = df_NB[["Sex", "Pclass", "Age_Group", "Embarked", "SibSp", "ParCh"]]
    
    return df_NB

In [13]:
def replace_strings(df_NB):
    
    df_NB.Age_Group.replace({"Adult": 0, "Unknown": 1, "Teenager": 2, "Child": 3}, inplace=True)
    df_NB.Embarked.replace({"S": 0, "C": 1, "Q": 2}, inplace=True)
    df_NB.Sex.replace({"male": 0, "female": 1}, inplace=True)

    return df_NB

In [14]:
#copying previous data
df_train = train_df
df_test = test_df

In [15]:
#preparing data for method
df_train = prepare_data(df_train)
df_test = prepare_data(df_test, train_set=False)

df_train.head()

Unnamed: 0,Sex,Pclass,Age_Group,Embarked,SibSp,ParCh,Survived
0,male,3,Adult,S,1,0,0
2,female,3,Adult,S,0,0,1
3,female,1,Adult,S,1,0,1
4,male,3,Adult,S,0,0,0
5,male,3,Unknown,Q,0,0,0


In [16]:
df_test.head()

Unnamed: 0,Sex,Pclass,Age_Group,Embarked,SibSp,ParCh
864,male,2,Adult,S,0,0
394,female,3,Adult,S,0,2
776,male,3,Unknown,Q,0,0
430,male,1,Adult,S,0,0
41,female,2,Adult,S,1,0


In [24]:
def create_table(df_ct, label_column):
    table ={}

    #determine values for the label
    counts = df_ct[label_column].value_counts().sort_index()
    # pprint(counts)
    table["class_names"] = counts.index.to_numpy()
    table["class_counts"] = counts.values
    # pprint(table)
    #determining probabilities for the features
    for feature in df_ct.drop(label_column, axis=1).columns:
        table[feature] = {}
        counts = df_ct.groupby(label_column)[feature].value_counts()
        # pprint(counts)
        df_ct_counts = counts.unstack(label_column)
        # pprint(df_ct_counts)

        #check for "problem of rare values"
        if df_ct_counts.isna().any(axis=None):
            df_ct_counts.fillna(value=0, inplace=True)
            df_ct_counts += 1


        df_ct_probabilities = df_ct_counts / df_ct_counts.sum()
        # pprint(df_ct_probabilities)   
        print(df_ct_probabilities.index)
        for value in df_ct_probabilities.index:
            probabilities = df_ct_probabilities.loc[value].to_numpy()
            table[feature][value] = probabilities
    
    return table

In [25]:
t0=time()
lookup_table = create_table(df_train, label_column="Survived")
t1=time()
print('Train Time %f'%(t1-t0))
pprint(lookup_table)

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025F28C3BD90>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025F4AFEE520>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025F4B0030A0>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025F4B003C10>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025F4AFEE070>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025F4AFEE490>
Train Time 0.057363
{'Age_Group': {'Adult': array([0.61678005, 0.59558824]),
               'Child': array([0.04988662, 0.12867647]),
               'Teenager': array([0.09977324, 0.12132353]),
               'Unknown': array([0.23356009, 0.15441176])},
 'Embarked': {'C': array([0.13605442, 0.24354244]),
              'Q': array([0.08390023, 0.09225092]),
              'S': array([0.78004535, 0.66420664])},
 'ParCh': {0: array([0.81026786, 0.64874552]),
           1: array([0.09598214, 0.19713262]),
           2: array([0.06473214, 0.12903226]),
          

In [19]:
def predict_example(row, lookup_table):
    class_estimates = lookup_table["class_counts"]
    for feature in row.index:

        try:
            value = row[feature]
            probabilities = lookup_table[feature][value]
            class_estimates = class_estimates * probabilities

        # skip in case "value" only occurs in test set but not in train set
        # (i.e. "value" is not in "lookup_table")
        except KeyError:
            continue

    index_max_class = class_estimates.argmax()
    prediction = lookup_table["class_names"][index_max_class]
    
    return prediction

In [20]:
t0=time()
predictions = df_test.apply(predict_example, axis=1, args=(lookup_table,))
t1=time()
print('Test Time %f'%(t1-t0))
predictions.head()

Test Time 0.042441


864    0
394    1
776    0
430    0
41     1
dtype: int64

### Check Accuracy

In [21]:
predictions_correct = predictions == test_df.Survived
accuracy = predictions_correct.mean()
print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.758
