In [1]:
# Step 1: Import the Pandas library
import pandas as pd  # For handling datasets

# Placeholder: Load your dataset (Update the file path)
# df = pd.read_csv("path_to_your_dataset.csv")

# Display the first few rows to verify data structure (Uncomment after loading the dataset)
# print(df.head())

In [2]:
# Step 1: Load the Titanic dataset
# The path is relative, so it is resolved against the notebook's current
# working directory.
df = pd.read_csv("Datasets/titanic.csv")

# Step 2: Display first few rows to verify data structure
# head(5) returns the first five rows. Because print() separates its arguments
# with a single space, the first line of the table is shifted one character
# to the right of the remaining lines.
print("Dataset Preview:\n", df.head(5))

Dataset Preview:
    survived  pclass  gender   age  sibsp  parch     fare embarked deck
0         0       3    male  22.0      1      0   7.2500        S  NaN
1         1       1  female  38.0      1      0  71.2833        C    C
2         1       3  female  26.0      0      0   7.9250        S  NaN
3         1       1  female  35.0      1      0  53.1000        S    C
4         0       3    male  35.0      0      0   8.0500        S  NaN


In [3]:
# Step 2: Drop rows with missing values in critical columns
# A row is kept only when 'gender', 'pclass' and 'survived' are all non-null;
# a NaN in any one of the three removes the row. Other columns (e.g. 'deck')
# may still contain NaN afterwards.
df = df[df[['gender', 'pclass', 'survived']].notna().all(axis=1)]

# Report (rows, columns) so the effect of the filter can be checked
print(f"Dataset shape after dropping missing values: {df.shape}")

Dataset shape after dropping missing values: (891, 9)


In [4]:
# Step 3: Compute total number of records in the dataset
# The first entry of df.shape is the row count; `total` is the denominator
# used for the prior probabilities in the following cells.
total = df.shape[0]

# Show the record count for verification
print(f"Total records in the dataset: {total}")

Total records in the dataset: 891


In [5]:
# Step 4: Compute Prior Probabilities for Survival Outcomes
# Each prior is the share of all records that carry the given 'survived' label.
# Summing a boolean mask counts its True entries; int() turns the NumPy integer
# into a plain Python int before the division.

# P(survived = 1): fraction of passengers who survived
p_survived_1 = int((df['survived'] == 1).sum()) / total

# P(survived = 0): fraction of passengers who did not survive
p_survived_0 = int((df['survived'] == 0).sum()) / total

# Show both priors rounded to four decimals
print(f"Prior Probability of Survival (P(survived=1)): {p_survived_1:.4f}")
print(f"Prior Probability of Not Surviving (P(survived=0)): {p_survived_0:.4f}")

Prior Probability of Survival (P(survived=1)): 0.3838
Prior Probability of Not Surviving (P(survived=0)): 0.6162


In [6]:
# Step 4: Compute Conditional Probability P(gender = 'female' | survived = 1)
# Estimated as (female survivors) / (all survivors).

# Denominator: every passenger with survived == 1
num_survived = int((df['survived'] == 1).sum())

# Numerator: passengers that are both female and survivors
num_female_survived = int(((df['gender'] == 'female') & (df['survived'] == 1)).sum())

# Relative frequency of females among the survivors
p_female_given_1 = num_female_survived / num_survived

# Show the estimate rounded to four decimals
print(f"P(gender = 'female' | survived = 1): {p_female_given_1:.4f}")

P(gender = 'female' | survived = 1): 0.6813


In [7]:
# Step 5: Compute Conditional Probability P(pclass = 2 | survived = 1)
# Estimated as (second-class survivors) / (all survivors); each count is the
# number of True entries in the corresponding boolean mask.
p_pclass2_given_1 = (
    int(((df['pclass'] == 2) & (df['survived'] == 1)).sum())
    / int((df['survived'] == 1).sum())
)

# Show the estimate rounded to four decimals
print(f"P(pclass = 2 | survived = 1): {p_pclass2_given_1:.4f}")

P(pclass = 2 | survived = 1): 0.2544


In [8]:
# Step 5: Compute Conditional Probability P(gender = 'female' | survived = 0)
# Estimated as (female non-survivors) / (all non-survivors).

# Denominator: every passenger with survived == 0
num_not_survived = int((df['survived'] == 0).sum())

# Numerator: passengers that are both female and non-survivors
num_female_not_survived = int(((df['gender'] == 'female') & (df['survived'] == 0)).sum())

# Relative frequency of females among the non-survivors
p_female_given_0 = num_female_not_survived / num_not_survived

# Show the estimate rounded to four decimals
print(f"P(gender = 'female' | survived = 0): {p_female_given_0:.4f}")

P(gender = 'female' | survived = 0): 0.1475


In [9]:
# Step 6: Compute Conditional Probability P(pclass = 2 | survived = 0)
# Estimated as (second-class non-survivors) / (all non-survivors).

# Denominator: every passenger with survived == 0
num_not_survived = int((df['survived'] == 0).sum())

# Numerator: passengers that are both in second class and non-survivors
num_pclass2_not_survived = int(((df['pclass'] == 2) & (df['survived'] == 0)).sum())

# Relative frequency of second-class passengers among the non-survivors
p_pclass2_given_0 = num_pclass2_not_survived / num_not_survived

# Show the estimate rounded to four decimals
print(f"P(pclass = 2 | survived = 0): {p_pclass2_given_0:.4f}")

P(pclass = 2 | survived = 0): 0.1767


In [10]:
# Step 7: Compute Naïve Bayes Probability P(x | Survived = 1) * P(Survived = 1)
# Here x is the feature combination (gender = 'female', pclass = 2).

# Prior P(Survived = 1), recomputed from the data
p_survived_1 = int((df['survived'] == 1).sum()) / total

# Naive Bayes treats the features as independent given the class, so the
# likelihood P(x | Survived = 1) is the product of the per-feature conditionals.
likelihood_1 = p_female_given_1 * p_pclass2_given_1

# NOTE: despite its name, px_given_1 stores likelihood * prior, i.e. the
# unnormalised posterior for Survived = 1, not the likelihood alone.
px_given_1 = likelihood_1 * p_survived_1

# Show the unnormalised score rounded to four decimals
print(f"P(x | Survived = 1) * P(Survived = 1): {px_given_1:.4f}")

P(x | Survived = 1) * P(Survived = 1): 0.0665


In [11]:
# Step 8: Compute Naïve Bayes Probability P(x | Survived = 0) * P(Survived = 0)
# Here x is the feature combination (gender = 'female', pclass = 2).

# Prior P(Survived = 0), recomputed from the data
p_survived_0 = int((df['survived'] == 0).sum()) / total

# Naive Bayes treats the features as independent given the class, so the
# likelihood P(x | Survived = 0) is the product of the per-feature conditionals.
likelihood_0 = p_female_given_0 * p_pclass2_given_0

# NOTE: despite its name, px_given_0 stores likelihood * prior, i.e. the
# unnormalised posterior for Survived = 0, not the likelihood alone.
px_given_0 = likelihood_0 * p_survived_0

# Show the unnormalised score rounded to four decimals
print(f"P(x | Survived = 0) * P(Survived = 0): {px_given_0:.4f}")

P(x | Survived = 0) * P(Survived = 0): 0.0161


In [12]:
# Step 9: Normalize Probabilities to Ensure They Sum to 1

# Evidence term: the two unnormalised class scores added together
total_prob = px_given_1 + px_given_0

# Divide each class score by the evidence to obtain the posteriors
# P(Survived = 1 | x) and P(Survived = 0 | x), in that order.
final_survived_1, final_survived_0 = (
    score / total_prob for score in (px_given_1, px_given_0)
)

# Show the normalised posteriors rounded to four decimals
print(f"P(Survived = 1 | x): {final_survived_1:.4f}")  # survival
print(f"P(Survived = 0 | x): {final_survived_0:.4f}")  # non-survival

P(Survived = 1 | x): 0.8055
P(Survived = 0 | x): 0.1945


In [13]:
# Step 10: Display Final Posterior Probabilities
# Both lines are emitted by one print() call; sep="\n" places each posterior
# on its own line.
print(
    f"P(Survived = 1 | x) = {final_survived_1:.4f}",  # survival given the features
    f"P(Survived = 0 | x) = {final_survived_0:.4f}",  # non-survival given the features
    sep="\n",
)

P(Survived = 1 | x) = 0.8055
P(Survived = 0 | x) = 0.1945
