In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [22]:
# Read the .data file into a DataFrame
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
           'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

data = pd.read_csv("adult.data", names=columns, na_values=' ?', skipinitialspace=True)

# Similarly, read the .test file if you want to evaluate your model on a separate test set
test_data = pd.read_csv("adult.test", names=columns, na_values=' ?', skipinitialspace=True, skiprows=1)  # skiprows=1 to skip the first row which contains a special character

In [23]:
# Data preprocessing
# Drop rows with missing values
data.dropna(inplace=True)
test_data.dropna(inplace=True)

# Convert categorical variables to dummy variables
data = pd.get_dummies(data, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])
test_data = pd.get_dummies(test_data, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])

In [24]:
# Separate features and target variable
X = data.drop('income', axis=1)
y = data['income']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the classifier
nb_classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Compute Sensitivity and Specificity
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

In [26]:
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

# Compute posterior probability for test data
posterior_prob = nb_classifier.predict_proba(X_test)

# The first column of posterior_prob will contain the probability of belonging to class 0 (<=50K)
# The second column will contain the probability of belonging to class 1 (>50K)

# Let's print posterior probability for the first few samples in the test data
print("Posterior probability of making over 50K a year for the first few samples in the test data:")
for i in range(5):
    print("Sample {}: {:.4f}".format(i+1, posterior_prob[i, 1]))


Sensitivity: 0.32017823042647997
Specificity: 0.9514366653176851
Posterior probability of making over 50K a year for the first few samples in the test data:
Sample 1: 0.0043
Sample 2: 0.0138
Sample 3: 0.0171
Sample 4: 0.0087
Sample 5: 0.0772
