In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# First use the original df to split the data
original_df = pd.read_csv('Data.csv')

# Drop the rows with missing values
original_df = original_df.dropna()

# Covnert the float64 columns to int64 in 'hoursperweek'
original_df['hoursperweek'] = original_df['hoursperweek'].astype('int64')

# Create capitalloss_binary and capitalgain_binary columns by assigning 1 if the value is greater than 0, otherwise 0
original_df['capitalloss'] = original_df['capitalloss'].apply(lambda x: 1 if x > 0 else 0)
original_df['capitalgain'] = original_df['capitalgain'].apply(lambda x: 1 if x > 0 else 0)

# Only covert the 'Possibility' column to binary values
original_df['Possibility'] = original_df['Possibility'].map({'<=0.5': 1, '>0.5': 0})

# Perform binary encoding on the 'sex' column
original_df['sex'] = original_df['sex'].map({'Male':1, 'Female':0})

# Perform label encoding on the 'education' column
original_df['education'] = original_df['education'].map({
    'Preschool': 0, 
    '1st-4th': 1,
    '5th-6th': 1,
    '7th-8th': 1,
    '9th':1,
    '10th': 2,
    '11th': 2,
    '12th': 2,
    'HS-grad': 3,
    'Some-college': 4,
    'Assoc-acdm': 5,
    'Assoc-voc': 5,
    'Bachelors': 6,
    'Masters': 7,
    'Doctorate': 8,
    'Prof-school': 8})

# Define custom categories based on the analysis
original_df['maritalstatus'] = original_df['maritalstatus'].map({
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Divorced': 'Divorced-Widowed-Abs',
    'Widowed': 'Divorced-Widowed-Abs',
    'Married-spouse-absent': 'Divorced-Widowed-Abs',
    'Separated': 'Separated-Never-Married',
    'Never-married': 'Separated-Never-Married'
})



# Conver the categorical columns to one-hot encoding
original_df = pd.get_dummies(original_df, columns=['workclass', 'maritalstatus', 'relationship',
       'race'])

print(original_df.columns)
print(original_df.head())

Index(['age', 'education', 'educationno', 'occupation', 'sex', 'capitalgain',
       'capitalloss', 'hoursperweek', 'native', 'Possibility',
       'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay',
       'maritalstatus_Divorced-Widowed-Abs', 'maritalstatus_Married',
       'maritalstatus_Separated-Never-Married', 'relationship_Husband',
       'relationship_Not-in-family', 'relationship_Other-relative',
       'relationship_Own-child', 'relationship_Unmarried', 'relationship_Wife',
       'race_Amer-Indian-Eskimo', 'race_Asian-Pac-Islander', 'race_Black',
       'race_Other', 'race_White'],
      dtype='object')
   age  education  educationno         occupation  sex  capitalgain  \
0   39          6           13       Adm-clerical    1            1   
1   50          6           13    Exec-managerial    1            0   
2   38          3            

In [2]:
import numpy as np
import pandas as pd
from math import sqrt, pi, exp
from collections import defaultdict

# Assume `original_df` is already loaded

# Define the features and the target variables
X = original_df.drop(['Possibility','native','occupation'], axis=1)  
y = original_df['Possibility']

# Split the data into training and testing sets
def train_test_split(X, y, test_size=0.2):
    # Combining X and y for shuffling and splitting
    df_combined = pd.concat([X, y], axis=1)
    df_shuffled = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
    
    split_index = int((1 - test_size) * len(df_shuffled))
    train_data = df_shuffled[:split_index]
    test_data = df_shuffled[split_index:]
    
    X_train = train_data.drop('Possibility', axis=1)
    y_train = train_data['Possibility']
    X_test = test_data.drop('Possibility', axis=1)
    y_test = test_data['Possibility']
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
# Calculate the mean and variance for each feature by class
def calculate_mean_variance_by_class(X_train, y_train):
    summaries = defaultdict(dict)
    
    # Separate the data by class (0: Innocent, 1: Criminal)
    class_labels = np.unique(y_train)
    for class_value in class_labels:
        X_class = X_train[y_train == class_value]
        summaries[class_value] = {
            col: (X_class[col].mean(), X_class[col].var()) for col in X_train.columns
        }
    return summaries

# Calculate prior probabilities
def calculate_prior_probabilities(y_train):
    class_labels, class_counts = np.unique(y_train, return_counts=True)
    total_samples = len(y_train)
    return {class_value: count / total_samples for class_value, count in zip(class_labels, class_counts)}

# Calculate Gaussian Probability Density
def gaussian_probability(x, mean, var):
    if var == 0:  # To avoid division by zero
        var = 1e-4
    exponent = exp(-((x - mean)**2 / (2 * var)))
    return (1 / sqrt(2 * pi * var)) * exponent

# Summarize the training set
class_summaries = calculate_mean_variance_by_class(X_train, y_train)
priors = calculate_prior_probabilities(y_train)


In [4]:
# Calculate the class probabilities for a given input sample
def calculate_class_probabilities(summaries, input_data, priors):
    probabilities = {}
    
    for class_value, class_summary in summaries.items():
        probabilities[class_value] = priors[class_value]  # Start with the prior probability
        
        for feature, value in input_data.items():
            mean, var = class_summary[feature]
            probabilities[class_value] *= gaussian_probability(value, mean, var)
    
    return probabilities

# Predict class for a single data point
def predict(summaries, input_data, priors):
    probabilities = calculate_class_probabilities(summaries, input_data, priors)
    # Return the class with the highest probability
    return max(probabilities, key=probabilities.get)

# Predict for the entire test set
def predict_all(summaries, X_test, priors):
    predictions = []
    for _, row in X_test.iterrows():
        result = predict(summaries, row, priors)
        predictions.append(result)
    return np.array(predictions)

# Get predictions for the test set
y_pred = predict_all(class_summaries, X_test, priors)


In [5]:
# Function to compute precision, recall, and F1 score
def precision_recall_f1(y_true, y_pred):
    # Convert inputs to numpy arrays for element-wise comparison
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    # True Positives (TP): Correctly predicted positive instances
    TP = np.sum((y_true == 1) & (y_pred == 1))
    
    # False Positives (FP): Negative instances predicted as positive
    FP = np.sum((y_true == 0) & (y_pred == 1))
    
    # False Negatives (FN): Positive instances predicted as negative
    FN = np.sum((y_true == 1) & (y_pred == 0))
    
    # Precision: TP / (TP + FP)
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    
    # Recall: TP / (TP + FN)
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    
    # F1 Score: 2 * (Precision * Recall) / (Precision + Recall)
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score

In [7]:
# Accuracy calculation
def accuracy(y_true, y_pred):
    correct = sum(y_true == y_pred)
    return correct / len(y_true)

# Evaluate accuracy on the test set
test_accuracy = accuracy(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy :.2f}")

# Calculate precision, recall, and F1 score on the test set
precision, recall, f1 = precision_recall_f1(y_test, y_pred)

# Output the results
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Test Accuracy: 0.71
Precision: 0.94
Recall: 0.66
F1 Score: 0.78
