In [104]:
import pandas as pd
import numpy as np

In [119]:
# Load the dataset
url = 'adult/adult.data'
url2 = 'adult/adult.test'
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data1 = pd.read_csv(url , names=column_names, na_values=' ?')
data2 = pd.read_csv(url2 , names=column_names, na_values=' ?')
data2 = data2.iloc[1:]

# Concatenate the two DataFrames
data = pd.concat([data1, data2], ignore_index=True)

# # Drop rows with missing values
# data = data.dropna()
print(data)


        age          workclass    fnlwgt   education  education-num  \
0       NaN                NaN   77516.0   Bachelors            NaN   
1      50.0   Self-emp-not-inc   83311.0   Bachelors           13.0   
2      38.0            Private  215646.0     HS-grad            9.0   
3      53.0            Private  234721.0        11th            7.0   
4      28.0            Private  338409.0   Bachelors           13.0   
...     ...                ...       ...         ...            ...   
48837    39            Private  215419.0   Bachelors           13.0   
48838    64                NaN  321403.0     HS-grad            9.0   
48839    38            Private  374983.0   Bachelors           13.0   
48840    44            Private   83891.0   Bachelors           13.0   
48841    35       Self-emp-inc  182148.0   Bachelors           13.0   

            marital-status          occupation     relationship  \
0            Never-married        Adm-clerical    Not-in-family   
1       Marri

In [118]:
# Check for duplicate rows
duplicate_rows = data[data.duplicated()]

# Print duplicate row count
print("Duplicate Row Count:", len(duplicate_rows))

# Print duplicate rows
# print("Duplicate Rows:")
# print(duplicate_rows)

# Remove duplicate rows
data.drop_duplicates(inplace=True)
data.head(2)

Duplicate Row Count: 29


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,,,77516.0,Bachelors,,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K


In [107]:
# Impute missing values with the most frequent value (mode) for categorical features
for col in ['workclass', 'occupation', 'native-country']:
    data[col].fillna(data[col].mode()[0], inplace=True)
    
    # Impute missing values with the median for numerical features # Impute missing values with the median for numerical features   
for col in ['education-num', 'age']:
    data[col].fillna(data[col].median(), inplace=True)

# Drop rows with missing values in other columns
data.dropna(inplace=True)
data.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,37.0,Private,77516,Bachelors,10.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50.0,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [108]:
category_counts = data.apply(lambda x: x.nunique())

print("Number of categories in each column:")
print(category_counts)

Number of categories in each column:
age                  73
workclass             8
fnlwgt            21648
education            16
education-num        16
marital-status        7
occupation           14
relationship          6
race                  5
sex                   2
capital-gain        119
capital-loss         92
hours-per-week       94
native-country       41
income                2
dtype: int64


In [109]:
from sklearn.preprocessing import LabelEncoder
# Encode categorical variables by replacing the categorical values with numerical labels
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])
    
# Split data into features and target
X = data.drop('income', axis=1)
y = data['income']

from sklearn.model_selection import train_test_split
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [110]:
from sklearn.naive_bayes import GaussianNB
# Step 2: Model Training
# Train Naïve Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

from sklearn.ensemble import RandomForestClassifier
# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [111]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#Step 3: Model Evaluation
# Evaluate Naïve Bayes model
nb_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naïve Bayes Accuracy:", nb_accuracy)
print("Naïve Bayes Classification Report:\n", classification_report(y_test, nb_pred))
print("Naïve Bayes Confusion Matrix:\n", confusion_matrix(y_test, nb_pred))

Naïve Bayes Accuracy: 0.7913337430854334
Naïve Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.94      0.87      4905
           1       0.65      0.33      0.44      1603

    accuracy                           0.79      6508
   macro avg       0.73      0.64      0.65      6508
weighted avg       0.77      0.79      0.76      6508

Naïve Bayes Confusion Matrix:
 [[4625  280]
 [1078  525]]


In [112]:
# Evaluate Random Forest model
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("\nRandom Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))


Random Forest Accuracy: 0.8598647818070068
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      4905
           1       0.76      0.63      0.69      1603

    accuracy                           0.86      6508
   macro avg       0.82      0.78      0.80      6508
weighted avg       0.85      0.86      0.86      6508

Random Forest Confusion Matrix:
 [[4579  326]
 [ 586 1017]]


In [113]:
# Step 4: Model Comparison
print("\nModel Comparison:")
print("Naïve Bayes Accuracy:", nb_accuracy)
print("Random Forest Accuracy:", rf_accuracy)


Model Comparison:
Naïve Bayes Accuracy: 0.7913337430854334
Random Forest Accuracy: 0.8598647818070068
