In [87]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,recall_score,f1_score,precision_score

# load  Data

In [88]:
url_train="https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
url_test="https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
train_data=pd.read_csv(url_train,names=column_names)
test_data=pd.read_csv(url_test,names=column_names)
df=pd.concat([train_data,test_data])

df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
Index: 48843 entries, 0 to 16281
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             48843 non-null  object 
 1   workclass       48842 non-null  object 
 2   fnlwgt          48842 non-null  float64
 3   education       48842 non-null  object 
 4   education-num   48842 non-null  float64
 5   marital-status  48842 non-null  object 
 6   occupation      48842 non-null  object 
 7   relationship    48842 non-null  object 
 8   race            48842 non-null  object 
 9   sex             48842 non-null  object 
 10  capital-gain    48842 non-null  float64
 11  capital-loss    48842 non-null  float64
 12  hours-per-week  48842 non-null  float64
 13  native-country  48842 non-null  object 
 14  income          48842 non-null  object 
dtypes: float64(5), object(10)
memory usage: 6.0+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


# Checking Missing Values on train Data

In [89]:
df.isnull().sum()

age               0
workclass         1
fnlwgt            1
education         1
education-num     1
marital-status    1
occupation        1
relationship      1
race              1
sex               1
capital-gain      1
capital-loss      1
hours-per-week    1
native-country    1
income            1
dtype: int64

## Handling Missing Values 

In [90]:
df.dropna(inplace=True)

## Handling duplicate values

In [91]:
df.drop_duplicates(inplace=True)

# determine numerical and categorical features

In [92]:
# Extract categorical features
categorical_features = df.select_dtypes('object')

# Drop categorical features from the original DataFrame
df_numeric = df.drop(categorical_features.columns, axis=1)

# Feature Scaling

In [93]:
scaler = MinMaxScaler()
model=scaler.fit(df_numeric)
scaled_data=model.transform(df_numeric)

# Encoding of categorical variables

In [94]:
print(df['income'].value_counts())

# Replace wrong values in the 'income' column
df['income'] = df['income'].str.replace('<=50K.', '<=50K')
df['income'] = df['income'].str.replace('>50K.', '>50K')

# Apply Encoding only to categorical columns
df_encoded = pd.get_dummies(categorical_features)

print(df['income'].value_counts())

# Combine encoded categorical features with numerical features
df = pd.concat([df_numeric, df_encoded], axis=1)

# Verify that there are no missing values after Encoding
print("Missing Values After Encoding:")
print(df.isnull().sum().sum())



income
<=50K     24698
<=50K.    12430
>50K       7839
>50K.      3846
Name: count, dtype: int64
income
<=50K    37128
>50K     11685
Name: count, dtype: int64
Missing Values After Encoding:
0


# Split Data

In [95]:
# Separate features (X) and target variable (y)
x = df.drop('income_ >50K', axis=1)
y = df['income_ >50K']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Training Model

In [96]:
# Train the Naive Bayes classifier
model = GaussianNB()
model.fit(X_train,y_train)

 # Predict on the test set

In [97]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred

array([False, False, False, ..., False, False, False])

# Compute confusion matrix

In [98]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[11728,   528],
       [ 1937,   451]], dtype=int64)

# Compute Sensitivity and Specificity

In [99]:
TP, FN, FP, TN = conf_matrix.ravel()
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
print(f'sensitivity = {sensitivity}')
print(f'specificity = {specificity}')

sensitivity = 0.9569190600522193
specificity = 0.1888609715242881


# Compute posterior probability

In [100]:
# Compute posterior probability of making over 50k for each instance
posterior_probs = model.predict_proba(X_test)[:,1]
posterior_probs

array([0.02878199, 0.03312677, 0.03885895, ..., 0.06755034, 0.00599989,
       0.03632082])

## compute Accuracy

In [101]:
accuracy = (TP + TN) / (TP + TN + FP + FN)*100
print(f'Accuracy: {accuracy} %')

Accuracy: 83.16716744059 %


# compute recall 

In [102]:
recall =recall_score(y_pred,y_test)
print(f'Recall : {recall*100} %')

Recall : 46.06741573033708 %


# compute precision

In [103]:
precision=precision_score(y_pred,y_test)
print(f'Precision : {precision*100} %')

Precision : 18.88609715242881 %


# compute F1_Score

In [104]:
F1_score=f1_score(y_pred,y_test)
print(f'F1_Score : {F1_score*100} %')

F1_Score : 26.78942678942679 %
