# Prepare a classification model using Naive Bayes for salary data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics 

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading the train data
naive_train = pd.read_csv('SalaryData_Train.csv')
naive_train.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Read the test data
naive_test = pd.read_csv('SalaryData_Test.csv')
naive_test.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [4]:
# Understanding the basic info from the train data and test data
naive_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [5]:
naive_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15060 entries, 0 to 15059
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            15060 non-null  int64 
 1   workclass      15060 non-null  object
 2   education      15060 non-null  object
 3   educationno    15060 non-null  int64 
 4   maritalstatus  15060 non-null  object
 5   occupation     15060 non-null  object
 6   relationship   15060 non-null  object
 7   race           15060 non-null  object
 8   sex            15060 non-null  object
 9   capitalgain    15060 non-null  int64 
 10  capitalloss    15060 non-null  int64 
 11  hoursperweek   15060 non-null  int64 
 12  native         15060 non-null  object
 13  Salary         15060 non-null  object
dtypes: int64(5), object(9)
memory usage: 1.6+ MB


In [6]:
naive_test.duplicated().sum()

930

In [7]:
naive_train.duplicated().sum()

3258

In [8]:
naive1_train=naive_train.drop_duplicates()


In [9]:
naive1_test=naive_test.drop_duplicates()

In [10]:
# Label encoding - To convert categorical variables into numerical and fit it into the  train dataset
naive1_train = naive1_train.apply(LabelEncoder().fit_transform)
naive1_train.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,22,5,9,12,4,0,1,4,1,24,0,39,37,0
1,33,4,9,12,2,3,0,4,1,0,0,12,37,0
2,21,2,11,8,0,5,1,4,1,0,0,39,37,0
3,36,2,1,6,2,5,0,2,1,0,0,39,37,0
4,11,2,9,12,2,9,5,2,0,0,0,39,4,0


In [11]:
# Label encoding - for test data and fiiting the value in the test dataset
naive1_test =naive1_test.apply(LabelEncoder().fit_transform)
naive1_test.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,8,2,1,6,4,6,3,2,1,0,0,39,37,0
1,21,2,11,8,2,4,0,4,1,0,0,49,37,0
2,11,1,7,11,2,10,0,4,1,0,0,39,37,1
3,27,2,15,9,2,6,0,2,1,87,0,39,37,1
4,17,2,0,5,4,7,1,4,1,0,0,29,37,0


In [12]:
# Splitting the  training dataset into dependent and independent variables
X_train= naive1_train.drop(['education','relationship','native','maritalstatus','sex','race','Salary'],axis=1).values
Y_train= naive1_train['Salary'].values
print(np.unique(Y_train))


[0 1]


In [13]:
# Splitting the  testing dataset into dependent and independent variables
X_test=naive1_test.drop(['education','relationship','native','maritalstatus','sex','race','Salary'],axis=1).values
Y_test= naive1_test['Salary'].values
print(np.unique(Y_test))
X_test

[0 1]


array([[ 8,  2,  6, ...,  0,  0, 39],
       [21,  2,  8, ...,  0,  0, 49],
       [11,  1, 11, ...,  0,  0, 39],
       ...,
       [21,  2, 12, ...,  0,  0, 49],
       [27,  2, 12, ..., 73,  0, 39],
       [18,  3, 12, ...,  0,  0, 59]])

## Gaussian Naive Bayes

In [14]:
# Model fitting
gnb = GB() 
gnb.fit(X_train, Y_train) 

GaussianNB()

In [15]:
# Accuracy score
gnb.score(X_train, Y_train)

0.786380701037059

In [16]:
#  Model prediction -Test data
y_pred = gnb.predict(X_test) 

In [17]:
accuracy_test = np.mean(y_pred== Y_test)
accuracy_test

0.7909412597310687

In [18]:
### Confusion matrix
cm= confusion_matrix(Y_test, y_pred) 
cm

array([[10058,   562],
       [ 2392,  1118]])

In [19]:
 # checking the accuracy score from predicted and actual values from test data
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(Y_test, y_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 79.09412597310687


## Multinomial Naive Bayes

In [20]:
# Accuracy score -train data

classifier_mb = MB()
classifier_mb.fit(X_train, Y_train)
classifier_mb.score(X_train, Y_train)
classifier_mb.score(X_test, Y_test)
predicted_result = classifier_mb.predict(X_train)
accuracy_train = np.mean(predicted_result == Y_train)
accuracy_train

0.7749321636992157

In [21]:
# Accuracy score -test data
test_predict=classifier_mb.predict(X_test)
accuracy_test_1 = np.mean(test_predict== Y_test)
accuracy_test_1

0.7789101203113942

In [22]:
# unifying the obseved info for better comparison

t={'model':['Gaussian NB','Multinomial NB'],
  'accuracy':[accuracy_test,accuracy_test_1]
  }
t=pd.DataFrame(t)
t

Unnamed: 0,model,accuracy
0,Gaussian NB,0.790941
1,Multinomial NB,0.77891


 In Naive Bayes ,both gaussian and multinomial naive bayes are used for checking the best classifier among them for salary . it is clear from the above table that gaussian is giving high score of 79.09% accuracy.