### Bhagyashree Deshpande

# Prepare a classification model using Naive Bayes for salary data

## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn import svm,metrics
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
salary_test = pd.read_csv('SalaryData_Test.csv')
salary_test.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [3]:
salary_train = pd.read_csv('SalaryData_Train.csv')
salary_train.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Data Understanding

In [4]:
salary_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [5]:
salary_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15060 entries, 0 to 15059
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            15060 non-null  int64 
 1   workclass      15060 non-null  object
 2   education      15060 non-null  object
 3   educationno    15060 non-null  int64 
 4   maritalstatus  15060 non-null  object
 5   occupation     15060 non-null  object
 6   relationship   15060 non-null  object
 7   race           15060 non-null  object
 8   sex            15060 non-null  object
 9   capitalgain    15060 non-null  int64 
 10  capitalloss    15060 non-null  int64 
 11  hoursperweek   15060 non-null  int64 
 12  native         15060 non-null  object
 13  Salary         15060 non-null  object
dtypes: int64(5), object(9)
memory usage: 1.6+ MB


In [6]:
salary_train.describe(include = 'all')

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
count,30161.0,30161,30161,30161.0,30161,30161,30161,30161,30161,30161.0,30161.0,30161.0,30161,30161
unique,,7,16,,7,14,6,5,2,,,,40,2
top,,Private,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22285,9840,,14065,4038,12463,25932,20380,,,,27504,22653
mean,38.438115,,,10.121316,,,,,,1092.044064,88.302311,40.931269,,
std,13.13483,,,2.550037,,,,,,7406.466611,404.121321,11.980182,,
min,17.0,,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,,10.0,,,,,,0.0,0.0,40.0,,
75%,47.0,,,13.0,,,,,,0.0,0.0,45.0,,


In [7]:
salary_test.describe(include = 'all')

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
count,15060.0,15060,15060,15060.0,15060,15060,15060,15060,15060,15060.0,15060.0,15060.0,15060,15060
unique,,7,16,,7,14,6,5,2,,,,40,2
top,,Private,HS-grad,,Married-civ-spouse,Exec-managerial,Husband,White,Male,,,,United-States,<=50K
freq,,11021,4943,,6990,1992,6203,12970,10147,,,,13788,11360
mean,38.768327,,,10.112749,,,,,,1120.301594,89.041899,40.951594,,
std,13.380676,,,2.558727,,,,,,7703.181842,406.283245,12.062831,,
min,17.0,,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,,13.0,,,,,,0.0,0.0,45.0,,


In [8]:
salary_train = salary_train.apply(LabelEncoder().fit_transform)
salary_train.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,22,5,9,12,4,0,1,4,1,24,0,39,37,0
1,33,4,9,12,2,3,0,4,1,0,0,12,37,0
2,21,2,11,8,0,5,1,4,1,0,0,39,37,0
3,36,2,1,6,2,5,0,2,1,0,0,39,37,0
4,11,2,9,12,2,9,5,2,0,0,0,39,4,0


In [9]:
salary_test = salary_test.apply(LabelEncoder().fit_transform)
salary_test.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,8,2,1,6,4,6,3,2,1,0,0,39,37,0
1,21,2,11,8,2,4,0,4,1,0,0,49,37,0
2,11,1,7,11,2,10,0,4,1,0,0,39,37,1
3,27,2,15,9,2,6,0,2,1,87,0,39,37,1
4,17,2,0,5,4,7,1,4,1,0,0,29,37,0


In [10]:
x_train = salary_train.drop(['education', 'relationship', 'native', 'maritalstatus', 'sex', 'race'], axis = 1)
y_train = salary_train['Salary']

In [11]:
x_test = salary_test.drop(['education', 'relationship', 'native', 'maritalstatus', 'sex', 'race'], axis = 1)
y_test = salary_test['Salary']

### Gaussian Naive Bayes

In [12]:
classifier = GB()
classifier.fit(x_train, y_train)

GaussianNB()

In [13]:
y_pred = classifier.predict(x_test)

In [14]:
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [16]:
print('Accuracy Score :' , ac)
print('Confusion Matrix :\n', cm)

Accuracy Score : 1.0
Confusion Matrix :
 [[11360     0]
 [    0  3700]]


In [17]:
accuracy_test = np.mean(y_pred == y_test)
accuracy_test

1.0

### Multinomial Naive Bayes

In [19]:
classifier_mb = MB()
classifier_mb.fit(x_train, y_train)

MultinomialNB()

In [20]:
classifier_mb.score(x_train, y_train)

0.7800802360664434

In [21]:
classifier_mb.score(x_test, y_test)

0.7820053120849934

In [22]:
y_pred_mb = classifier_mb.predict(x_train)

In [24]:
accuracy_train = np.mean(y_pred_mb == y_train)
accuracy_train

0.7800802360664434

In [25]:
test_predict = classifier_mb.predict(x_test)
accuracy_test_1 = np.mean(test_predict == y_test)
accuracy_test_1

0.7820053120849934

In [26]:
table = {'Model' : ['Gaussian NB', 'Multinomial NB'], 'Accuracy' : [accuracy_test, accuracy_test_1]}
table = pd.DataFrame(table)
table

Unnamed: 0,Model,Accuracy
0,Gaussian NB,1.0
1,Multinomial NB,0.782005


## Conclusion

From both the models, it is evident that the Gaussian Model's accuracy is better than the Multinomial Model for the given Naive Bayes problem. Hence Gaussian Model predicts better.