# Naive Bayes - Salary Problem 
## -----------------------------------------------------------------------------------------------------------------------------
## Problem Statement: Prepare a classification model using Naive Bayes for the salary data.

## 1. Import necessary libraries:

In [40]:
import pandas as pd
import numpy as np

## 2. Import dataset :

In [2]:
# Read the train and test splits from the working directory. Both files are
# comma-separated, which is already the delimiter pandas assumes.
salary_train_data = pd.read_csv('SalaryData_Train.csv')
salary_test_data = pd.read_csv('SalaryData_Test.csv')

In [3]:
# Display the training frame as loaded (the rendered output is truncated in the middle).
salary_train_data

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30156,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
30157,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
30158,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30159,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
# Display the test frame as loaded (the rendered output is truncated in the middle).
salary_test_data

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,Private,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
15056,39,Private,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
15057,38,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
15058,44,Private,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


## 3. Data Understanding :

### 3.1 Data Description:

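* **age** -- Age of the individual
* **workclass** -- Work class of the individual (e.g. Private, State-gov, Self-emp-not-inc)
* **education** -- Education level of the individual
* **educationno** -- Education level expressed as a number (e.g. HS-grad = 9, Bachelors = 13)
* **maritalstatus** -- Marital status of the individual
* **occupation** -- Occupation of the individual
* **relationship** -- Relationship of the individual within the family (e.g. Husband, Wife, Own-child, Not-in-family, Unmarried)
* **race** -- Race of the individual
* **sex** -- Gender of the individual
* **capitalgain** -- Profit received from the sale of an investment
* **capitalloss** -- A decrease in the value of a capital asset
* **hoursperweek** -- Number of hours worked per week
* **native** -- Native country of the individual
* **Salary** -- Salary bracket of the individual (`<=50K` or `>50K`); this is the target column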

### 3.2 Initial Analysis :

### I. Salary Train Data :

In [5]:
# (rows, columns) of the training data.
salary_train_data.shape

(30161, 14)

In [7]:
# Number of missing values in each column of the training data.
salary_train_data.isna().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [9]:
# Data type of each training column; `object` columns hold the string categories.
salary_train_data.dtypes

age               int64
workclass        object
education        object
educationno       int64
maritalstatus    object
occupation       object
relationship     object
race             object
sex              object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
native           object
Salary           object
dtype: object

### II. Salary Test Data :

In [11]:
# (rows, columns) of the test data.
salary_test_data.shape

(15060, 14)

In [8]:
# Number of missing values in each column of the test data.
salary_test_data.isna().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [10]:
# Data type of each test column; `object` columns hold the string categories.
salary_test_data.dtypes

age               int64
workclass        object
education        object
educationno       int64
maritalstatus    object
occupation       object
relationship     object
race             object
sex              object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
native           object
Salary           object
dtype: object

## 4. Data Preparation :

In [12]:
# Give BOTH the train and the test frame the same snake_case column names.
# The names are assigned by position, so each frame must hold its 14 columns
# in exactly this order.
salary_train_data.columns = salary_test_data.columns = [
    'age', 'work_class', 'education', 'education_no', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain',
    'capital_loss', 'hours_per_week', 'native', 'salary',
]

In [13]:
# Preview the first rows of the training data after the column rename.
salary_train_data.head()

Unnamed: 0,age,work_class,education,education_no,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native,salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
# Preview the first rows of the test data after the column rename.
salary_test_data.head()

Unnamed: 0,age,work_class,education,education_no,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native,salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


## 5. Data Pre-Processing :

In [17]:
# Work on deep copies so that the label encoding applied later leaves the
# original, string-valued frames untouched.
salary_train_data_copy = salary_train_data.copy(deep=True)
salary_test_data_copy = salary_test_data.copy(deep=True)

### I. Salary train data transformation :

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
# Replace every string-valued column (the target `salary` included) with integer
# codes. A single encoder object is re-fitted column by column, so once this cell
# has run `le_train` only holds the mapping of the last column, `salary`.
le_train = LabelEncoder()
for column in ('work_class', 'education', 'marital_status', 'occupation',
               'relationship', 'race', 'sex', 'native', 'salary'):
    # Read from the untouched frame, write the codes into the working copy.
    salary_train_data_copy[column] = le_train.fit_transform(salary_train_data[column])

### II. Salary test data transformation :

In [20]:
# Encode the test split with the category -> integer mapping learned from the
# TRAIN split.
#
# Fitting a separate encoder on the test data derives the codes from the test
# set's own labels, so the same category can receive a different integer in
# train and test whenever the two splits do not contain exactly the same set of
# labels. The model would then be scored on features that no longer mean what
# they meant during training.
categorical_columns = ['work_class', 'education', 'marital_status', 'occupation',
                       'relationship', 'race', 'sex', 'native', 'salary']
for column in categorical_columns:
    # A fresh encoder per column, fitted on the raw (string-valued) train column.
    le_test = LabelEncoder()
    le_test.fit(salary_train_data[column])
    # `transform` raises ValueError if the test split contains a label that never
    # occurs in the train split, instead of silently mis-encoding it.
    salary_test_data_copy[column] = le_test.transform(salary_test_data[column])
# As before, `le_test` ends up holding the encoder of the last column, `salary`.

In [21]:
# Preview the training copy after label encoding: the former string columns now hold integer codes.
salary_train_data_copy.head()

Unnamed: 0,age,work_class,education,education_no,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native,salary
0,39,5,9,13,4,0,1,4,1,2174,0,40,37,0
1,50,4,9,13,2,3,0,4,1,0,0,13,37,0
2,38,2,11,9,0,5,1,4,1,0,0,40,37,0
3,53,2,1,7,2,5,0,2,1,0,0,40,37,0
4,28,2,9,13,2,9,5,2,0,0,0,40,4,0


In [22]:
# Preview the test copy after label encoding: the former string columns now hold integer codes.
salary_test_data_copy.head()

Unnamed: 0,age,work_class,education,education_no,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native,salary
0,25,2,1,7,4,6,3,2,1,0,0,40,37,0
1,38,2,11,9,2,4,0,4,1,0,0,50,37,0
2,28,1,7,12,2,10,0,4,1,0,0,40,37,1
3,44,2,15,10,2,6,0,2,1,7688,0,40,37,1
4,34,2,0,6,4,7,1,4,1,0,0,30,37,0


In [23]:
# Check the dtypes of the encoded training copy.
salary_train_data_copy.dtypes

age               int64
work_class        int32
education         int32
education_no      int64
marital_status    int32
occupation        int32
relationship      int32
race              int32
sex               int32
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native            int32
salary            int32
dtype: object

In [24]:
# Check the dtypes of the encoded test copy.
salary_test_data_copy.dtypes

age               int64
work_class        int32
education         int32
education_no      int64
marital_status    int32
occupation        int32
relationship      int32
race              int32
sex               int32
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native            int32
salary            int32
dtype: object

## 6. Model Building :

### I. Train data:

In [54]:
# Train split: the encoded `salary` column is the target, every other column is a predictor.
y_train = salary_train_data_copy['salary']
X_train = salary_train_data_copy.drop(columns='salary')

### II. Test data:

In [55]:
# Test split: the encoded `salary` column is the target, every other column is a predictor.
y_test = salary_test_data_copy['salary']
X_test = salary_test_data_copy.drop(columns='salary')

## 7. Naive Bayes :

### 7.1 Gaussian Naive Bayes :

In [56]:
from sklearn.naive_bayes import GaussianNB

In [57]:
# Fit a Gaussian Naive Bayes classifier on the train split, then predict the
# class of every row in the train split and in the test split (in that order).
GNB_model = GaussianNB()
gaussian_model = GNB_model.fit(X_train, y_train)
gaussian_train_predict, gaussian_test_predict = (
    gaussian_model.predict(features) for features in (X_train, X_test)
)

In [60]:
# Accuracy = fraction of rows whose predicted class equals the true class.
gaussian_train_accuracy = (gaussian_train_predict == y_train).mean()
gaussian_test_accuracy = (gaussian_test_predict == y_test).mean()

In [63]:
# Report both Gaussian NB accuracies, rounded to four decimals.
for label, accuracy in (('Gaussian Train Accuaracy :', gaussian_train_accuracy),
                        ('Gaussian Test Accuaracy  :', gaussian_test_accuracy)):
    print(label, round(accuracy, 4))

Gaussian Train Accuaracy : 0.7953
Gaussian Test Accuaracy  : 0.7947


### 7.2 Multinomial Naive Bayes :

In [64]:
from sklearn.naive_bayes import MultinomialNB

In [65]:
# Fit a Multinomial Naive Bayes classifier on the train split and predict both splits.
# NOTE(review): all encoded columns are passed to the model as-is, the numeric
# ones (age, capital_gain, ...) included; confirm that treating them as
# count-like features is intended.
multiNB_model = MultinomialNB()
multinomial_model = multiNB_model.fit(X_train, y_train)
multinomial_train_predict, multinomial_test_predict = (
    multinomial_model.predict(features) for features in (X_train, X_test)
)

In [66]:
# Share of correctly classified rows on each split.
multinomial_train_accuracy = (multinomial_train_predict == y_train).mean()
multinomial_test_accuracy = (multinomial_test_predict == y_test).mean()

# Report both accuracies, rounded to four decimals.
for label, accuracy in (('Multinomial Train Accuaracy :', multinomial_train_accuracy),
                        ('Multinomial Test Accuaracy  :', multinomial_test_accuracy)):
    print(label, round(accuracy, 4))

Multinomial Train Accuaracy : 0.7729
Multinomial Test Accuaracy  : 0.775


### 7.3 Bernoulli Naive Bayes :

In [67]:
from sklearn.naive_bayes import BernoulliNB

In [68]:
# Fit a Bernoulli Naive Bayes classifier on the train split and predict both splits.
# NOTE(review): the inputs here are multi-valued integer columns, while this
# variant is presumably meant for binary features; confirm how the default
# settings treat non-binary values before relying on this score.
bernoulliNB_model = BernoulliNB()
bernoulli_model = bernoulliNB_model.fit(X_train, y_train)
bernoulli_train_predict, bernoulli_test_predict = (
    bernoulli_model.predict(features) for features in (X_train, X_test)
)

In [70]:
# Share of correctly classified rows on each split.
bernoulli_train_accuracy = (bernoulli_train_predict == y_train).mean()
bernoulli_test_accuracy = (bernoulli_test_predict == y_test).mean()

# Report both accuracies, rounded to four decimals.
for label, accuracy in (('Bernoulli Train Accuaracy :', bernoulli_train_accuracy),
                        ('Bernoulli Test Accuaracy  :', bernoulli_test_accuracy)):
    print(label, round(accuracy, 4))

Bernoulli Train Accuaracy : 0.7256
Bernoulli Test Accuaracy  : 0.7284


### 7.4  Categorical Naive Bayes :

In [71]:
from sklearn.naive_bayes import CategoricalNB

In [73]:
# Fit a Categorical Naive Bayes classifier on the train split and predict both splits.
# NOTE(review): every column is handed over as a category index, the numeric
# ones (age, capital_gain, hours_per_week, ...) included; confirm this is
# intended, and that the test split holds no value larger than those seen in training.
categoricalNB_model = CategoricalNB()
categorical_model = categoricalNB_model.fit(X_train, y_train)
categorical_train_predict, categorical_test_predict = (
    categorical_model.predict(features) for features in (X_train, X_test)
)

In [75]:
# Share of correctly classified rows on each split.
categorical_train_accuracy = (categorical_train_predict == y_train).mean()
categorical_test_accuracy = (categorical_test_predict == y_test).mean()

# Report both accuracies, rounded to four decimals.
for label, accuracy in (('Categorical Train Accuaracy :', categorical_train_accuracy),
                        ('Categorical Test Accuaracy  :', categorical_test_accuracy)):
    print(label, round(accuracy, 4))

Categorical Train Accuaracy : 0.8571
Categorical Test Accuaracy  : 0.8566


### 7.5 Complement Naive Bayes :

In [78]:
from sklearn.naive_bayes import ComplementNB

In [80]:
# Fit a Complement Naive Bayes classifier on the train split and predict both splits.
complementNB_model = ComplementNB()
complement_model = complementNB_model.fit(X_train, y_train)
complement_train_predict, complement_test_predict = (
    complement_model.predict(features) for features in (X_train, X_test)
)

In [81]:
# Share of correctly classified rows on each split.
complement_train_accuracy = (complement_train_predict == y_train).mean()
complement_test_accuracy = (complement_test_predict == y_test).mean()

# Report both accuracies, rounded to four decimals.
for label, accuracy in (('Complement Train Accuaracy :', complement_train_accuracy),
                        ('Complement Test Accuaracy  :', complement_test_accuracy)):
    print(label, round(accuracy, 4))

Complement Train Accuaracy : 0.7729
Complement Test Accuaracy  : 0.775


## RESULTS :

In [85]:
# Summarise the train and test accuracy of every Naive Bayes variant as percentages.
#
# Each accuracy is scaled to a percentage BEFORE it is rounded. Rounding to four
# decimals first and multiplying by 100 afterwards can leave binary
# floating-point residue in the stored value, because the multiplication happens
# after the rounding; rounding last keeps every entry at two decimals.
data = {
    'Naive Bayes Models': ['Gaussian_NB', 'Multinomial_NB', 'Bernoulli_NB',
                           'Categorical_NB', 'Complement_NB'],
    'Train Accuracy_Scores': [
        round(score * 100, 2)
        for score in (gaussian_train_accuracy, multinomial_train_accuracy,
                      bernoulli_train_accuracy, categorical_train_accuracy,
                      complement_train_accuracy)
    ],
    'Test Accuracy_Scores': [
        round(score * 100, 2)
        for score in (gaussian_test_accuracy, multinomial_test_accuracy,
                      bernoulli_test_accuracy, categorical_test_accuracy,
                      complement_test_accuracy)
    ],
}
results = pd.DataFrame(data)
results

Unnamed: 0,Naive Bayes Models,Train Accuracy_Scores,Test Accuracy_Scores
0,Gaussian_NB,79.53,79.47
1,Multinomial_NB,77.29,77.5
2,Bernoulli_NB,72.56,72.84
3,Categorical_NB,85.71,85.66
4,Complement_NB,77.29,77.5


The best model is the **Categorical Naive Bayes**, with the following accuracy:

* **Train Model Accuracy :** 85.71 %
* **Test Model Accuracy  :** 85.66 %