### Description:
**In this notebook, we are going to predict whether a person's income is above 50k or below 50k using various features like age, education, and occupation.**

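**The dataset we are going to use is the Adult census income dataset, which is available from the UCI Machine Learning Repository. The full training file contains about 32561 rows and 15 columns; this notebook loads the accompanying `adult.test` split (16281 rows) and scores it with a previously trained model.**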

**The dataset contains the labels we have to predict, and those labels are discrete and binary, so this is a supervised binary classification problem.**

## Import required libraries


In [24]:
## Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")


# Show every column when a DataFrame is displayed (None removes the column cap).
# Uses the public `pd.set_option` entry point rather than reaching through
# `pd.pandas`, which is not a documented attribute of the package.
pd.set_option('display.max_columns', None)

In [25]:
# Load the Adult test split from the UCI repository.
# The first line of the file is skipped and no header row is read, so the
# column names are assigned explicitly afterwards.
col_labels = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'wage_class',
]

test_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
    skiprows=1,
    header=None,
)
test_set.columns = col_labels

test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [26]:
## Description of the DataFrame: column names, dtype/non-null report, and shape.

print("Variable present in the DataFrame", test_set.columns)
print("\n", 70*"=")
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own line; passing it to print() would emit the report first
# and then a stray "None" after the label.
print("Information about DataFrame: ")
test_set.info()
print("\n", 70*"=")
print("Shape of the DataFrame ROWS {a} and COLUMNS {b} ".format(a=test_set.shape[0], b=test_set.shape[1]))
print("\n", 70*"=")

Variable present in the DataFrame Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'wage_class'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             16281 non-null  int64 
 1   workclass       16281 non-null  object
 2   fnlwgt          16281 non-null  int64 
 3   education       16281 non-null  object
 4   education_num   16281 non-null  int64 
 5   marital_status  16281 non-null  object
 6   occupation      16281 non-null  object
 7   relationship    16281 non-null  object
 8   race            16281 non-null  object
 9   sex             16281 non-null  object
 10  capital_gain    16281 non-null  int64 
 11  capital_loss    16281 n

In [27]:
# Descriptive statistics (count, mean, std, quartiles, extrema) of the data
print("Summary about the dataset:")
numeric_summary = test_set.describe()
numeric_summary


Summary about the dataset:


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0
mean,38.767459,189435.7,10.072907,1081.905104,87.899269,40.392236
std,13.849187,105714.9,2.567545,7583.935968,403.105286,12.479332
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,116736.0,9.0,0.0,0.0,40.0
50%,37.0,177831.0,10.0,0.0,0.0,40.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0


In [28]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns.
test_set.describe(include=[object])

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,wage_class
count,16281,16281,16281,16281,16281,16281,16281,16281,16281
unique,9,16,7,15,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K.
freq,11210,5283,7403,2032,6523,13946,10860,14662,12435


In [29]:
# Count null (NaN) entries per column.
missing_per_column = test_set.isnull().sum()
print("Number of missing values each columns: ", "\n", missing_per_column)

# No NaN values are reported. Note that this check does not detect the "?"
# placeholder strings that appear in columns such as workclass and occupation.

Number of missing values each columns:  
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage_class        0
dtype: int64


In [30]:
# Show the textual education level next to its numeric counterpart.
test_set.loc[:, ["education", "education_num"]]

Unnamed: 0,education,education_num
0,11th,7
1,HS-grad,9
2,Assoc-acdm,12
3,Some-college,10
4,Some-college,10
...,...,...
16276,Bachelors,13
16277,HS-grad,9
16278,Bachelors,13
16279,Bachelors,13


In [31]:
# Drop the textual `education` column; it appears to carry the same information
# as the numeric `education_num` column shown alongside it.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
test_set = test_set.drop(columns=["education"], errors="ignore")

In [32]:
# Inspect the three columns in which the "?" placeholder shows up.
test_set.loc[:, ['occupation', 'workclass', 'native_country']]

Unnamed: 0,occupation,workclass,native_country
0,Machine-op-inspct,Private,United-States
1,Farming-fishing,Private,United-States
2,Protective-serv,Local-gov,United-States
3,Machine-op-inspct,Private,United-States
4,?,?,United-States
...,...,...,...
16276,Prof-specialty,Private,United-States
16277,?,?,United-States
16278,Prof-specialty,Private,United-States
16279,Adm-clerical,Private,United-States


In [33]:
# Replace the "?" placeholder with "unknown" in the affected columns.
# "?" is a regex metacharacter, and the default of `regex` differs between
# pandas versions, so the replacement is explicitly marked as a literal match.
for placeholder_column in ('occupation', 'workclass', 'native_country'):
    test_set[placeholder_column] = test_set[placeholder_column].str.replace(
        '?', 'unknown', regex=False
    )

# Feature Engineering

In [34]:
# Report how many distinct values each column holds.
for column_name, distinct_count in test_set.nunique().items():
    print(column_name, ':', distinct_count)

age : 73
workclass : 9
fnlwgt : 12787
education_num : 16
marital_status : 7
occupation : 15
relationship : 6
race : 5
sex : 2
capital_gain : 113
capital_loss : 82
hours_per_week : 89
native_country : 41
wage_class : 2


In [35]:
# Class balance of the target column before encoding.
raw_label_counts = test_set["wage_class"].value_counts()
raw_label_counts

 <=50K.    12435
 >50K.      3846
Name: wage_class, dtype: int64

In [36]:
from sklearn.preprocessing import LabelEncoder

# Kept only so that any later code referencing `label_encoder` keeps working;
# the loop below fits a separate encoder per column.
label_encoder = LabelEncoder()
columns = ['workclass', 'marital_status', 'occupation',
           'relationship', 'race', 'sex', 'native_country', "wage_class"]

# Encode each categorical column as integers and keep the fitted encoder, so the
# codes can be mapped back with `label_encoders[col].inverse_transform(...)` or
# inspected via `label_encoders[col].classes_`.
# NOTE(review): the encoders are fit on this frame; the resulting codes only
# line up with the ones the saved model was trained on if both sides saw the
# same set of categories — confirm against the training notebook.
label_encoders = {}
for column in columns:
    column_encoder = LabelEncoder()
    test_set[column] = column_encoder.fit_transform(test_set[column])
    label_encoders[column] = column_encoder


# An explicit replace/map with a fixed dictionary would also work here.
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,3,226802,7,4,6,3,2,1,0,0,40,37,0
1,38,3,89814,9,2,4,0,4,1,0,0,50,37,0
2,28,1,336951,12,2,10,0,4,1,0,0,40,37,1
3,44,3,160323,10,2,6,0,2,1,7688,0,40,37,1
4,18,8,103497,10,4,14,3,4,0,0,0,30,37,0


In [37]:
# Class balance of the target column after label encoding.
encoded_label_counts = test_set["wage_class"].value_counts()
encoded_label_counts

0    12435
1     3846
Name: wage_class, dtype: int64

In [38]:
# Keep only the chosen feature columns, followed by the target column.
# NOTE(review): the original note said these came from a feature-selection step
# picking the top 10 highest-scoring features, but 7 features are listed here —
# confirm against the training notebook.
selected_columns = [
    'education_num',
    'relationship',
    'age',
    'hours_per_week',
    'capital_gain',
    'sex',
    'marital_status',
    "wage_class",
]
new_data = test_set[selected_columns]
new_data.head()

Unnamed: 0,education_num,relationship,age,hours_per_week,capital_gain,sex,marital_status,wage_class
0,7,3,25,40,0,1,4,0
1,9,0,38,50,0,1,2,0
2,12,0,28,40,0,1,2,1
3,10,0,44,40,7688,1,2,1
4,10,3,18,30,0,0,4,0


In [39]:
# Separate the target column (y) from the remaining feature columns (X).
target_column = "wage_class"
y = new_data[target_column]
X = new_data.drop(target_column, axis=1)

In [40]:
# Load the previously saved (pickled) model from disk.
## import the modules:

from xgboost import XGBClassifier
import pickle
from pathlib import Path

# NOTE(review): hard-coded absolute local path; the notebook only runs on a
# machine where this file exists — consider a path relative to the notebook.
filepath = r'C:/Users/sony/A_MACHINE LEARNING AND DEEP LEARNING- INEURON/INEURON-PYTHON ASSIGNMENT/XGBOOST/XGboost_assignment_final/XGBOOST_Classifier_pickle.pckl'

# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
# The context manager guarantees the file handle is closed after loading.
with open(filepath, 'rb') as model_file:
    xgb_model = pickle.load(model_file)
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.4,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.1, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=1, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [41]:
# Run the loaded model on the feature matrix X to get predicted class labels.
# NOTE(review): presumably X must have the same columns, order and encoding the
# model was trained with — confirm against the training notebook.
y_pred= xgb_model.predict(X)

In [42]:
# Wrap the predictions in a one-column frame and count each predicted class.
df = pd.DataFrame({"wage_class_new": y_pred})
df["wage_class_new"].value_counts()

0    11217
1     5064
Name: wage_class_new, dtype: int64