In [1]:
# -----------------------------------------------------------------
# Decision Tree Classifier
# Predict the income of an adult based on the census data
# -----------------------------------------------------------------

# Import libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier



In [4]:
# Read dataset
adul = pd.read_csv('decisiontreeAdultIncome.csv')

In [5]:
# Check for Null values
adul.isnull().sum()

age               0
wc                0
education         0
marital status    0
race              0
gender            0
hours per week    0
IncomeClass       0
dtype: int64

In [8]:
# Create Dummy variables
adul = pd.get_dummies(adul, drop_first=True)
adul.head()


Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Never-worked,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male,IncomeClass_ >50K
0,38,40,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,28,40,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,37,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
3,31,50,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
4,42,40,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1


In [9]:
adul.columns

Index(['age', 'hours per week', 'wc_ Local-gov', 'wc_ Never-worked',
       'wc_ Private', 'education_ Doctorate', 'education_ HS-grad',
       'education_ Masters', 'education_ Preschool', 'education_ Prof-school',
       'education_ Some-college', 'marital status_ Never-married',
       'marital status_ Widowed', 'marital status_Married',
       'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White',
       'gender_ Male', 'IncomeClass_ >50K'],
      dtype='object')

In [14]:
# Create X and Y Variables
Y = adul['IncomeClass_ >50K']
X= adul.drop(['IncomeClass_ >50K'], axis=1)

In [15]:
# Split the X and Y dataset into training and testing set
# Split the X and Y dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1234, stratify=Y)

In [18]:
# Import and train classifier
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()
# Train Decision Tree Classifer
clf = clf.fit(X_train,Y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [17]:
# Test the model


In [20]:
# Evaluate the model
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
print("Classification Report for Decision Tree:""\n",metrics.classification_report(Y_test, y_pred))


Accuracy: 0.7714333838639044
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85      4373
           1       0.58      0.49      0.53      1564

    accuracy                           0.77      5937
   macro avg       0.70      0.68      0.69      5937
weighted avg       0.76      0.77      0.77      5937



# Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
lr= LogisticRegression()
lr.fit(X_train, Y_train)

preds = lr.predict(X_test)
preds

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [24]:

from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(Y_test, preds)
cm

array([[4004,  369],
       [ 753,  811]], dtype=int64)

In [25]:

score = lr.score(X_test, Y_test)
score

0.8110156644770086

In [26]:

cr  = classification_report(Y_test, preds)
print(cr)

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      4373
           1       0.69      0.52      0.59      1564

    accuracy                           0.81      5937
   macro avg       0.76      0.72      0.73      5937
weighted avg       0.80      0.81      0.80      5937



In [27]:
score = lr.score(X_test, Y_test)
score

0.8110156644770086

# SVM

In [30]:

#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, Y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [31]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.8052888664308573


In [32]:

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(Y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(Y_test, y_pred))

Precision: 0.7090163934426229
Recall: 0.4424552429667519


In [33]:
print("Classification Report for SVM:""\n",classification_report(Y_test, y_pred))


Classification Report for SVM:
               precision    recall  f1-score   support

           0       0.82      0.94      0.88      4373
           1       0.71      0.44      0.54      1564

    accuracy                           0.81      5937
   macro avg       0.77      0.69      0.71      5937
weighted avg       0.79      0.81      0.79      5937



In [34]:

cm = confusion_matrix(Y_test, y_pred)
cm

array([[4089,  284],
       [ 872,  692]], dtype=int64)

# Naive Bayes

In [35]:
from sklearn.naive_bayes import GaussianNB
NBmodel = GaussianNB()
NBmodel.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [36]:

predicted = NBmodel.predict(X_test)
predicted

array([1, 0, 0, ..., 1, 0, 0], dtype=uint8)

In [37]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [38]:

cm2 = confusion_matrix(Y_test,predicted)
score2 = accuracy_score(Y_test,predicted)
cr2 = classification_report(Y_test,predicted)
auc2 = roc_auc_score(Y_test,predicted)

In [39]:
print('Confusion Matrix for Naive Bayes:''\n',cm2 )
print('Accuracy Score for Naive Bayes:',score2 )
print('Classification Report for Naive Bayes:''\n''\n',cr2)
print('Auc Score for Naive Bayes:',auc2 )


Confusion Matrix for Naive Bayes:
 [[2173 2200]
 [  70 1494]]
Accuracy Score for Naive Bayes: 0.617652012801078
Classification Report for Naive Bayes:

               precision    recall  f1-score   support

           0       0.97      0.50      0.66      4373
           1       0.40      0.96      0.57      1564

    accuracy                           0.62      5937
   macro avg       0.69      0.73      0.61      5937
weighted avg       0.82      0.62      0.63      5937

Auc Score for Naive Bayes: 0.7260779206044063
