In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
# Column labels are supplied by hand; because `names` is given, the first
# line of the file is read as data rather than as a header row.
column_names = [
    'Age', 'Workclass', 'Fnlwgt', 'Education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]
df = pd.read_csv('adult.csv', names=column_names)
# Preview the first five rows.
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
# (rows, columns) of the loaded frame.
df.shape

(32561, 15)

In [3]:
# Number of missing (NaN/None) entries in each column.
# NOTE(review): only real nulls are counted here; placeholder strings such as
# '?' would not show up — confirm the raw file contains none.
df.isnull().sum()

Age               0
Workclass         0
Fnlwgt            0
Education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [4]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

object    9
int64     6
dtype: int64

In [5]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
df.duplicated().sum()

24

In [6]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# This mutates `df` in place; the surviving rows keep their original index labels.
df.drop_duplicates(inplace=True)

In [7]:
# Per-column dtypes; the `object` columns are the ones that still hold strings.
df.dtypes

Age                int64
Workclass         object
Fnlwgt             int64
Education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [8]:
# Integer-encode every string column (the target `income` included).
# A separate LabelEncoder is fitted per column and kept in `encoders`, so the
# integer codes can later be mapped back to the original labels with
# `encoders[col].inverse_transform(...)` or inspected via `encoders[col].classes_`.
categorical_columns = [
    'Workclass', 'Education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country', 'income',
]
encoders = {}
for col in categorical_columns:
    # `lb` is rebound on each pass; after the loop it is the encoder fitted on
    # the last column ('income').
    lb = LabelEncoder()
    df[col] = lb.fit_transform(df[col])
    encoders[col] = lb

In [9]:
# Re-check the dtypes: every column should now be an integer type.
df.dtypes

Age               int64
Workclass         int32
Fnlwgt            int64
Education         int32
education_num     int64
marital_status    int32
occupation        int32
relationship      int32
race              int32
sex               int32
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native_country    int32
income            int32
dtype: object

In [10]:
# The last column is the target; every column before it is a feature.
y = df.iloc[:, -1]
x = df.iloc[:, :-1]
# Report the shape of the feature matrix, then of the target vector.
for part in (x, y):
    print(part.shape)

(32537, 14)
(32537,)


In [11]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same rows land in each split on every run.
splits = train_test_split(x, y, test_size=0.20, random_state=32)
x_train, x_test, y_train, y_test = splits
# Shapes are printed in the order: x_train, x_test, y_train, y_test.
for part in splits:
    print(part.shape)

(26029, 14)
(6508, 14)
(26029,)
(6508,)


In [12]:
def mscore(model):
    """Print a fitted model's score on the training split and on the test split.

    The score is whatever ``model.score`` reports. The splits are read from
    the notebook-level ``x_train``/``y_train`` and ``x_test``/``y_test``.
    """
    labelled_splits = (
        ('Training Score', x_train, y_train),
        ('Testing Score', x_test, y_test),
    )
    for label, features, target in labelled_splits:
        print(label, model.score(features, target))

def gen_matrix(y_test, y_pred, positive=0):
    """Print the confusion matrix, scikit-learn's report and hand-derived metrics.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels for the same samples.
    positive : int, default 0
        Row/column index (in the confusion matrix) of the class that the
        hand-derived precision, recall and F1 describe. The default keeps the
        historical behaviour: the metrics refer to the class at index 0, which
        is the first label in sorted order. Pass ``positive=1`` to report the
        second class instead.

    Returns
    -------
    None
        Everything is printed.

    Notes
    -----
    A ratio whose denominator is zero is reported as 0, in line with the
    ``zero_division=0`` given to ``classification_report``.
    """
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(classification_report(y_test, y_pred, zero_division=0))
    print('Accuracy Score', accuracy_score(y_test, y_pred))

    def _ratio(numerator, denominator):
        # Avoid nan/inf (and the accompanying warning) when a class is never
        # predicted or never present.
        return numerator / denominator if denominator else 0.0

    total = cm.sum()
    correct = cm.trace()                       # diagonal = correctly classified samples
    hits = cm[positive][positive]
    predicted_as_positive = cm[:, positive].sum()  # column sum: hits + false positives
    actually_positive = cm[positive].sum()         # row sum: hits + false negatives

    p = _ratio(hits, predicted_as_positive)
    r = _ratio(hits, actually_positive)
    print('Precision as calculated from the Confusion Matrix is: ', p)
    print('Recall as calculated from the Confusion Matrix is: ', r)
    print('F1-score as calculated from the Confusion Matrix is: ', _ratio(2 * p * r, p + r))
    print('Accuracy as calculated from the Confusion Matrix is: ', _ratio(correct, total))
    # The '%' is appended to the value itself so the line still ends with a
    # newline and later output does not run onto it.
    misclassified_pct = _ratio((total - correct) * 100, total)
    print('Percentage of misclassification: ', str(misclassified_pct) + '%')

**1)DECISION TREE CLASSIFIER**

In [13]:
#1) Decision Tree Classifier
# Entropy-based tree, depth capped at 11, and a node needs at least 50 samples to be split.
# random_state is fixed (same seed as the train/test split) so that re-running
# the cell reproduces the same tree and the same scores.
m1 = DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_split=50, random_state=32)
m1.fit(x_train, y_train)
mscore(m1)
ypred_m1 = m1.predict(x_test)
gen_matrix(y_test, ypred_m1)

Training Score 0.8662645510776442
Testing Score 0.8529502151198525
[[4552  390]
 [ 567  999]]
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      4942
           1       0.72      0.64      0.68      1566

    accuracy                           0.85      6508
   macro avg       0.80      0.78      0.79      6508
weighted avg       0.85      0.85      0.85      6508

Accuracy Score 0.8529502151198525
Precision as calculated from the Confusion Matrix is:  0.8892361789411994
Recall as calculated from the Confusion Matrix is:  0.9210845811412384
F1-score as calculated from the Confusion Matrix is:  0.9048802305933804
Accuracy as calculated from the Confusion Matrix is:  0.8529502151198525
Percentage of misclassification:  14.704978488014751%

**2)RANDOM FOREST CLASSIFIER**

In [14]:
#2)Random Forest Classifier
# 500 entropy-based trees with the same depth / split limits as the single tree.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature subsets, and therefore the scores, are reproducible.
m2 = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=11, min_samples_split=50, random_state=32)
m2.fit(x_train, y_train)
mscore(m2)
ypred_m2 = m2.predict(x_test)
gen_matrix(y_test, ypred_m2)

Training Score 0.8636136616850436
Testing Score 0.864320835894284
[[4749  193]
 [ 690  876]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      4942
           1       0.82      0.56      0.66      1566

    accuracy                           0.86      6508
   macro avg       0.85      0.76      0.79      6508
weighted avg       0.86      0.86      0.85      6508

Accuracy Score 0.864320835894284
Precision as calculated from the Confusion Matrix is:  0.873138444567016
Recall as calculated from the Confusion Matrix is:  0.9609469850263052
F1-score as calculated from the Confusion Matrix is:  0.9149407571524901
Accuracy as calculated from the Confusion Matrix is:  0.864320835894284
Percentage of misclassification:  13.567916410571604%

**3)KNN CLASSIFIER**

In [15]:
#3)KNN Classifier
# Each prediction is a vote among the 25 nearest training rows.
# NOTE(review): the features are used unscaled, so large-magnitude columns
# (e.g. Fnlwgt) presumably dominate the distance — consider scaling.
m3 = KNeighborsClassifier(n_neighbors=25).fit(x_train, y_train)
mscore(m3)
# Predict the held-out rows, then print the evaluation summary.
ypred_m3 = m3.predict(x_test)
gen_matrix(y_test, ypred_m3)


Training Score 0.8020285066656422
Testing Score 0.8017824216349109
[[4897   45]
 [1245  321]]
              precision    recall  f1-score   support

           0       0.80      0.99      0.88      4942
           1       0.88      0.20      0.33      1566

    accuracy                           0.80      6508
   macro avg       0.84      0.60      0.61      6508
weighted avg       0.82      0.80      0.75      6508

Accuracy Score 0.8017824216349109
Precision as calculated from the Confusion Matrix is:  0.7972972972972973
Recall as calculated from the Confusion Matrix is:  0.990894374747066
F1-score as calculated from the Confusion Matrix is:  0.8836160230963552
Accuracy as calculated from the Confusion Matrix is:  0.8017824216349109
Percentage of misclassification:  19.82175783650891%

**4)LOGISTIC REGRESSION**

In [16]:
#4)Logistic Regression
# liblinear solver with up to 1000 iterations.
m4 = LogisticRegression(max_iter=1000, solver='liblinear')
# Fit once: calling fit again on the same data only retrains from scratch.
m4.fit(x_train, y_train)
mscore(m4)
ypred_m4 = m4.predict(x_test)
gen_matrix(y_test, ypred_m4)


Training Score 0.7920396480848284
Testing Score 0.7947141979102643
[[4688  254]
 [1082  484]]
              precision    recall  f1-score   support

           0       0.81      0.95      0.88      4942
           1       0.66      0.31      0.42      1566

    accuracy                           0.79      6508
   macro avg       0.73      0.63      0.65      6508
weighted avg       0.77      0.79      0.77      6508

Accuracy Score 0.7947141979102643
Precision as calculated from the Confusion Matrix is:  0.8124783362218371
Recall as calculated from the Confusion Matrix is:  0.9486038041278835
F1-score as calculated from the Confusion Matrix is:  0.8752800597460791
Accuracy as calculated from the Confusion Matrix is:  0.7947141979102643
Percentage of misclassification:  20.528580208973572%

**5)SVM CLASSIFIER**

In [17]:
#5)SVM Classifier
# Support vector classifier with a linear kernel.
m5 = SVC(kernel='linear')
# Fit once: a second fit on the same data would only repeat the (expensive)
# training without changing the model.
m5.fit(x_train, y_train)
mscore(m5)
ypred_m5 = m5.predict(x_test)
gen_matrix(y_test, ypred_m5)

Training Score 0.7961120288908525
Testing Score 0.7968653964351567
[[4729  213]
 [1109  457]]
              precision    recall  f1-score   support

           0       0.81      0.96      0.88      4942
           1       0.68      0.29      0.41      1566

    accuracy                           0.80      6508
   macro avg       0.75      0.62      0.64      6508
weighted avg       0.78      0.80      0.76      6508

Accuracy Score 0.7968653964351567
Precision as calculated from the Confusion Matrix is:  0.8100376841384036
Recall as calculated from the Confusion Matrix is:  0.9569000404694455
F1-score as calculated from the Confusion Matrix is:  0.8773654916512059
Accuracy as calculated from the Confusion Matrix is:  0.7968653964351567
Percentage of misclassification:  20.313460356484327%

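The model with the best test accuracy is the **Random Forest Classifier** (about 86.4% in the run shown above), followed by the Decision Tree (about 85.3%); KNN, Logistic Regression and the linear SVM all score roughly 80%.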