# Name: Amey Dhote 
# Email: ameydhotesocialclub@gmail.com

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
%matplotlib inline

In [2]:
df = pd.read_csv('adult.csv')

In [3]:
df.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
# Renaming columns 

df.columns = ['Age', 'Workclass',' Fnlwgt', 'Education', 
              'education_num', 'marital_status', 'occupation', 'relationship','race',
             'sex','capital_gain','capital_loss','hours_per_week','native_country','income']

In [5]:
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [6]:
# No null values 

df.isna().sum() 

Age               0
Workclass         0
 Fnlwgt           0
Education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [7]:
type(df['income'][0])

str

In [8]:
df.dtypes
# we have to convert all the object or string values into float

Age                int64
Workclass         object
 Fnlwgt            int64
Education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
le = LabelEncoder()

In [11]:
objects_columns = ["Workclass","Education","marital_status","occupation",
                   "relationship","race","sex","native_country","income"]

In [12]:
for column in objects_columns:
    df[column] = le.fit_transform(df[column])

In [13]:
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
1,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
2,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
3,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
4,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0


In [14]:
df.dtypes

Age               int64
Workclass         int32
 Fnlwgt           int64
Education         int32
education_num     int64
marital_status    int32
occupation        int32
relationship      int32
race              int32
sex               int32
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native_country    int32
income            int32
dtype: object

In [15]:
df['income']
# 0 means income <=50K
# 1 means income >50K

0        0
1        0
2        0
3        0
4        0
        ..
32555    0
32556    1
32557    0
32558    0
32559    1
Name: income, Length: 32560, dtype: int32

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X = df.drop(['income'], axis=1)
y = df['income']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# a. Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

In [21]:
dtc = DecisionTreeClassifier()

In [22]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [23]:
dtc_predictions = dtc.predict(X_test)

In [24]:
from sklearn.metrics import confusion_matrix, classification_report

In [25]:
print(confusion_matrix(y_test, dtc_predictions))
print('\n')
print(classification_report(y_test, dtc_predictions))

[[7035 1081]
 [1030 1599]]


              precision    recall  f1-score   support

           0       0.87      0.87      0.87      8116
           1       0.60      0.61      0.60      2629

    accuracy                           0.80     10745
   macro avg       0.73      0.74      0.74     10745
weighted avg       0.80      0.80      0.80     10745



### Decision Tree Misclassification percentage 

missclassification rate = 1 - Accuracy 
<br>
                          1 - 0.80 = 0.20
<br>
                            20%


# b. Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rfc = RandomForestClassifier()

In [28]:
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [29]:
rfc_predictions = rfc.predict(X_test)

In [30]:
print(confusion_matrix(y_test, rfc_predictions))
print('\n')
print(classification_report(y_test, rfc_predictions))

[[7575  541]
 [ 997 1632]]


              precision    recall  f1-score   support

           0       0.88      0.93      0.91      8116
           1       0.75      0.62      0.68      2629

    accuracy                           0.86     10745
   macro avg       0.82      0.78      0.79     10745
weighted avg       0.85      0.86      0.85     10745



###  Random Forest Classifier Misclassification percentage 

missclassification rate = 1 - Accuracy 
<br>
                          1 - 0.86 = 0.14
<br>
                            14%

# c. Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
lr = LogisticRegression()

In [33]:
lr.fit(X_train, y_train)

LogisticRegression()

In [34]:
lr_predictions = lr.predict(X_test)

In [35]:
print(confusion_matrix(y_test, lr_predictions))
print('\n')
print(classification_report(y_test, lr_predictions))

[[7632  484]
 [1437 1192]]


              precision    recall  f1-score   support

           0       0.84      0.94      0.89      8116
           1       0.71      0.45      0.55      2629

    accuracy                           0.82     10745
   macro avg       0.78      0.70      0.72     10745
weighted avg       0.81      0.82      0.81     10745



### Logistic Regression Misclassification percentage 

missclassification rate = 1 - Accuracy 
<br>
                          1 - 0.82 = 0.18
<br>                           18%
                          

# d. KNN Classifier

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
knn = KNeighborsClassifier(n_neighbors=39)

In [38]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=39)

In [39]:
knn_predictions = knn.predict(X_test)

In [40]:
print(confusion_matrix(y_test, knn_predictions))
print('\n')
print(classification_report(y_test, knn_predictions))

[[7519  597]
 [1189 1440]]


              precision    recall  f1-score   support

           0       0.86      0.93      0.89      8116
           1       0.71      0.55      0.62      2629

    accuracy                           0.83     10745
   macro avg       0.79      0.74      0.76     10745
weighted avg       0.83      0.83      0.83     10745



### KNN Classifier Misclassification percentage 

missclassification rate = 1 - Accuracy 
<br>
                          1 - 0.83 = 0.17
<br>                            17%

# e. SVC Classifier (with linear kernel)

In [41]:
from sklearn.svm import SVC  

In [42]:
svc = SVC(kernel='linear') 

In [43]:
svc.fit(X_train, y_train)

SVC(kernel='linear')

In [44]:
svc_predictions = svc.predict(X_test)

In [45]:
print(confusion_matrix(y_test, svc_predictions))
print('\n')
print(classification_report(y_test, svc_predictions))

[[7923  193]
 [1785  844]]


              precision    recall  f1-score   support

           0       0.82      0.98      0.89      8116
           1       0.81      0.32      0.46      2629

    accuracy                           0.82     10745
   macro avg       0.82      0.65      0.67     10745
weighted avg       0.82      0.82      0.78     10745



### SVC Classifier (with linear kernel) Misclassification percentage 

missclassification rate = 1 - Accuracy
<br>
                         1 - 0.82 = 0.18                        
                         18%

 # Model with best accuracy
 
 Accuracies:<br>
 
  Decision Tree = 80% <br>
  Random Forest Classifier = 86% <br>
  Logistic Regression = 82% <br>
  KNN Classifier = 83% <br>
  SVC Classifier (with linear kernel) = 82% <br>
  
  ## Therefore, "Random Forest Classifier" is the best performing model with an accuracy of 86% and 0.91 or 91%       f1-score for person making over $50k a year.

  