#### Import Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#### Import Data:

In [2]:
df = pd.read_csv("adult.csv")
print(df.shape)
df.head()

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Check for NaN:

In [3]:
df.isna().any()

age                False
workclass          False
fnlwgt             False
education          False
educational-num    False
marital-status     False
occupation         False
relationship       False
race               False
gender             False
capital-gain       False
capital-loss       False
hours-per-week     False
native-country     False
income             False
dtype: bool

#### Handle Question Marks:

In [4]:
for col in df.columns:
    df[col] = df[col].replace("?", np.NaN)

df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Normalization

In [5]:
df.replace(['Divorced', 'Married-AF-spouse', 
              'Married-civ-spouse', 'Married-spouse-absent', 
              'Never-married','Separated','Widowed'],
             ['divorced','married','married','married',
              'not married','not married','not married'], inplace = True)

#### LabelEncoder Function:

In [6]:
category_col =['workclass', 'race', 'education','marital-status', 'occupation','relationship', 'gender', 'native-country', 'income'] 
labelEncoder = preprocessing.LabelEncoder()

#### Map Data:

In [7]:
mapping_dict={}
for col in df.columns:
    df[col] = labelEncoder.fit_transform(df[col])
    le_name_mapping = dict(zip(labelEncoder.classes_, labelEncoder.transform(labelEncoder.classes_)))
    mapping_dict[col]=le_name_mapping
print(mapping_dict)

{'age': {17: 0, 18: 1, 19: 2, 20: 3, 21: 4, 22: 5, 23: 6, 24: 7, 25: 8, 26: 9, 27: 10, 28: 11, 29: 12, 30: 13, 31: 14, 32: 15, 33: 16, 34: 17, 35: 18, 36: 19, 37: 20, 38: 21, 39: 22, 40: 23, 41: 24, 42: 25, 43: 26, 44: 27, 45: 28, 46: 29, 47: 30, 48: 31, 49: 32, 50: 33, 51: 34, 52: 35, 53: 36, 54: 37, 55: 38, 56: 39, 57: 40, 58: 41, 59: 42, 60: 43, 61: 44, 62: 45, 63: 46, 64: 47, 65: 48, 66: 49, 67: 50, 68: 51, 69: 52, 70: 53, 71: 54, 72: 55, 73: 56, 74: 57, 75: 58, 76: 59, 77: 60, 78: 61, 79: 62, 80: 63, 81: 64, 82: 65, 83: 66, 84: 67, 85: 68, 86: 69, 87: 70, 88: 71, 90: 72}, 'workclass': {' ?': 0, ' Federal-gov': 1, ' Local-gov': 2, ' Never-worked': 3, ' Private': 4, ' Self-emp-inc': 5, ' Self-emp-not-inc': 6, ' State-gov': 7, ' Without-pay': 8}, 'fnlwgt': {12285: 0, 13769: 1, 14878: 2, 18827: 3, 19214: 4, 19302: 5, 19395: 6, 19410: 7, 19491: 8, 19520: 9, 19700: 10, 19752: 11, 19793: 12, 19847: 13, 19899: 14, 19914: 15, 20057: 16, 20098: 17, 20101: 18, 20109: 19, 20179: 20, 20296: 21

#### Drop Unneeded Columns:

In [8]:
df=df.drop(['fnlwgt','educational-num'], axis=1)


#### Split X and y:

In [9]:
X = df.values[:, 0:12]
Y = df.values[:,12]

#### Split Train & Test Data:

In [10]:
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)

#### Train a Decision Tree Classifier:

In [12]:
dtc = DecisionTreeClassifier(criterion = "gini", 
                                     random_state = 100,
                                     max_depth=5, 
                                     min_samples_leaf=5)

dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      7428
           1       0.73      0.47      0.57      2341

    accuracy                           0.83      9769
   macro avg       0.79      0.71      0.73      9769
weighted avg       0.82      0.83      0.82      9769



#### Simple Hyperparameter Tuning using GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

parameters = {
              'criterion':('gini', 'entropy'), 
              'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'max_leaf_nodes':[2, 3, 4, 5, 6, 7, 8, 9, 10],
              }

clf = GridSearchCV(dtc, parameters)

clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=5,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=5,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=100,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ('gini', 'entropy'),
                    

In [14]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_criterion',
 'param_max_depth',
 'param_max_leaf_nodes',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [15]:
clf.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=7, max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')

#### Confirm Classification Report using new Parameters:

In [18]:
dtc = DecisionTreeClassifier(criterion = "gini", 
                                     random_state = 100,
                                     max_depth=7, 
                                     min_samples_leaf=5)

dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      7428
           1       0.78      0.54      0.64      2341

    accuracy                           0.85      9769
   macro avg       0.82      0.74      0.77      9769
weighted avg       0.85      0.85      0.84      9769



#### Save Model using Pickle:

In [19]:
import pickle
pickle.dump(dtc, open("DTCmodel.pkl","wb"))