In [93]:
import warnings 
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/adult-census-income/adult.csv


# Task 3

Building a **decision tree classifier** to predict whether an **individual's income** is above or below **$50,000 per year**.

## Import and Preview Dataset

In [77]:
df = pd.read_csv("/kaggle/input/adult-census-income/adult.csv")

In [78]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


## Data Cleaning

In [79]:
df[df == '?'] = np.nan
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [80]:
for col in ['workclass', 'occupation', 'native.country']:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [81]:
df = df.drop_duplicates()

In [82]:
# Specify the columns to handle outliers
numeric_cols = ['age', 'education.num', 'capital.gain', 'hours.per.week']

# Define your outlier handling strategy for each column
outlier_handling = {
    'age': 'winsorize',
    'education.num': 'transform',
    'capital.gain': 'remove',
    'hours.per.week': 'winsorize'
}

In [83]:
# Loop over the specified columns and apply the corresponding outlier handling strategy
for col in numeric_cols:
    strategy = outlier_handling[col]

    if strategy == 'remove':
        # Remove rows with outliers
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    elif strategy == 'transform':
        # Apply a transformation (e.g., log transformation) to reduce the impact of outliers
        df[col] = np.log1p(df[col])
    elif strategy == 'winsorize':
        # Apply winsorization to cap extreme values
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
        df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

### Checking the Datatypes

In [84]:
df.dtypes

age               float64
workclass          object
fnlwgt              int64
education          object
education.num     float64
marital.status     object
occupation         object
relationship       object
race               object
sex                object
capital.gain        int64
capital.loss        int64
hours.per.week    float64
native.country     object
income             object
dtype: object

### Converting Datatypes

In [85]:
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})

# Convert age and education.num to integer data type
df['age'] = df['age'].astype(int)
df['education.num'] = df['education.num'].astype(int)
df['hours.per.week'] = df['hours.per.week'].astype(int)

# Convert categorical variables into numeric representations using one-hot encoding
categorical_cols = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

### Create training and testing set

In [86]:
# Separate features (X) and target variable (y)
X = df.drop('income', axis=1)
y = df['income']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Building the Decision Tree Classifier

In [87]:
# Create a Decision Tree Classifier with hyperparameter tuning
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

### Finding the best hyperparameters

In [88]:
clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [89]:
best_clf = grid_search.best_estimator_

By using techniques such as **hyperparameter tuning**, **cross-validation**, and **feature selection**, we aimed to build an efficient and robust model.

In [90]:
best_clf.fit(X_train, y_train)

In [91]:
y_pred = best_clf.predict(X_test)

### Evaluation of the Model

In [97]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.837384744341995


An **accuracy of 83%** was achieved using the **Decision Tree** Classifier. 

### Classification and Confusion Matrix

In [98]:
# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      4790
           1       0.61      0.48      0.54      1175

    accuracy                           0.84      5965
   macro avg       0.74      0.70      0.72      5965
weighted avg       0.83      0.84      0.83      5965



In [99]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[4428  362]
 [ 608  567]]


In conclusion, the **Decision Tree Classifier** proved to be a valuable tool for predicting income based on the **Adult Census Income dataset**. By leveraging the power of decision nodes and splits, the model demonstrated a promising ability to discern **patterns and relationships** within the data, enabling it to make **accurate predictions**.

The Decision Tree classifier successfully categorized individuals into two groups: earning more than 50K dollars per year and those **earning less than or equal to 50K dollars per year**.The achieved **accuracy of 83%** showcases the model's ability to make reliable predictions. However, as with any predictive model, there are always opportunities for further improvement. We can explore alternative algorithms, such as **Random Forest or Gradient Boosting**, to potentially enhance predictive accuracy beyond the current level.