# Assignment is below at the end

- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [226]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [227]:
df = pd.read_csv('../data/adult.data', index_col=False)

In [228]:
golden = pd.read_csv('../data/adult.test', index_col=False)

# For the following use the above `adult` dataset. 

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and calculate `precision`, `recall`, `f1`, `confusion matrix` on golden-test set. Start with only numerical features/columns. (age, education-num, capital-gain, capital-loss, hours-per-week) 

In [229]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

In [1]:
import sklearn

sklearn.__version__

'1.0.2'

In [230]:
# IMPORT DATA
# df = training set; golden = test set
df = pd.read_csv('../data/adult.data', index_col=False)
golden = pd.read_csv('../data/adult.test', index_col=False)

In [231]:
# Inititate models and encoders
enc = preprocessing.OrdinalEncoder()
onehot = preprocessing.OneHotEncoder(handle_unknown="error", sparse=False)
rf_classifier = RandomForestClassifier(criterion='entropy')
dt_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=3)

In [232]:
# ID categorical variables; and Transform 'sex' to 0 or 1
transform_columns = ['sex']
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']
# testing
pd.get_dummies(df[transform_columns]).head()

Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [233]:
# TRAINING SET: Remove categorical variables and encode salary
x = df.copy()


#x = pd.concat([x.drop(non_num_columns, axis=1), pd.get_dummies(df[transform_columns])], axis=1,)

x = x.drop(non_num_columns, axis=1)

x["salary"] = enc.fit_transform(df[["salary"]])

In [234]:
x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,39,77516,13,2174,0,40,0.0
1,50,83311,13,0,0,13,0.0
2,38,215646,9,0,0,40,0.0
3,53,234721,7,0,0,40,0.0
4,28,338409,13,0,0,40,0.0


In [235]:
# TESTING SET: Remove categorical variables and encode salary
xt = golden.copy()

#xt = pd.concat([xt.drop(non_num_columns, axis=1), pd.get_dummies(golden[transform_columns])], axis=1,)

xt = xt.drop(non_num_columns, axis=1)

xt["salary"] = enc.fit_transform(golden[["salary"]])

In [236]:
# Fit the models: Decision Tree & Random Forest
dt_classifier.fit(x.drop(['salary','fnlwgt'], axis=1), x.salary)
rf_classifier.fit(x.drop(['salary','fnlwgt'], axis=1), x.salary)

RandomForestClassifier(criterion='entropy')

In [237]:
# Predictions
pred_dt = dt_classifier.predict(xt.drop(['salary','fnlwgt'], axis=1))
pred_rf = rf_classifier.predict(xt.drop(['salary','fnlwgt'], axis=1))

In [238]:
# Evalute Decision Tree vs. Random Forest on TEST set

print("Decision Tree Accuracy Score:",accuracy_score(xt.salary, pred_dt))
print("Random Tree Accuracy Score:",accuracy_score(xt.salary, pred_rf))

Decision Tree Accuracy Score: 0.8031447699772741
Random Tree Accuracy Score: 0.8224924758921442


In [239]:
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(xt.salary, pred_dt))
print("Random Tree Confusion Matrix:")
print(confusion_matrix(xt.salary, pred_rf))

Decision Tree Confusion Matrix:
[[12428     7]
 [ 3198   648]]
Random Tree Confusion Matrix:
[[11559   876]
 [ 2014  1832]]


In [240]:
print("Decision Tree Classification report:")
print(classification_report(xt.salary, pred_dt))
print("Random Tree Classification report:")
print(classification_report(xt.salary, pred_rf))

Decision Tree Classification report:
              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     12435
         1.0       0.99      0.17      0.29      3846

    accuracy                           0.80     16281
   macro avg       0.89      0.58      0.59     16281
weighted avg       0.84      0.80      0.74     16281

Random Tree Classification report:
              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.68      0.48      0.56      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.70      0.72     16281
weighted avg       0.81      0.82      0.81     16281



# 2. Use a RandomForest or DecisionTree and the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Using the golden-test set show [`precision`, `recall`, `f1`, `confusion matrix`] for each additional feature added.

In [241]:
# Columns we want to transform
transform_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']


#Columns we can't use because non-numerical
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

## Check for discrepancies between training and test datasets, then fix

In [259]:
# TRAINING SET

x = df.copy()

#transformed = pd.get_dummies(x[transform_columns])

#onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False).fit(df[transform_columns])

onehot = preprocessing.OneHotEncoder(drop = 'first', handle_unknown="ignore", sparse=False).fit(x[transform_columns])

enc = preprocessing.OrdinalEncoder()

enc.fit(x[["salary"]])

transformed = onehot.transform(x[transform_columns])
#new_cols = list(onehot.categories_[0].flatten())
# Flatten all arrays in onehot.categories_
new_cols = [category for categories in onehot.categories_ for category in categories[1:]]
x_trans = pd.DataFrame(transformed, columns=new_cols)

x = pd.concat(
    [
        x.drop(non_num_columns, axis=1), 
        x_trans
    ], 
    axis=1,)

x["salary"] = enc.transform(x[["salary"]])


In [260]:
# TEST SET

xt = golden.copy()

#xt_transformed = pd.get_dummies(xt[transform_columns])
#onehot = preprocessing.OneHotEncoder(handle_unknown="infrequent_if_exist", sparse=False).fit(df[transform_columns])

xt_onehot = preprocessing.OneHotEncoder(drop = 'first', handle_unknown="ignore", sparse=False).fit(xt[transform_columns])

xt_enc = preprocessing.OrdinalEncoder()

xt_enc.fit(xt[["salary"]])

xt_transformed = xt_onehot.transform(xt[transform_columns])
#new_cols = list(onehot.categories_[0].flatten())
# Flatten all arrays in onehot.categories_
xt_new_cols = [category for categories in xt_onehot.categories_ for category in categories[1:]]
xt_trans = pd.DataFrame(xt_transformed, columns=xt_new_cols)


xt = pd.concat(
    [
        xt.drop(non_num_columns, axis=1), 
        xt_trans
    ], 
    axis=1,)

xt["salary"] = xt_enc.transform(xt[["salary"]])

In [278]:
x = df.copy()
#x['native-country'].unique()
(x['native-country'] == ' Holand-Netherlands').sum()

1

In [244]:
x.shape, xt.shape

((32561, 101), (16281, 100))

In [245]:
set(x.columns)-set(xt.columns)

{' Holand-Netherlands'}

## Loop through the Training/Test dataset to add 1 categorical feature for each iteration, format, Fit, Transform, Predict and Evaluate the Models: Decision Tree & Random Forest

In [277]:
# Initialize models
dt_classifier = DecisionTreeClassifier()
rf_classifier = RandomForestClassifier()

# Columns to transform and the non-numerical columns
transform_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
non_num_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# TRAINING SET
x = df.copy() 
x = x[x['native-country'] != ' Holand-Netherlands'].reset_index(drop=True) # remove Holand since it doesnt exist in test set
x['salary'] = x['salary'].str.replace('.', '').str.strip()  # Clean/standardize format of 'salary' 

# TEST SET
xt = golden.copy()  
xt['salary'] = xt['salary'].str.replace('.', '').str.strip()  

# Encode target variable
enc = OrdinalEncoder()
x["salary_encoded"] = enc.fit_transform(x[["salary"]])
xt["salary_encoded"] = enc.transform(xt[["salary"]])

# Fit OneHotEncoder on all categorical columns
onehot = OneHotEncoder(drop='first', handle_unknown="ignore", sparse=False) 
onehot.fit(pd.concat([x[transform_columns], xt[transform_columns]], axis=0))  # Combine to ensure all categories are captured

# Transform all categorical features at once for both sets
x_transformed = pd.DataFrame(onehot.transform(x[transform_columns]), columns=onehot.get_feature_names_out(), index=x.index)
xt_transformed = pd.DataFrame(onehot.transform(xt[transform_columns]), columns=onehot.get_feature_names_out(), index=xt.index)

# Iterate over the number of categorical variables, adding one at a time
for i in range(1, len(transform_columns) + 1):
    # Determine the original columns to include up to this iteration
    included_columns = transform_columns[:i]

    # Find the corresponding new columns from the transformation
    included_new_cols = [col for col in x_transformed.columns if col.split('_')[0] in included_columns]

    # Construct the final datasets for this iteration
    x_final = pd.concat([x.drop(non_num_columns + ['salary', 'fnlwgt'], axis=1), x_transformed[included_new_cols]], axis=1)
    xt_final = pd.concat([xt.drop(non_num_columns + ['salary', 'fnlwgt'], axis=1), xt_transformed[included_new_cols]], axis=1)

    # Fit the models
    dt_classifier.fit(x_final.drop(['salary_encoded'], axis=1), x_final['salary_encoded'])
    rf_classifier.fit(x_final.drop(['salary_encoded'], axis=1), x_final['salary_encoded'])

    # Make predictions
    pred_dt = dt_classifier.predict(xt_final.drop(['salary_encoded'], axis=1))
    pred_rf = rf_classifier.predict(xt_final.drop(['salary_encoded'], axis=1))

    # Evaluate and print the results
    print(f"Iteration {i}: Using numerical columns and {included_columns}")
    print("Decision Tree Accuracy Score:", accuracy_score(xt_final['salary_encoded'], pred_dt))
    print("Random Forest Accuracy Score:", accuracy_score(xt_final['salary_encoded'], pred_rf))
    print("\nDecision Tree Confusion Matrix:")
    print(confusion_matrix(xt_final['salary_encoded'], pred_dt))
    print("Random Forest Confusion Matrix:")
    print(confusion_matrix(xt_final['salary_encoded'], pred_rf))
    print("\nDecision Tree Classification Report:")
    print(classification_report(xt_final['salary_encoded'], pred_dt))
    print("Random Forest Classification Report:")
    print(classification_report(xt_final['salary_encoded'], pred_rf))
    print("--------------------------------------------------\n")


Iteration 1: Using numerical columns and ['workclass']
Decision Tree Accuracy Score: 0.8113752226521712
Random Forest Accuracy Score: 0.8202198882132548

Decision Tree Confusion Matrix:
[[11378  1057]
 [ 2014  1832]]
Random Forest Confusion Matrix:
[[11468   967]
 [ 1960  1886]]

Decision Tree Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88     12435
         1.0       0.63      0.48      0.54      3846

    accuracy                           0.81     16281
   macro avg       0.74      0.70      0.71     16281
weighted avg       0.80      0.81      0.80     16281

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.92      0.89     12435
         1.0       0.66      0.49      0.56      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.72     16281
weighted avg       0.81      0.82      0.81   