In [386]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder , LabelEncoder
from sklearn.metrics import classification_report , accuracy_score

In [387]:
# Read the census-income train and test CSVs from the working directory.
data_train, data_test = (
    pd.read_csv(csv_name)
    for csv_name in ("Census_income_train.csv", "Census_income_test.csv")
)

In [388]:
def drop_unknown_rows(frame, columns=('Workclass', 'Occupation', 'Native-country'), marker='?'):
    """Return ``frame`` without the rows whose ``columns`` contain ``marker``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw data; every column named in ``columns`` must support the ``.str`` accessor.
    columns : iterable of str
        Columns in which ``marker`` flags a value to be discarded.
    marker : str
        Literal substring (not a regular expression) that marks a row for removal.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, re-indexed from 0.
    """
    keep = pd.Series(True, index=frame.index)
    for column in columns:
        # regex=False matches the marker literally, so no '\?' escape is needed
        # (an invalid escape sequence in a normal string literal).
        # na=True treats missing values like marked ones, so those rows are dropped --
        # the same rows the former `contains(...) == False` comparison discarded.
        has_marker = frame[column].str.contains(marker, regex=False, na=True).astype(bool)
        keep &= ~has_marker
    return frame[keep].reset_index(drop=True)


# Apply the same cleaning rule to both splits.
clean_data_train = drop_unknown_rows(data_train)
clean_data_test = drop_unknown_rows(data_test)

In [389]:
# Remove every literal '.' from the test split's Income labels
# (presumably so they match the spelling used in the train split -- confirm against the raw files).
income_without_periods = clean_data_test['Income'].str.replace('.', '', regex=False)
clean_data_test['Income'] = income_without_periods

In [390]:
# Stack the cleaned train and test rows into one frame with a fresh 0..n-1 index.
cleaned_splits = [clean_data_train, clean_data_test]
clean_data = pd.concat(objs=cleaned_splits, ignore_index=True)

In [391]:
# Show the combined frame as the cell output (pandas abbreviates the rendering of long frames).
clean_data

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45216,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
45217,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
45218,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
45219,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [392]:
# The last column is the target; every column before it is a feature.
feature_count = clean_data.shape[1] - 1
inputs = clean_data.iloc[:, :feature_count]
target = clean_data.iloc[:, feature_count]

# Seeded, shuffled 80/20 split of the combined data.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    shuffle=True,
    random_state=365,
    test_size=0.2,
)

In [393]:
# Ordinal-encode every object-dtype column; all remaining columns pass through unchanged.
categorical_columns = selector(dtype_include='object')
x_encoder = ColumnTransformer(
    transformers=[('label', OrdinalEncoder(), categorical_columns)],
    remainder='passthrough',
)

# Learn the category mapping on the training split only, then reuse it for the test split.
x_transformed_train = x_encoder.fit_transform(x_train)
x_transformed_test = x_encoder.transform(x_test)

# Separate encoder instance for the target labels.
label = LabelEncoder()

In [394]:
def _encoded_column_order(frame):
    """List ``frame``'s object-dtype column names first, then the names of all other columns."""
    object_columns = selector(dtype_include='object')(frame)
    other_columns = list(selector(dtype_exclude='object')(frame))
    return object_columns + other_columns


# Wrap the encoded arrays back into labelled DataFrames.
x_train_new = pd.DataFrame(x_transformed_train, columns=_encoded_column_order(x_train))
x_test_new = pd.DataFrame(x_transformed_test, columns=_encoded_column_order(x_test))

# Fit the label mapping on the training targets, then apply it to the test targets.
y_train_new, y_test_new = label.fit_transform(y_train), label.transform(y_test)

In [395]:
# Single decision tree with cost-complexity pruning (ccp_alpha=0.001).
# random_state is pinned to the same seed used for the split and the random forest:
# the tree permutes features at each split, so without a seed equally good splits
# may be chosen differently from run to run and the reported scores would not reproduce.
cls = DecisionTreeClassifier(ccp_alpha=0.001, random_state=365)
cls.fit(x_train_new, y_train_new)

In [396]:
# Score the decision tree on the held-out split.
predicted_cls = cls.predict(x_test_new)
# Report on the predictions computed above. The previous code passed `predicted`,
# a name this notebook never defines, so the cell only ran on leftover kernel state.
print(classification_report(y_test_new, predicted_cls, zero_division=0))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90      6704
           1       0.80      0.55      0.65      2341

    accuracy                           0.85      9045
   macro avg       0.83      0.75      0.78      9045
weighted avg       0.84      0.85      0.84      9045



In [397]:
# Seeded forest of 150 trees, each pruned with a small cost-complexity penalty.
forest_params = {'n_estimators': 150, 'ccp_alpha': 0.0001, 'random_state': 365}
rfc = RandomForestClassifier(**forest_params)
rfc.fit(X=x_train_new, y=y_train_new)

In [398]:
# Score the random forest on the held-out split and print its per-class metrics.
predicted_rfc = rfc.predict(x_test_new)
rfc_report = classification_report(y_test_new, predicted_rfc)
print(rfc_report)

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      6704
           1       0.80      0.62      0.70      2341

    accuracy                           0.86      9045
   macro avg       0.84      0.78      0.80      9045
weighted avg       0.86      0.86      0.86      9045

