# "Adult" dataset
Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.
https://archive.ics.uci.edu/dataset/2/adult

### Loading the dataset
We also create a DiCE data object, for which we need to specify whether each feature is continuous or discrete so that perturbations can be generated later.

In [3]:
import dice_ml
from dice_ml.utils import helpers
import pandas as pd
# Load the Adult income data through DiCE's helper module and preview the
# first rows to check the available columns.
dataset = helpers.load_adult_income_dataset()
dataset.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,White,Female,60,0
1,30,Self-Employed,Assoc,Married,Professional,White,Male,65,1
2,32,Private,Some-college,Married,White-Collar,White,Male,50,0
3,20,Private,Some-college,Single,Service,White,Female,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,White,Male,50,0


In [4]:
# Description of the transformed features: a dict mapping each column name to
# a short explanation of its meaning (and the category values where listed).
adult_info = helpers.get_adult_data_info()
adult_info

{'age': 'age',
 'workclass': 'type of industry (Government, Other/Unknown, Private, Self-Employed)',
 'education': 'education level (Assoc, Bachelors, Doctorate, HS-grad, Masters, Prof-school, School, Some-college)',
 'marital_status': 'marital status (Divorced, Married, Separated, Single, Widowed)',
 'occupation': 'occupation (Blue-Collar, Other/Unknown, Professional, Sales, Service, White-Collar)',
 'race': 'white or other race?',
 'gender': 'male or female?',
 'hours_per_week': 'total work hours per week',
 'income': '0 (<=50K) vs 1 (>50K)'}

# Train a Model

In [5]:
import random
from sklearn.model_selection import train_test_split

# Seed Python's built-in RNG. Note that the split below is made reproducible
# by its own `random_state` argument, independently of this seed.
random.seed(42)

# Stratified 80/20 split on the outcome column. The full frame (including
# 'income') is split so that the train/test frames keep the target alongside
# the features.
target = dataset["income"]
train_dataset, test_dataset, y_train, y_test = train_test_split(
    dataset, target, test_size=0.2, random_state=0, stratify=target
)

# Feature-only frames used to fit and evaluate the model.
x_train = train_dataset.drop(columns='income')
x_test = test_dataset.drop(columns='income')


In [6]:
# Step 1: dice_ml.Data
# Wrap the training frame (features + outcome) and tell DiCE which columns are
# continuous; the outcome column is identified by name.
d = dice_ml.Data(
    dataframe=train_dataset,
    continuous_features=['age', 'hours_per_week'],
    outcome_name='income',
)

In [7]:
from pathlib import Path

# Make sure the output folder exists before anything is written into it.
directory = Path("./data")
directory.mkdir(parents=True, exist_ok=True)

# Persist both splits as CSV, leaving the DataFrame index out of the files.
for csv_path, split_frame in (
    ('./data/adult_train_dataset.csv', train_dataset),
    ('./data/adult_test_dataset.csv', test_dataset),
):
    split_frame.to_csv(csv_path, index=False)

### Building and training the model

In [8]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Continuous columns are used as-is; every other feature column is treated as
# categorical and one-hot encoded.
numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

# Categories unseen during fit are encoded as all-zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# ColumnTransformer drops any column that is not listed in `transformers`
# (remainder='drop' is the default), so the numerical columns must be listed
# explicitly; otherwise the model would never see 'age' or 'hours_per_week'.
# Tree ensembles do not need feature scaling, hence 'passthrough'.
transformations = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical),
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# A fixed random_state makes the fitted forest reproducible across runs.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=42))])
model = clf.fit(x_train, y_train)
model

In [9]:
import pickle
from pathlib import Path

# Ensure the folder that will hold serialized models exists.
directory = Path("./models")
directory.mkdir(parents=True, exist_ok=True)

# Serialize the fitted pipeline to disk; pickle requires a binary-mode handle.
with open('./models/loan_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

# Create a Dataset to Test
We will only use individuals from the test dataset whom the model predicts to be in the negative class (income <=50K).

In [11]:
# Reload the held-out split from disk and score it with the trained pipeline
# (the outcome column is removed before predicting).
test_df = pd.read_csv('./data/adult_test_dataset.csv')
predicted_class = model.predict(test_df.drop(columns=['income']))

# Keep only the rows the model assigns to class 0 and save them; the
# predictions themselves are not written to the file.
test_df = test_df[predicted_class == 0]
test_df.to_csv('./data/test_examples.csv', index=False)