In [16]:
from ml.data import process_data
from ml.model import train_model, compute_model_metrics, inference
from sklearn.model_selection import train_test_split
from joblib import dump, load

# Add the necessary imports for the starter code.
import os
import sys
import pandas as pd

file_dir = os.path.dirname(os.path.abspath("__file__"))
sys.path.insert(0, file_dir)

In [17]:
data = pd.read_csv(file_dir + '/../data/clean_census.csv')

In [18]:
data.columns

Index(['age', 'workclass', 'fnlgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [35]:
data.head(5)
data['workclass'].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [20]:
train, test = train_test_split(data, test_size=0.20)

In [21]:
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [22]:
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)

In [23]:
X_test, y_test, encoder, lb = process_data(
    test, categorical_features=cat_features, label="salary", training=False, encoder=encoder, lb=lb
)

In [24]:
rf_model = train_model(X_train, y_train)

In [25]:
# from sklearn.utils import estimator_checks

# estimator_checks.check_estimator(rf_model)
rf_model.__class__.__name__

'RandomForestClassifier'

In [26]:
y= [1, 1, 0, 0]
predicts =  [0, 1,1, 0]
precision, recall, fbeta = compute_model_metrics(y, predicts)
print(precision, recall, fbeta)

0.5 0.5 0.5


In [27]:
train, test = train_test_split(data, test_size=0.20)

X, y, _, _ = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)
assert len(X) == len(y)

In [28]:
train, test = train_test_split(data, test_size=0.20)
from sklearn.ensemble import RandomForestClassifier
X, y, _, _ = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)
model = train_model(X, y)
# Check if this is a classification model
assert isinstance(model, RandomForestClassifier)

In [29]:
y = [1, 1, 0, 0]
predicts = [0, 1, 1, 0]
precision, recall, fbeta = compute_model_metrics(y, predicts)
print(precision, recall, fbeta)
assert abs(precision - 0.5) < 0.01 and abs(recall -
                                            0.5) < 0.01 and abs(fbeta - 0.5) < 0.01

0.5 0.5 0.5


In [30]:
X, y, _, _ = process_data(
        data, categorical_features=cat_features, label="salary", training=True
    )
model = train_model(X, y)
pred = inference(model, X)
# Check if pred.shape is similar to y.shape
assert y.shape == pred.shape

In [31]:
y_pred = inference(model, X_test)
precision, recall, fbeta = compute_model_metrics(y_test, y_pred)
print(
    f"Precision: {precision: .1f}, Recall: {recall: .1f}, Fbeta: {fbeta: .1f}")

Precision:  1.0, Recall:  1.0, Fbeta:  1.0


In [36]:
def slice_census(cat_features):
    file_dir = os.path.dirname(os.path.abspath("__file__"))
    sys.path.insert(0, file_dir)

    data = pd.read_csv(file_dir + "/../data/clean_census.csv")
    model = joblib.load(file_dir + "/../model/census_rfmodel.pkl")
    encoder = joblib.load(file_dir + "/../model/census_encoder.pkl")
    lb = joblib.load(file_dir + "/../model/census_lb.pkl")

    os.makedirs(file_dir + "/../slice_metrics", exist_ok=True)
    flie = open(file_dir + "/../slice_metrics/slice_output.txt", "w")
    for slice_feature in cat_features:
        for elem in data[slice_feature].unique():
            data_temp = data[data[slice_feature] == elem]
            X, y, _, _ = process_data(
                data_temp,
                categorical_features=cat_features,
                label="salary",
                training=False,
                encoder=encoder,
                lb=lb,
            )
            predicts = inference(model, X)
            precision, recall, fbeta = compute_model_metrics(y, predicts)
            txt_line = f"{slice_feature} - {elem}: Precision: {precision: .2f}. Recall: {recall: .2f}. Fbeta: {fbeta: .2f}\n"
            flie.write(txt_line)
    flie.close()

In [37]:
slice_census(cat_features)