In [9]:
import pickle
from sklearn.metrics import fbeta_score, precision_score, recall_score
from data import process_data
import pandas as pd
import numpy as np
import sys
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Read the census dataset from the repo-relative data directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/census.csv')
df.head(n=5)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [11]:
# Split helper used for the hold-out step at the end of this cell.
from sklearn.model_selection import train_test_split

# Columns handed to process_data as the categorical features.
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Run the project preprocessing in training mode, with `salary` as the label column.
X_processed, y, encoder, lb = process_data(
    df, categorical_features=categorical_features, label='salary', training=True
)

# Hold out 20% of the processed rows for evaluation; the seed is fixed so the
# split is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, random_state=42, test_size=0.2
)

In [12]:
# Optional: implement hyperparameter tuning.
def train_model(X_train, y_train, *, n_estimators=100, random_state=39, n_jobs=-1):
    """
    Trains a random forest machine learning model and returns it.

    The forest's hyperparameters are exposed as keyword-only arguments so that
    callers (e.g. a tuning loop) can vary them; the defaults reproduce the
    previously hard-coded configuration.

    Inputs
    ------
    X_train : np.array
        Training data.
    y_train : np.array
        Labels.
    n_estimators : int, optional
        Forwarded to RandomForestClassifier(n_estimators=...). Default 100.
    random_state : int or None, optional
        Forwarded to RandomForestClassifier(random_state=...). Default 39.
    n_jobs : int or None, optional
        Forwarded to RandomForestClassifier(n_jobs=...). Default -1.
    Returns
    -------
    model : RandomForestClassifier
        The classifier after fitting on (X_train, y_train).
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    model.fit(X_train, y_train)
    return model

In [13]:
# Fit the classifier on the training split.
print("Training model...")
model = train_model(X_train, y_train)
print(f"Model trained! Type: {type(model)}")

# Sanity check: fraction of held-out rows whose predicted label equals the true label.
predictions = model.predict(X_test)
accuracy = np.mean(predictions == y_test)
print(f"Test accuracy: {accuracy:.3f}")

Training model...
Model trained! Type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Test accuracy: 0.859


In [15]:
def inference(model, X):
    """Produce predictions for ``X`` using an already-trained model.

    Inputs
    ------
    model : object exposing ``predict``
        Trained machine learning model; only its ``predict`` method is used.
    X : np.array
        Data used for prediction.
    Returns
    -------
    preds : np.array
        The value returned by ``model.predict(X)``.
    """
    return model.predict(X)

In [16]:
def save_model(model, path):
    """Pickle ``model`` and write it to ``path``.

    Inputs
    ------
    model
        Object to serialize, e.g. a trained machine learning model or a
        OneHotEncoder.
    path : str
        Destination of the pickle file. The file is opened in binary write
        mode, so existing content at that path is replaced.
    """
    with open(path, mode="wb") as handle:
        pickle.dump(model, handle)