In [2]:
from sklearn.model_selection import train_test_split

# Add the necessary imports for the starter code.
import pandas as pd
from model import train_model, compute_model_metrics, inference
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
import numpy as np

In [3]:
DATA_PATH = '../data/census.csv'

def load_data(path):
    """
    Load data
    Inputs
    ------
    path :  str
            data path
    Returns
    -------
    df : cleaned pd.dataframe
         dataframe
    """
    df =  pd.read_csv(path, skipinitialspace=True)
    clean_df = (df.replace("?", None).dropna())
    return clean_df

def process_data(
    X, categorical_features=[], label=None, training=True
):
    """ Process the data that will be used in the pipeline.
    Inputs
    ------
    X : pd.DataFrame
        Dataframe
    categorical_features: list[str]
        List categorical features
    label : str
        Label column
        for y (default=None)
    training : bool
        Indicator if process is in training mode or inference mode
    Returns
    -------
    X : np.array
        Processed data
    y : np.array
        Processed labels
    encoder : OneHotEncoder
        Trained OneHotEncoder
    lb : LabelBinarizer
        Trained LabelBinarizer
    """
    encoder = None
    lb = None
    if label:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = None
    X_categorical = X[categorical_features].values
    
    X_continuous = X.drop(*[categorical_features], axis=1)
    
    if training:
        encoder = OneHotEncoder(handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical).toarray()
        y = lb.fit_transform(y.values).reshape(-1)
    else:
        X_categorical = encoder.transform(X_categorical)
        try:
            y = lb.transform(y.values).reshape(-1)
        except:
            print("error")
    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb

data = load_data(DATA_PATH)

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
train, test = train_test_split(data, test_size=0.20)

In [26]:
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)

  (0, 2)	1.0
  (0, 22)	1.0
  (0, 25)	1.0
  (0, 35)	1.0
  (0, 44)	1.0
  (0, 54)	1.0
  (0, 56)	1.0
  (0, 95)	1.0
  (1, 2)	1.0
  (1, 16)	1.0
  (1, 27)	1.0
  (1, 39)	1.0
  (1, 45)	1.0
  (1, 54)	1.0
  (1, 56)	1.0
  (1, 95)	1.0
  (2, 2)	1.0
  (2, 11)	1.0
  (2, 27)	1.0
  (2, 36)	1.0
  (2, 48)	1.0
  (2, 54)	1.0
  (2, 55)	1.0
  (2, 82)	1.0
  (3, 2)	1.0
  :	:
  (24125, 95)	1.0
  (24126, 2)	1.0
  (24126, 22)	1.0
  (24126, 25)	1.0
  (24126, 41)	1.0
  (24126, 44)	1.0
  (24126, 54)	1.0
  (24126, 56)	1.0
  (24126, 95)	1.0
  (24127, 2)	1.0
  (24127, 18)	1.0
  (24127, 23)	1.0
  (24127, 33)	1.0
  (24127, 45)	1.0
  (24127, 54)	1.0
  (24127, 55)	1.0
  (24127, 95)	1.0
  (24128, 2)	1.0
  (24128, 18)	1.0
  (24128, 23)	1.0
  (24128, 35)	1.0
  (24128, 45)	1.0
  (24128, 52)	1.0
  (24128, 56)	1.0
  (24128, 95)	1.0
(24129, 6)


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 0 dimension(s)

In [24]:
y_train

array([1, 0, 0, ..., 0, 0, 0])