In [1]:
import joblib
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, Normalizer
from sklearn.metrics import fbeta_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv("../data/clean.csv")

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

cont_features = [
    "age",
    "fnlgt",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week"
]

In [5]:
y = data["salary"]
X_categorical = data[cat_features].values
X_continuous = data[cont_features].values


## Data preprocessing

In [6]:
encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
lb = LabelBinarizer()
normalizer = Normalizer()

X_categorical = encoder.fit_transform(X_categorical)

X_continuous = normalizer.fit_transform(X_continuous)

y = lb.fit_transform(y.values).ravel()

X = np.concatenate([X_continuous, X_categorical], axis=1)

In [7]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

## Baseline model

In [8]:
model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)
model.fit(train_X, train_y)

#model = LogisticRegression(solver='lbfgs', max_iter=1000)
#model.fit(train_X, train_y)

RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)

In [9]:
preds = model.predict(val_X)

In [10]:
def model_scores(val_y, preds):

    fbeta = fbeta_score(val_y, preds, beta=1, zero_division=1)
    precision = precision_score(val_y, preds, zero_division=1)
    recall = recall_score(val_y, preds, zero_division=1)

    return fbeta, precision, recall

In [11]:
fbeta, precision, recall = model_scores(val_y, preds)

print("Fbeta score: ", fbeta)
print("Precision: ", precision)
print("Recall: ", recall)

Fbeta score:  0.35942028985507246
Precision:  0.889344262295082
Recall:  0.2252205500778412


## Hyperparameter tunning

In [12]:
param_grid = { 
    'n_estimators': [10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 5, 6, 7, 8],
    'criterion' :['gini', 'entropy']
}

model = RandomForestClassifier()

grid_search = GridSearchCV(model, param_grid, cv=5,
)

grid_search.fit(train_X, train_y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 20, 30]})

In [13]:
best_params = grid_search.best_params_

In [14]:
best_params

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 20}

In [15]:
try:
    joblib.dump(
            grid_search.best_estimator_,
            "../model/" +
            'model.pkl')

except Exception as err:
    print("Error while saving model")

In [16]:
model_loaded = joblib.load("../model/" +
            'model.pkl')

preds = model_loaded.predict(val_X)

fbeta, precision, recall = model_scores(val_y, preds)

print("Fbeta score: ", fbeta)
print("Precision: ", precision)
print("Recall: ", recall)

Fbeta score:  0.581532416502947
Precision:  0.7879325643300799
Recall:  0.46081992734820965


## Create dataset for testing

In [19]:
test = pd.read_csv("../data/clean.csv")

In [20]:
test_df = test.sample(frac=0.2)

In [22]:
test_df.drop("salary", axis=1, inplace=True)

In [24]:
test_df.to_csv("../data/test.csv", index=False)