In [6]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import joblib

In [None]:
# Download the "adult" CSV straight from GitHub into a DataFrame.
# skipinitialspace=True makes the parser skip spaces that follow each delimiter.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
    skipinitialspace=True,
)

In [19]:
# Feature columns: everything in the frame except the "income" target.
x_cols = [name for name in df.columns if name != "income"]

# Split the frame into model inputs (X) and the label to predict (y).
X = df[x_cols]
y = df["income"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1024,
)

In [31]:
# Preview the first five training rows (raw values, before imputation/encoding).
X_train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
13683,32,Private,295282,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,48,United-States
11378,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
25351,55,Private,189528,5th-6th,3,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
13465,29,Private,116662,Bachelors,13,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
20227,40,,65545,Masters,14,Divorced,,Own-child,White,Female,0,0,55,United-States


In [32]:
# Summarise dtypes and non-null counts per column. In the output below,
# workclass, occupation and native-country have fewer non-null entries than
# the 22792 rows, i.e. they contain missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22792 entries, 13683 to 6075
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             22792 non-null  int64 
 1   workclass       21521 non-null  object
 2   fnlwgt          22792 non-null  int64 
 3   education       22792 non-null  object
 4   education-num   22792 non-null  int64 
 5   marital-status  22792 non-null  object
 6   occupation      21518 non-null  object
 7   relationship    22792 non-null  object
 8   race            22792 non-null  object
 9   sex             22792 non-null  object
 10  capital-gain    22792 non-null  int64 
 11  capital-loss    22792 non-null  int64 
 12  hours-per-week  22792 non-null  int64 
 13  native-country  22375 non-null  object
dtypes: int64(6), object(8)
memory usage: 2.6+ MB


In [34]:
# Per-column most frequent value, computed on the training split only.
# mode() may return several rows when a column has ties; row 0 holds one
# mode for every column, so it is used as the fill value.
train_mode = dict(
    X_train
    .mode()
    .iloc[0]
)

# Replace missing entries with the column's mode (column name -> fill value).
X_train = X_train.fillna(value=train_mode)

In [57]:
# Fit one LabelEncoder per string-valued column and replace that column with
# its integer codes. The fitted encoders are kept (keyed by column name) so the
# same mapping can be re-applied to new data later.
#
# The first value of each column is read positionally with .iloc[0]. The
# previous `X_train[col][0]` was a label lookup, and after the shuffled
# train/test split the index label 0 is not guaranteed to be present in
# X_train, which raises KeyError.
#
# NOTE: this cell overwrites X_train's string columns with integers, so running
# it a second time on the same X_train finds no string columns and resets
# `encoders` to an empty dict. Re-run from the split cell instead.
encoders = {}
for column in [col for col in X_train.columns if isinstance(X_train[col].iloc[0], str)]:
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = categorical_convert

In [59]:
# Train the Random Forest classifier on the imputed, label-encoded training set.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore the saved model file, is identical across re-runs.
rf = RandomForestClassifier(n_estimators=100, random_state=1024)
rf = rf.fit(X_train, y_train)

In [60]:
# Train the Extra Trees classifier on the imputed, label-encoded training set.
# random_state is fixed (same seed as the train/test split) so that the fitted
# ensemble, and therefore the saved model file, is identical across re-runs.
et = ExtraTreesClassifier(n_estimators=100, random_state=1024)
et = et.fit(X_train, y_train)

# Save the pre-processing objects and the RF & ET models.
# Everything is written to the current working directory with joblib
# compression enabled:
#   train_mode -> per-column fill values used for missing data
#   encoders   -> fitted LabelEncoder per string column
#   rf / et    -> the two trained classifiers
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)