# Train and Deploy an Income Predictor Model Using Flask

You are working for a government agency and have been tasked with building and deploying a predictive model using historical census data. The objective of this model is to predict, from a person's personal information, whether their salary is more likely to be over or under 50K.

In [25]:
import joblib
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the raw census data; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer='../Dataset/phpMawTba.csv')
# Display the loaded frame as the cell output.
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,Male,0,0,20,United-States,<=50K


In [9]:
# Features are every column except the last one; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)
# Report the shape of each piece, in the same order as the unpacking above.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(34189, 13)
(14653, 13)
(34189,)
(14653,)


In [32]:
# Persist the held-out split to disk so it can be reloaded outside this notebook.
# joblib picks the compression method from the '.gz' filename extension.
joblib.dump(value=X_test, filename='X_test.gz')
joblib.dump(value=y_test, filename='y_test.gz')

['y_test.gz']

### Extract the list of categories for each categorical column.

In [12]:
# Inspect the column dtypes: the `object` columns are the categorical features.
X_train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [15]:
# Categorical features are the string-valued (object dtype) columns.
# The 'object' alias replaces `np.object`, which was deprecated in NumPy 1.20 and
# removed in NumPy 1.24 (where it raises AttributeError). 'category' is included
# as well so that re-running this cell after the conversion still selects the
# same columns instead of producing an empty list.
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
# Convert all categorical columns in one astype call. This returns a new frame
# rather than assigning column-by-column into the frame produced by the split,
# which may trigger pandas' SettingWithCopyWarning.
X_train = X_train.astype({col: 'category' for col in cat_cols})
X_train.dtypes

age                  int64
workclass         category
fnlwgt               int64
education         category
education-num        int64
marital-status    category
occupation        category
relationship      category
sex               category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country    category
dtype: object

In [22]:
# Map each categorical column name to the categories found in the training data.
column_categories = {
    name: X_train[name].cat.categories
    for name in cat_cols
}
# Save the mapping to disk so it can be reloaded outside this notebook.
joblib.dump(value=column_categories, filename='column_categories.gz')

['column_categories.gz']

In [24]:
# One-hot encode the categorical columns: each one is replaced by one indicator
# column per category, named "<column>_<category>".
X_train = pd.get_dummies(data=X_train, columns=cat_cols)
# Display the encoded training frame as the cell output.
X_train

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
7916,18,152508,7,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
26447,55,136819,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
20889,43,191149,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
30145,44,241851,11,4386,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
7473,41,369781,9,0,0,55,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7763,42,37997,13,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
15377,65,326936,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
17730,44,229466,10,0,0,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
28030,35,265954,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
# train model
# A random forest is stochastic (bootstrap sampling and random feature
# selection), so the seed is fixed to make the fitted model reproducible across
# Restart & Run All. 123 matches the seed used for the train/test split.
model = RandomForestClassifier(random_state=123)
model.fit(X_train, y_train)

RandomForestClassifier()

In [30]:
# Persist the fitted classifier to disk.
# joblib picks the compression method from the '.gz' filename extension.
joblib.dump(value=model, filename='model.gz')

['model.gz']