## Import packages

In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery

## Explore data

In [2]:
query="""
SELECT
    weight_pounds
    , is_male
    , mother_age
    , plurality
    , gestation_weeks
FROM
    publicdata.samples.natality
WHERE
    year > 2000
LIMIT
    10000
"""
df = bigquery.Client().query(query).to_dataframe()
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,6.68662,True,18,1,43.0
1,9.360828,True,32,1,41.0
2,8.437091,False,30,1,39.0
3,6.124442,False,24,1,40.0
4,7.12534,False,26,1,41.0


In [3]:
df.describe()

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks
count,9989.0,10000.0,10000.0,9890.0
mean,7.297602,27.2989,1.0344,38.699798
std,1.291685,6.165838,0.192926,2.539957
min,0.612885,12.0,1.0,17.0
25%,6.624891,22.0,1.0,38.0
50%,7.374463,27.0,1.0,39.0
75%,8.124034,32.0,1.0,40.0
max,12.257702,50.0,3.0,47.0


In [4]:
df['is_male'].value_counts()

True     5150
False    4850
Name: is_male, dtype: int64

## Pre process data

In [5]:
df = df.dropna()
df = shuffle(df, random_state=2)

### Extract label

In [6]:
label = 'weight_pounds'
labels  = df[label]
data = df.drop(columns=[label])

In [7]:
data.head()

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
39,True,32,1,41.0
6132,False,28,1,30.0
5986,False,44,1,38.0
7682,False,34,1,38.0
4910,True,31,1,40.0


### Convert categorical features

NOTE: not even bool types the model accepts.

In [8]:
data['is_male'] = data['is_male'].astype(int)

data.head()

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
39,1,32,1,41.0
6132,0,28,1,30.0
5986,0,44,1,38.0
7682,0,34,1,38.0
4910,1,31,1,40.0


### Split train and test data sets

In [9]:
x, y = data, labels
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [10]:
x_train.head()

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
4485,1,25,1,40.0
6657,1,21,1,40.0
5019,0,20,1,38.0
8836,1,30,1,38.0
6596,0,35,1,41.0


## Train model

### Define and train model

In [11]:
model = xgb.XGBRegressor(objective='reg:squarederror')

model.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [12]:
# Evalute model

y_pred = model.predict(x_test)

for i in range(20):
    print(f'Predicted weight: {y_pred[i]}')
    print(f'Actual weight: {y_test.iloc[i]}\n')

Predicted weight: 7.374887943267822
Actual weight: 6.8122838958

Predicted weight: 5.8887200355529785
Actual weight: 7.165023515

Predicted weight: 7.053300857543945
Actual weight: 6.0009827716399995

Predicted weight: 7.3420023918151855
Actual weight: 7.68751907594

Predicted weight: 7.754918575286865
Actual weight: 7.7492485093

Predicted weight: 7.2223052978515625
Actual weight: 8.000575487979999

Predicted weight: 7.939842224121094
Actual weight: 6.5477291814

Predicted weight: 7.7925286293029785
Actual weight: 7.5067400211

Predicted weight: 8.225926399230957
Actual weight: 7.31273323054

Predicted weight: 7.237839698791504
Actual weight: 7.50012615324

Predicted weight: 8.153484344482422
Actual weight: 8.6862131228

Predicted weight: 8.145565032958984
Actual weight: 8.437090766739999

Predicted weight: 7.846072673797607
Actual weight: 8.56275425608

Predicted weight: 7.836692810058594
Actual weight: 7.1209310625999995

Predicted weight: 7.605792999267578
Actual weight: 7.68310983

### Save model

In [13]:
model.save_model('model.bst')

## Deploy model to GCP

### Create Cloud Storage

In [43]:
GCP_PROJECT = 'dogwood-boulder-346222'
MODEL_BUCKET = 'gs://baby_weight_model_bucket'
VERSION_NAME = 'v2'
MODEL_NAME = 'baby_weight'

### Create bucket

In [16]:
!gsutil mb $MODEL_BUCKET

Creating gs://baby_weight_model_bucket/...


### Copy model file to Cloud Storage

In [17]:
!gsutil cp ./model.bst $MODEL_BUCKET

Copying file://./model.bst [Content-Type=application/octet-stream]...
/ [1 files][307.1 KiB/307.1 KiB]                                                
Operation completed over 1 objects/307.1 KiB.                                    


### Create and deploy model

In [20]:
# create model
!gcloud ai-platform models create $MODEL_NAME --region=global


Learn more about regional endpoints and see a list of available regions: https://cloud.google.com/ai-platform/prediction/docs/regional-endpoints
Using endpoint [https://ml.googleapis.com/]
Created ai platform model [projects/dogwood-boulder-346222/models/baby_weight].


In [34]:
%cat $xgb.VERSION_FILE

1.5.2


In [44]:
# deploy model

!gcloud ai-platform versions create $VERSION_NAME \
--model=$MODEL_NAME \
--region='global' \
--origin=$MODEL_BUCKET \
--framework='XGBOOST' \
--runtime-version=2.8 \
--python-version=3.7 \
--project=$GCP_PROJECT

Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


## Make prediction

In [37]:
%%writefile predictions.json
[0.0, 33.0, 1.0, 27.0]
[1.0, 26.0, 1.0, 40.0]

Writing predictions.json


In [63]:
prediction = !gcloud ai-platform predict --model=$MODEL_NAME --json-instances=predictions.json --version=$VERSION_NAME --region='global'
print(prediction.s)

Using endpoint [https://ml.googleapis.com/] [2.0147697925567627, 7.947142601013184]
