In [1]:
BUCKET_NAME = "qwiklabs-gcp-01-181c1fcfd3d7" #@param {type:"string"}
REGION = "us-east1" #@param {type:"string"}

In [2]:
# Creating a bucket

! gsutil mb -l $REGION gs://$BUCKET_NAME

Creating gs://qwiklabs-gcp-01-181c1fcfd3d7/...


In [2]:
# Test access to the bucket

! gsutil ls -al gs://$BUCKET_NAME

In [3]:
# Create SQL query using natality data after the year 2000
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks,
  FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING))) AS hashmonth
FROM
  publicdata.samples.natality
WHERE year > 2000
"""

In [4]:
# Call BigQuery and examine in dataframe
from google.cloud import bigquery
df = bigquery.Client().query(query + " LIMIT 100").to_dataframe()
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks,hashmonth
0,6.426475,False,26,1,38.0,-1403073183891835564
1,6.181762,False,30,1,38.0,1088037545023002395
2,8.750147,True,35,1,40.0,-1403073183891835564
3,6.812284,False,31,1,40.0,7108882242435606404
4,7.500126,False,30,1,38.0,-7146494315947640619


In [7]:
import pandas as pd



# Get one hot encoding of columns B
one_hot = pd.get_dummies(df['is_male'])
# Drop column B as it is now encoded
df = df.drop('is_male',axis = 1)
# Join the encoded df
df = df.join(one_hot)

In [30]:
df.head()

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks,hashmonth,False,True
0,7.063611,32,1,37.0,7108882242435606404,0,1
1,4.687028,30,3,33.0,-7170969733900686954,0,1
2,7.561856,20,1,39.0,6392072535155213407,0,1
3,7.561856,31,1,37.0,-2126480030009879160,0,1
4,7.312733,32,1,40.0,3408502330831153141,0,1


In [5]:
%%writefile preprocess.py
import numpy as np
import pandas as pd

class MySimpleScaler(object):
 

 def preprocess(self, data):
    data = data[data.weight_pounds  > 0]
    data = data[data.mother_age  > 0]
    data = data[data.plurality > 0]
    data = data[data.gestation_weeks > 0]
    print(data.shape)

    x_cols = ['mother_age', 'plurality', 'gestation_weeks', True,False]
    # Get one hot encoding of columns B
    one_hot = pd.get_dummies(data['is_male'])
    # Drop column B as it is now encoded
    data = data.drop('is_male',axis = 1)
    # Join the encoded df
    data = data.join(one_hot)


    return data[x_cols],data['weight_pounds']

Overwriting preprocess.py


In [37]:

df = df[df.mother_age  > 0]
df = df[df.plurality > 0]
df = df[df.gestation_weeks > 0]



In [6]:
df['weight_pounds']

0     6.426475
1     6.181762
2     8.750147
3     6.812284
4     7.500126
        ...   
95    7.837433
96    7.561856
97    5.875319
98    5.577695
99    8.811877
Name: weight_pounds, Length: 100, dtype: float64

In [7]:
import pickle


from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

from preprocess import MySimpleScaler


scaler = MySimpleScaler()
X,y = scaler.preprocess(df)
#print(data_new)

model = RandomForestRegressor(max_depth=2, random_state=0)
model.fit(X, y)

(98, 6)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [8]:
print(model.feature_importances_)
print(model.score(X,y))

[0.15299272 0.         0.84123899 0.         0.00576829]
0.38437848273256703


In [9]:
X.shape

(98, 5)

In [10]:
joblib.dump(model, 'model.joblib')
with open ('preprocessor.pkl', 'wb') as f:
  pickle.dump(scaler, f)

In [None]:
Deploying a custom prediction routine
To deploy a custom prediction routine to serve predictions from your trained model, do the following:

Create a custom predictor to handle requests
Package your predictor and your preprocessing module
Upload your model artifacts and your custom code to Cloud Storage
Deploy your custom prediction routine to AI Platform

In [None]:
%%writefile predictor.py
import os
import pickle

import numpy as np
from sklearn.datasets import load_iris
from sklearn.externals import joblib

class MyPredictor(object):
  def __init__(self, model, preprocessor):
    self._model = model
    self._preprocessor = preprocessor
    self._class_names = load_iris().target_names

  def predict(self, instances, **kwargs):
    inputs = np.asarray(instances)
    preprocessed_inputs = self._preprocessor.preprocess(inputs)
    if kwargs.get('probabilities'):
      probabilities = self._model.predict_proba(preprocessed_inputs)
      return probabilities.tolist()
    else:
      outputs = self._model.predict(preprocessed_inputs)
      return [self._class_names[class_num] for class_num in outputs]

  @classmethod
  def from_path(cls, model_dir):
    model_path = os.path.join(model_dir, 'model.joblib')
    model = joblib.load(model_path)

    preprocessor_path = os.path.join(model_dir, 'preprocessor.pkl')
    with open(preprocessor_path, 'rb') as f:
      preprocessor = pickle.load(f)

    return cls(model, preprocessor)