In [3]:
BUCKET_NAME = "qwiklabs-gcp-01-802a8aea86c1" #@param {type:"string"}
REGION = "us-east1" #@param {type:"string"}

In [4]:
# Creating a bucket

! gsutil mb -l $REGION gs://$BUCKET_NAME

Creating gs://qwiklabs-gcp-01-802a8aea86c1/...


In [5]:
# Test access to the bucket

! gsutil ls -al gs://$BUCKET_NAME

In [6]:
# Create SQL query using natality data after the year 2000
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks,
  FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING))) AS hashmonth
FROM
  publicdata.samples.natality
WHERE year > 2000
"""

In [28]:
# Call BigQuery and examine in dataframe
from google.cloud import bigquery
df = bigquery.Client().query(query + " LIMIT 100").to_dataframe()
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks,hashmonth
0,6.999677,False,33,1,38.0,-2126480030009879160
1,6.768191,False,22,1,36.0,8904940584331855459
2,7.62579,False,32,1,41.0,-2126480030009879160
3,6.563162,True,30,1,40.0,-7146494315947640619
4,6.563162,False,35,1,37.0,5896567601480310696


In [29]:
import pandas as pd



# Get one hot encoding of columns B
one_hot = pd.get_dummies(df['is_male'])
# Drop column B as it is now encoded
df = df.drop('is_male',axis = 1)
# Join the encoded df
df = df.join(one_hot)

In [30]:
df.head()

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks,hashmonth,False,True
0,7.063611,32,1,37.0,7108882242435606404,0,1
1,4.687028,30,3,33.0,-7170969733900686954,0,1
2,7.561856,20,1,39.0,6392072535155213407,0,1
3,7.561856,31,1,37.0,-2126480030009879160,0,1
4,7.312733,32,1,40.0,3408502330831153141,0,1


In [39]:
%%writefile preprocess.py
import numpy as np
import pandas as pd

class MySimpleScaler(object):
 

 def preprocess(self, data):
    
    data = data[data.weight_pounds > 0]
    data = data[data.mother_age  > 0]
    data = data[data.plurality > 0]
    data = data[data.gestation_weeks > 0]
    print(data.shape)

    
    # Get one hot encoding of columns B
    one_hot = pd.get_dummies(data['is_male'])
    # Drop column B as it is now encoded
    data = data.drop('is_male',axis = 1)
    # Join the encoded df
    data = data.join(one_hot)


    return data

Overwriting preprocess.py


In [37]:

df = df[df.mother_age  > 0]
df = df[df.plurality > 0]
df = df[df.gestation_weeks > 0]



In [38]:
df.shape

(98, 6)

In [40]:
import pickle


from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

from preprocess import MySimpleScaler

x_cols = ['mother_age', 'plurality', 'gestation_weeks', 'is_male']
scaler = MySimpleScaler()
X = scaler.preprocess(df[x_cols])
y = df[df['weight_pounds']>0]

model = RandomForestRegressor(max_depth=2, random_state=0)
model.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [19]:
print(model.feature_importances_)
print(model.score(X,y))

[0.04592338 0.         0.94942341 0.         0.00465321]
0.5597624563862029


In [41]:
X.shape

(98, 5)