# 5.1 Sklearn Workflow

In [1]:
%%file scripts/containers/sklearn_pipeline/pipeline.py
import os
import datetime
import pandas as pd
import numpy as np
import pandas_gbq as gbq
from google.oauth2 import service_account
from sklearn.linear_model import LogisticRegression


# Batch scoring pipeline: train on existing users, score the simulated new
# users, and publish the scores to BigQuery.

# pull the games data set from GitHub
games_df = pd.read_csv("https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv")

# use the row index as the user ID and randomly flag ~10% of rows as new users
games_df['user_id'] = games_df.index
games_df['new_user'] = np.random.choice([0, 1], size=games_df.shape[0], p=[0.9, 0.1])

# columns that must not be passed to the model as features
non_features = ['label', 'user_id', 'new_user']

# existing users form the training set; new users are the ones to score
train = games_df[games_df['new_user'] == 0]
test = games_df[games_df['new_user'] == 1]
x_train, y_train = train.drop(columns=non_features), train['label']
x_test = test.drop(columns=non_features)

# fit the classifier and keep column 1 of the predicted probabilities
model = LogisticRegression().fit(x_train, y_train)
y_pred = model.predict_proba(x_test)[:, 1]

# assemble the predictions, stamped with the time of this run
scored_at = str(datetime.datetime.now())
result_df = pd.DataFrame({'user_id': test['user_id'],
                          'pred': y_pred,
                          'time': scored_at})

# write the predictions to BigQuery, replacing the table if it already exists
table_id = "dsp_demo.user_scores"
project_id = os.environ['GOOGLE_PROJECT_ID']
key_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
credentials = service_account.Credentials.from_service_account_file(key_path)
gbq.to_gbq(
    dataframe=result_df,
    destination_table=table_id,
    project_id=project_id,
    if_exists='replace',
    credentials=credentials,
)

Writing scripts/containers/sklearn_pipeline/pipeline.py


In [2]:
import os
import pandas_gbq as gbq
from google.oauth2 import service_account


# read back the dsp_demo.user_scores table through pandas_gbq
sql = "SELECT * FROM dsp_demo.user_scores"

# project and service-account key both come from the environment
project_id = os.environ['GOOGLE_PROJECT_ID']
key_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
credentials = service_account.Credentials.from_service_account_file(key_path)

df = gbq.read_gbq(query=sql, project_id=project_id, credentials=credentials)
df.head()

Downloading: 100%|██████████| 2304/2304 [00:00<00:00, 6526.72rows/s]


Unnamed: 0,user_id,pred,time
0,2659,0.054512,2020-08-03 07:35:44.238809
1,3470,0.054512,2020-08-03 07:35:44.238809
2,4680,0.054512,2020-08-03 07:35:44.238809
3,5841,0.054512,2020-08-03 07:35:44.238809
4,6881,0.054512,2020-08-03 07:35:44.238809


In [3]:
from google.cloud import bigquery

# same lookup using the google-cloud-bigquery client instead of pandas_gbq
sql = "select * from dsp_demo.user_scores"
client = bigquery.Client()
query_job = client.query(sql)
query_job.to_dataframe().head()

Unnamed: 0,user_id,pred,time
0,2659,0.054512,2020-08-03 07:35:44.238809
1,3470,0.054512,2020-08-03 07:35:44.238809
2,4680,0.054512,2020-08-03 07:35:44.238809
3,5841,0.054512,2020-08-03 07:35:44.238809
4,6881,0.054512,2020-08-03 07:35:44.238809


# 5.2 Cron

# 5.3 Workflow Tools

## Apache Airflow

In [1]:
%%file scripts/airflow/sklearn.py
import os
import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago

# arguments handed to the DAG as default_args; the alert address is read
# from the FAILURE_EMAIL environment variable
default_args = dict(
    owner='Airflow',
    depends_on_past=False,
    email=os.environ['FAILURE_EMAIL'],
    start_date=days_ago(0),
    email_on_failure=True,
)

# cron expression "* * * * *" schedules the DAG once per minute
with DAG(dag_id='games', default_args=default_args, schedule_interval="* * * * *") as dag:
    # single task: launch the pipeline container through the docker CLI
    t1 = BashOperator(
        task_id='sklearn_pipeline',
        bash_command='sudo docker run sklearn_pipeline',
    )

Writing scripts/airflow/sklearn.py


In [2]:
%%file scripts/airflow/sklearn_docker.py
import os
import datetime

from airflow import DAG
from airflow.operators.docker_operator import DockerOperator
from airflow.utils.dates import days_ago

# arguments handed to the DAG as default_args; the alert address is read
# from the FAILURE_EMAIL environment variable
default_args = dict(
    owner='Airflow',
    depends_on_past=False,
    email=os.environ['FAILURE_EMAIL'],
    start_date=days_ago(0),
    email_on_failure=True,
)

# cron expression "* * * * *" schedules the DAG once per minute
dag = DAG(dag_id='games_docker', default_args=default_args, schedule_interval="* * * * *")

# single task: run the sklearn_pipeline image via the Docker operator
t1 = DockerOperator(task_id='sklearn_pipeline', image='sklearn_pipeline', dag=dag)

Writing scripts/airflow/sklearn_docker.py


## Managed Airflow

In [2]:
%%file scripts/airflow/sklearn_gke.py
import os
import datetime

from airflow import DAG
from airflow.contrib.operators.gcp_container_operator import GKEPodOperator
from airflow.utils.dates import days_ago

# arguments handed to the DAG as default_args; the alert address is read
# from the GOOGLE_FAILURE_EMAIL environment variable
default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'email': os.environ['GOOGLE_FAILURE_EMAIL'],
    'start_date': days_ago(0),
    'email_on_failure': True,
}

# A dag_id has to be unique within an Airflow environment. 'games_docker' is
# already used by scripts/airflow/sklearn_docker.py, so this DAG gets its own
# ID to keep the two from shadowing each other when loaded side by side.
dag = DAG(
    dag_id='games_gke',
    default_args=default_args,
    schedule_interval="* * * * *"  # cron expression: once per minute
)

# read the project once; it is needed for both the cluster lookup and the
# container registry path of the image
project_id = os.environ['GOOGLE_PROJECT_ID']

# single task: run the pipeline image as a pod on the configured GKE cluster
t1 = GKEPodOperator(
    task_id='sklearn_pipeline',
    project_id=project_id,
    cluster_name=os.environ['GOOGLE_GKE_CLUSTER_NAME'],
    name='sklearn-pipeline',
    namespace='default',
    location=os.environ['GOOGLE_GKE_CLUSTER_LOCATION'],
    image=f"us.gcr.io/{project_id}/sklearn_pipeline",
    dag=dag
)

Writing scripts/airflow/sklearn_gke.py
