In [1]:
import boto3
import datetime as dt
import pytz
import time
import uuid
import json
import s3fs
import pandas as pd
import requests
import numpy as np
from decimal import *

In [2]:

# Timezone applied to every timestamp generated in this notebook.
# NOTE(review): 'EST' looks like a fixed-offset zone without daylight saving —
# confirm that is intended rather than a DST-aware zone.
tz = pytz.timezone('EST')

# Current time in that zone, and the moment exactly one week before it.
today = dt.datetime.now(tz)
sevenday = today - dt.timedelta(weeks=1)


In [3]:

# Athena SQL that produces the training extract.
# Two SELECTs over "fb-prod"."fb-model-builder" with the same 24 columns are
# combined with UNION ALL:
#   1. rows with conversions = 0, down-sampled with TABLESAMPLE BERNOULLI (0.03)
#   2. every row with conversions > 0
# The combined rows are then shuffled with ORDER BY random(). The datasources
# built further down split rows with the "sequential" strategy, so the row
# order of this result decides which rows land in each split.
# NOTE(review): an earlier comment said this data is "for tag_id 3", but the
# query has no tag_id filter — presumably the view is already scoped; confirm.
# NOTE(review): BERNOULLI (0.03) is presumably a percentage (0.03%), not a
# fraction — confirm against the Athena/Presto documentation.
query = """
select * from (SELECT  conversions,
         campaign_id,
         impressions,
         clicks,
         app_bundle,
         ad_position,
         geo_region,
         geo_country,
         platform_browser,
         platform_os,
         rewarded,
         platform_carrier,
         platform_device_make,
         platform_device_model,
         video_player_size,
         video_completes,
         content_language,
         companion_views,
         companion_clicks,
         banner_width,
         banner_height,
         inventory_source,
         inventory_interstitial,
         spend
FROM "fb-prod"."fb-model-builder" TABLESAMPLE BERNOULLI (0.03)  where conversions = 0
UNION ALL
SELECT  conversions,
         campaign_id,
         impressions,
         clicks,
         app_bundle,
         ad_position,
         geo_region,
         geo_country,
         platform_browser,
         platform_os,
         rewarded,
         platform_carrier,
         platform_device_make,
         platform_device_model,
         video_player_size,
         video_completes,
         content_language,
         companion_views,
         companion_clicks,
         banner_width,
         banner_height,
         inventory_source,
         inventory_interstitial,
         spend
FROM "fb-prod"."fb-model-builder" where conversions > 0) order by random()
"""


In [4]:

# Athena database passed as the query execution context.
# NOTE(review): the query text itself names the "fb-prod" database explicitly —
# confirm 'fb-exp' is the intended context.
database = 'fb-exp'
# S3 prefix that receives the Athena result files; the timestamp gives each run its own folder
s3_output = 's3://fb-beeswax/brian/athena/{}/'.format(
    dt.datetime.now(tz).strftime('%Y-%m-%d-%H%M%S')
)

#Function for starting athena query
def run_query(query, database, s3_output, client=None):
    """Submit `query` to Athena and return the start_query_execution response.

    Args:
        query: SQL text to execute.
        database: Athena database used as the query execution context.
        s3_output: S3 location where Athena writes the result files.
        client: optional, already-built Athena client to reuse. When omitted,
            a new boto3 Athena client for us-east-1 is created (the previous
            behaviour).

    Returns:
        The response dict from start_query_execution. Its 'QueryExecutionId'
        is also printed so the run can be looked up later.
    """
    if client is None:
        client = boto3.client('athena', region_name='us-east-1')
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
            },
        ResultConfiguration={
            'OutputLocation': s3_output,
            }
        )
    print('Execution ID: ' + response['QueryExecutionId'])
    return response

#run athena query and kick back job id
job = run_query(query, database, s3_output)

job_id = job['QueryExecutionId']
client = boto3.client('athena', region_name='us-east-1')
res = client.get_query_execution(QueryExecutionId= job_id)
x = 0

# wait for athena to return results; a FAILED or CANCELLED query can never
# become SUCCEEDED, so stop with an error instead of polling forever
while res['QueryExecution']['Status']['State'] != 'SUCCEEDED':
    query_status = res['QueryExecution']['Status']
    if query_status['State'] in ('FAILED', 'CANCELLED'):
        raise RuntimeError(
            'Athena query %s ended in state %s: %s'
            % (job_id, query_status['State'],
               query_status.get('StateChangeReason', 'no reason reported'))
        )
    print("query processing for %s seconds" %str(x))
    time.sleep(10)
    x = x + 10
    res = client.get_query_execution(QueryExecutionId= job_id)


Execution ID: e1ef0d46-c74a-427a-a7e4-5ce0478bd8a2
query processing for 0 seconds


In [5]:

# S3 location Athena reports for this query's results, read from the last
# get_query_execution response; passed further down as the datasource's DataLocationS3
output = res['QueryExecution']['ResultConfiguration']['OutputLocation']


In [7]:

# boto3 'machinelearning' client used by every ML call further down
mlclient = boto3.client('machinelearning', region_name='us-east-1')

# random id for the first datasource, plus a timestamped base name for datasource names
data_id1 = str(uuid.uuid4())
data_name = 'fb-datasource-{}'.format(
    dt.datetime.now(tz).strftime('%Y-%m-%d-%H%M%S')
)


In [8]:
# Schema describing the CSV handed to the ML datasource. The attributes are
# listed in the same order as the columns selected by the Athena query, and
# 'conversions' is the prediction target.
# NOTE(review): 'conversions' is typed BINARY while the query keeps rows with
# conversions > 0 — confirm the column only ever holds 0/1.
schema = {
    'excludedAttributeNames': [],
    'version': '1.0',
    'dataFormat': 'CSV',
    'rowId': None,
    'dataFileContainsHeader': True,
    'attributes': [
        {'attributeName': column, 'attributeType': kind}
        for column, kind in (
            ('conversions', 'BINARY'),
            ('campaign_id', 'CATEGORICAL'),
            ('impressions', 'NUMERIC'),
            ('clicks', 'NUMERIC'),
            ('app_bundle', 'CATEGORICAL'),
            ('ad_position', 'CATEGORICAL'),
            ('geo_region', 'CATEGORICAL'),
            ('geo_country', 'CATEGORICAL'),
            ('platform_browser', 'CATEGORICAL'),
            ('platform_os', 'CATEGORICAL'),
            ('rewarded', 'CATEGORICAL'),
            ('platform_carrier', 'CATEGORICAL'),
            ('platform_device_make', 'CATEGORICAL'),
            ('platform_device_model', 'CATEGORICAL'),
            ('video_player_size', 'CATEGORICAL'),
            ('video_completes', 'CATEGORICAL'),
            ('content_language', 'CATEGORICAL'),
            ('companion_views', 'CATEGORICAL'),
            ('companion_clicks', 'CATEGORICAL'),
            ('banner_width', 'CATEGORICAL'),
            ('banner_height', 'CATEGORICAL'),
            ('inventory_source', 'CATEGORICAL'),
            ('inventory_interstitial', 'CATEGORICAL'),
            ('spend', 'NUMERIC'),
        )
    ],
    'targetAttributeName': 'conversions',
    'rowWeight': None,
}

# Split spec for the datasource labelled "0-70": rows from 0% to 70%, taken sequentially.
data_rearrangement1 = {
    "splitting": {
        "percentBegin": 0,
        "percentEnd": 70,
        "strategy": "sequential",
    }
}

# Split spec for the datasource labelled "70-100": the same bounds with
# "complement" set, so it is derived from a copy of the first spec.
data_rearrangement2 = {
    "splitting": dict(data_rearrangement1["splitting"], complement="true")
}


In [None]:
# load new datasource into aws machine learning
def create_datasource(percent, data_rearrange, datasource_id):
    """Create an ML datasource from the Athena output and wait until it is ready.

    The datasource reads the CSV at `output`, uses `schema` as its data schema
    and is named "<data_name>-<percent>".

    Args:
        percent: label appended to the datasource name (e.g. "0-70").
        data_rearrange: dict serialised to JSON and sent as DataRearrangement.
        datasource_id: id to assign to the new datasource.

    Raises:
        RuntimeError: if the datasource ends up FAILED or DELETED. Those
            statuses never turn into COMPLETED, so polling on would hang.
    """
    mlclient.create_data_source_from_s3(
        DataSourceId=datasource_id,
        DataSourceName="{}-{}".format(data_name, percent),
        DataSpec={
            "DataLocationS3" : output,
            "DataSchema" : json.dumps(schema),
            "DataRearrangement" : json.dumps(data_rearrange),
        },
        ComputeStatistics=True,
    )
    get_data_source = mlclient.get_data_source(DataSourceId=datasource_id)
    ti = 0
    while get_data_source['Status'] != 'COMPLETED':
        if get_data_source['Status'] in ('FAILED', 'DELETED'):
            raise RuntimeError(
                'data source %s ended in status %s: %s'
                % (datasource_id, get_data_source['Status'],
                   get_data_source.get('Message', 'no message reported'))
            )
        # check again every 20 seconds and report the elapsed waiting time
        time.sleep(20)
        ti = ti + 20
        print("data source processing for %s seconds" %str(ti))
        get_data_source = mlclient.get_data_source(DataSourceId=datasource_id)

# random id reserved for the "70-100" datasource that is created in a later cell
data_id2 = str(uuid.uuid4())
# build the "0-70" datasource under data_id1 and wait for it to finish
create_datasource(
    percent="0-70",
    data_rearrange=data_rearrangement1,
    datasource_id=data_id1,
)

data source processing for 20 seconds
data source processing for 40 seconds
data source processing for 60 seconds
data source processing for 80 seconds
data source processing for 100 seconds
data source processing for 120 seconds
data source processing for 140 seconds
data source processing for 160 seconds


In [None]:
# build the "70-100" datasource under data_id2 and wait for it to finish
create_datasource(
    percent="70-100",
    data_rearrange=data_rearrangement2,
    datasource_id=data_id2,
)


In [None]:

#build new ML model in AWS machine learning
# a BINARY model is trained on the datasource registered under data_id1
model_id = str(uuid.uuid4())
model_name = 'fb-ml-model-%s'%(dt.datetime.now(tz).strftime('%Y-%m-%d-%H%M%S'))
resmodel = mlclient.create_ml_model(MLModelId=model_id, MLModelName=model_name, TrainingDataSourceId=data_id1, MLModelType='BINARY')

# poll once a minute until training completes; a FAILED or DELETED model can
# never reach COMPLETED, so stop with an error instead of looping forever
get_model = mlclient.get_ml_model(MLModelId=model_id)
tm = 0
while get_model['Status'] != 'COMPLETED':
    if get_model['Status'] in ('FAILED', 'DELETED'):
        raise RuntimeError(
            'ML model %s ended in status %s: %s'
            % (model_id, get_model['Status'],
               get_model.get('Message', 'no message reported'))
        )
    time.sleep(60)
    tm = tm + 1
    print("ML Model processing for %s minutes" %str(tm))
    get_model = mlclient.get_ml_model(MLModelId=model_id)


In [None]:
# evaluate the model on the datasource registered under data_id2
ev_id = str(uuid.uuid4())

evaluation_id = mlclient.create_evaluation(
    EvaluationId= ev_id,
    EvaluationName= 'fb-ml-evaluation-%s'%(dt.datetime.now(tz).strftime('%Y-%m-%d-%H%M%S')),
    MLModelId= model_id,
    EvaluationDataSourceId= data_id2
)

get_evaluation = mlclient.get_evaluation(
    EvaluationId=ev_id
)
em = 0
# poll once a minute until the evaluation completes
while get_evaluation['Status'] != 'COMPLETED':
    # FAILED / DELETED never become COMPLETED, so stop instead of waiting forever
    if get_evaluation['Status'] in ('FAILED', 'DELETED'):
        raise RuntimeError(
            'evaluation %s ended in status %s: %s'
            % (ev_id, get_evaluation['Status'],
               get_evaluation.get('Message', 'no message reported'))
        )
    time.sleep(60)
    em = em + 1
    print("Evaluation 1 processing for %s minutes" %str(em))
    # refresh the variable the loop condition reads; the previous code stored
    # the response in get_model, so the condition never saw a new status
    get_evaluation = mlclient.get_evaluation(EvaluationId=ev_id)



In [None]:
# evaluate the model on the datasource registered under data_id1, which is
# also the model's training datasource
ev_id2 = str(uuid.uuid4())

evaluation_id = mlclient.create_evaluation(
    EvaluationId= ev_id2,
    EvaluationName= 'fb-ml-evaluation-on-trained-data-%s'%(dt.datetime.now(tz).strftime('%Y-%m-%d-%H%M%S')),
    MLModelId= model_id,
    EvaluationDataSourceId= data_id1
)

get_evaluation = mlclient.get_evaluation(
    EvaluationId=ev_id2
)
em = 0
# poll once a minute until the evaluation completes
while get_evaluation['Status'] != 'COMPLETED':
    # FAILED / DELETED never become COMPLETED, so stop instead of waiting forever
    if get_evaluation['Status'] in ('FAILED', 'DELETED'):
        raise RuntimeError(
            'evaluation %s ended in status %s: %s'
            % (ev_id2, get_evaluation['Status'],
               get_evaluation.get('Message', 'no message reported'))
        )
    time.sleep(60)
    em = em + 1
    print("Evaluation 2 processing for %s minutes" %str(em))
    # refresh the variable the loop condition reads; the previous code stored
    # the response in get_model, so the condition never saw a new status
    get_evaluation = mlclient.get_evaluation(EvaluationId=ev_id2)


