In [1]:
import tensorflow as tf
import numpy as np
import gzip
import json
import boto3
import os

  from ._conv import register_converters as _register_converters


In [2]:
# --- S3 access -------------------------------------------------------------
# A resource handle is used for Bucket-level downloads, a low-level client
# for the object listing.
bucket_name = "codebase-pm-dpf"

s3 = boto3.resource('s3')
s3client = boto3.client('s3')

objects = s3client.list_objects_v2(Bucket=bucket_name)
bucket = s3.Bucket(name=bucket_name)

# --- Part-file keys --------------------------------------------------------
# Twenty part files (part-00000 .. part-00019) under each prefix; the "13/15"
# prefix feeds the training set and "14/15" the test set.
file_keys = ["13/15/part-00%03d.gz" % n for n in range(20)]
test_keys = ["14/15/part-00%03d.gz" % n for n in range(20)]

# --- Scratch file ----------------------------------------------------------
# Every download is written to the same local path, so clear out any copy
# left behind by an earlier run.
leftover_exists = os.path.exists("input.gz")
if leftover_exists:
    os.remove("input.gz")

# --- Accumulators filled by the loading cells -------------------------------
train_data, train_labels = [], []
test_data, test_labels = [], []

In [3]:
# Build the training set: download each part file listed in `file_keys`,
# parse it as gzip-compressed JSON lines, and turn every record that has at
# least one bid response into a (feature row, label) pair.
#
# The accumulators are rebound here so that re-running this cell starts from
# scratch instead of appending every example a second time.
train_data = []
train_labels = []

for file_key in file_keys:
    print(file_key)

    try:
        bucket.download_file(Key=file_key, Filename="input.gz")

        # Stream the archive line by line; the `with` block closes the file,
        # so no explicit close() is needed.
        with gzip.open("input.gz", 'rt') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue

                data = json.loads(line)

                # A record without bids has no label, so skip it before doing
                # any feature work. A missing or null "bid_responses" field is
                # treated the same as an empty list.
                bids = sorted(x["bid_price"] for x in (data.get("bid_responses") or []))
                if not bids:
                    continue

                if len(bids) == 1:
                    # Single bid: the label is half of that bid.
                    label = bids[0] / 2
                else:
                    # Several bids: the label is the midpoint between the two
                    # highest bid prices.
                    label = bids[-2] + (bids[-1] - bids[-2]) / 2

                # 16 features, all kept as strings; numeric ids and counts are
                # stringified explicitly. Order matters: it defines the
                # feature columns.
                values = [
                    # First entry of "ad_type"; "" when absent or empty.
                    (data.get("ad_type") or [""])[0],
                    data.get("geo_country_code2", ""),
                    data.get("rate_metric", ""),
                    str(data.get("site_id", "")),
                    data.get("geo_timezone", ""),
                    data.get("ua_device_type", ""),
                    # Number of bid requests attached to the record.
                    str(len(data.get("bid_requests", []))),
                    data.get("ua_os", ""),
                    str(data.get("zone_id", "")),
                    data.get("geo_continent_code", ""),
                    data.get("ua_os_name", ""),
                    data.get("ua_device", ""),
                    data.get("ua_name", ""),
                    str(data.get("geo_area_code", "")),
                    data.get("geo_city_name", ""),
                    # First two characters after the "T" of the timestamp
                    # (presumably the hour of an ISO-8601 value - confirm);
                    # falls back to i_timestamp, then to the literal "1T15".
                    data.get("r_timestamp", data.get("i_timestamp", "1T15")).split("T")[1][:2],
                ]

                train_data.append(values)
                train_labels.append(label)
    finally:
        # Always remove the scratch file, even when download or parsing
        # fails, so the next iteration or run starts clean.
        if os.path.exists("input.gz"):
            os.remove("input.gz")

13/15/part-00000.gz
13/15/part-00001.gz
13/15/part-00002.gz
13/15/part-00003.gz
13/15/part-00004.gz
13/15/part-00005.gz
13/15/part-00006.gz
13/15/part-00007.gz
13/15/part-00008.gz
13/15/part-00009.gz
13/15/part-00010.gz
13/15/part-00011.gz
13/15/part-00012.gz
13/15/part-00013.gz
13/15/part-00014.gz
13/15/part-00015.gz
13/15/part-00016.gz
13/15/part-00017.gz
13/15/part-00018.gz
13/15/part-00019.gz


In [4]:
# Build the test set: download each part file listed in `test_keys`, parse it
# as gzip-compressed JSON lines, and turn every record that has at least one
# bid response into a (feature row, label) pair. The feature layout and label
# rule mirror the training-set loader exactly.
#
# The accumulators are rebound here so that re-running this cell starts from
# scratch instead of appending every example a second time.
test_data = []
test_labels = []

for file_key in test_keys:
    print(file_key)

    try:
        bucket.download_file(Key=file_key, Filename="input.gz")

        # Stream the archive line by line; the `with` block closes the file,
        # so no explicit close() is needed.
        with gzip.open("input.gz", 'rt') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue

                data = json.loads(line)

                # A record without bids has no label, so skip it before doing
                # any feature work. A missing or null "bid_responses" field is
                # treated the same as an empty list.
                bids = sorted(x["bid_price"] for x in (data.get("bid_responses") or []))
                if not bids:
                    continue

                if len(bids) == 1:
                    # Single bid: the label is half of that bid.
                    label = bids[0] / 2
                else:
                    # Several bids: the label is the midpoint between the two
                    # highest bid prices.
                    label = bids[-2] + (bids[-1] - bids[-2]) / 2

                # 16 features, all kept as strings; numeric ids and counts are
                # stringified explicitly. Order matters: it defines the
                # feature columns.
                values = [
                    # First entry of "ad_type"; "" when absent or empty.
                    (data.get("ad_type") or [""])[0],
                    data.get("geo_country_code2", ""),
                    data.get("rate_metric", ""),
                    str(data.get("site_id", "")),
                    data.get("geo_timezone", ""),
                    data.get("ua_device_type", ""),
                    # Number of bid requests attached to the record.
                    str(len(data.get("bid_requests", []))),
                    data.get("ua_os", ""),
                    str(data.get("zone_id", "")),
                    data.get("geo_continent_code", ""),
                    data.get("ua_os_name", ""),
                    data.get("ua_device", ""),
                    data.get("ua_name", ""),
                    str(data.get("geo_area_code", "")),
                    data.get("geo_city_name", ""),
                    # First two characters after the "T" of the timestamp
                    # (presumably the hour of an ISO-8601 value - confirm);
                    # falls back to i_timestamp, then to the literal "1T15".
                    data.get("r_timestamp", data.get("i_timestamp", "1T15")).split("T")[1][:2],
                ]

                test_data.append(values)
                test_labels.append(label)
    finally:
        # Always remove the scratch file, even when download or parsing
        # fails, so the next iteration or run starts clean.
        if os.path.exists("input.gz"):
            os.remove("input.gz")

14/15/part-00000.gz
14/15/part-00001.gz
14/15/part-00002.gz
14/15/part-00003.gz
14/15/part-00004.gz
14/15/part-00005.gz
14/15/part-00006.gz
14/15/part-00007.gz
14/15/part-00008.gz
14/15/part-00009.gz
14/15/part-00010.gz
14/15/part-00011.gz
14/15/part-00012.gz
14/15/part-00013.gz
14/15/part-00014.gz
14/15/part-00015.gz
14/15/part-00016.gz
14/15/part-00017.gz
14/15/part-00018.gz
14/15/part-00019.gz


In [None]:
# Report the size of each split (examples, then labels).
print(len(train_data))
print(len(train_labels))
print(len(test_data))
print(len(test_labels))

# Fail early with a readable message instead of an IndexError on
# train_data[0] or a shape error deep inside TensorFlow.
assert train_data, "no training examples were loaded"
assert len(train_data) == len(train_labels), "training features/labels are misaligned"
assert len(test_data) == len(test_labels), "test features/labels are misaligned"

# Regression forest with a single output dimension.
params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
    num_classes=1, # num dimensions of output
    num_features=len(train_data[0]), # should be 16
    regression=True
)

# Checkpoints and summaries are written to the current working directory.
classifier = tf.contrib.tensor_forest.client.random_forest.TensorForestEstimator(params, model_dir="./")

# NOTE(review): every feature row is a list of str (see the loading cells);
# the forest presumably needs numeric inputs - confirm, and encode the
# categorical columns before fitting if so.
# NOTE(review): passing x/y directly to the estimator is deprecated; the
# warning this cell prints recommends wrapping it in SKCompat. Left as is
# here because that may change the type of `y_out`.
classifier.fit(x=np.array(train_data), y=np.array(train_labels))

# Hand predict() an ndarray, exactly like fit(), rather than a plain Python
# list of lists.
y_out = classifier.predict(x=np.array(test_data))

1396456
1396456
1337029
1337029
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_num_ps_replicas': 0, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0d7f5b7278>, '_num_worker_replicas': 0, '_tf_random_seed': None, '_model_dir': './', '_save_summary_steps': 100, '_task_id': 0, '_log_step_count_steps': 100, '_task_type': None, '_session_config': None, '_environment': 'local', '_is_chief': True, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_master': '', '_save_checkpoints_secs': 600}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))