In [1]:
# Import appropriate modules from the client library.
from googleads import dfp
from collections import defaultdict
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import expr
from pyspark.sql.functions import *
import pandas as pd
import time
# A PySpark kernel usually pre-creates a SparkContext bound to `sc`. Stop it
# first, because a second SparkContext() cannot be created while one is active.
# On a kernel that did not define `sc` there is nothing to stop, so the
# NameError is ignored instead of aborting the first cell.
try:
    sc.stop()
except NameError:
    pass
sc = SparkContext()
# SQL entry point shared by every function in this notebook.
sqlCtx = SQLContext(sc)


# Connect to the DFP network.
# LoadFromStorage() is called without a path, so the credentials come from the
# googleads library's default storage location.
from googleads import dfp
dfp_client = dfp.DfpClient.LoadFromStorage()
network_service = dfp_client.GetService('NetworkService', version='v201708')
# Ask the API which network these credentials resolve to.
# NOTE(review): in the cells shown here nothing reads dfp_client or
# current_network again (each fetch function builds its own client), so this
# presumably serves only as a connectivity check - confirm.
current_network = network_service.getCurrentNetwork()


In [17]:
def orders():
    """Export the custom field values of every DFP order to parquet.

    Pages through ``OrderService.getOrdersByStatement``, advancing the
    statement offset by its limit until a response carries no ``'results'``.
    One row is produced per custom field value found on an order:

    * ``order_id``                  - ``order.id``
    * ``order_name``                - ``str(order.name)``
    * ``order_custom_fields``       - the value's ``customFieldId``
    * ``order_custom_fields_value`` - ``str`` of the plain value, or of
      ``customFieldOptionId`` for drop-down values

    Orders without ``customFieldValues`` and values of any other class
    contribute no rows.  The rows are written (mode ``overwrite``) to
    ``gs://ds-url-catag/dfp_new/orders`` through the notebook-level
    ``sqlCtx``.

    Returns:
        An empty tuple.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Obtaining orders")
    client = dfp.DfpClient.LoadFromStorage()
    # Initialize appropriate service.
    order_service = client.GetService('OrderService', version='v201708')

    # Create a statement to select orders.
    statement = dfp.StatementBuilder()

    # Collect plain Python rows and build the DataFrame once at the end;
    # enlarging a DataFrame one row at a time copies it on every insert.
    rows = []
    while True:
        response = order_service.getOrdersByStatement(statement.ToStatement())
        if 'results' not in response:
            break

        for order in response['results']:
            if "customFieldValues" not in order:
                continue
            for custom_field in order["customFieldValues"]:
                field_type = dfp.DfpClassType(custom_field)
                if field_type == 'CustomFieldValue':
                    raw_value = custom_field.value.value
                elif field_type == 'DropDownCustomFieldValue':
                    raw_value = custom_field.customFieldOptionId
                else:
                    # Other value classes were never exported.
                    continue
                # NOTE(review): on Python 2, str() raises UnicodeEncodeError
                # for unicode text with non-ASCII characters - confirm that
                # order names and values are ASCII-only.
                rows.append([
                    order.id,
                    str(order.name),
                    custom_field.customFieldId,
                    str(raw_value),
                ])
        statement.offset += statement.limit

    # dtype=object keeps every cell as the Python object that was collected.
    data_orders = pd.DataFrame(
        rows,
        columns=['order_id', 'order_name', 'order_custom_fields', 'order_custom_fields_value'],
        dtype=object)

    data_orders_df = sqlCtx.createDataFrame(data_orders)
    data_orders_df.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/orders")
    print("Obtained orders")
    return ()



def get_customfields_highlevel():
    """Export the id and name of every DFP custom field to parquet.

    Pages through ``CustomFieldService.getCustomFieldsByStatement`` until a
    response carries no ``'results'`` and stores one row per custom field
    with the string columns ``cus_id`` and ``cus_name``.  The rows are
    written (mode ``overwrite``) to
    ``gs://ds-url-catag/dfp_new/custom_fields_high_level`` through the
    notebook-level ``sqlCtx``.

    Returns:
        The ``time.time()`` value captured when the function started.
    """
    print("Obtaining custom fields high level")
    start_time = time.time()
    # Initialize appropriate service.
    client = dfp.DfpClient.LoadFromStorage()
    custom_field_service = client.GetService('CustomFieldService', version='v201708')

    # Create a statement to select custom fields.
    statement = dfp.StatementBuilder()

    # Retrieve the custom fields one page at a time. Rows are gathered in a
    # list and turned into a DataFrame once, instead of enlarging the
    # DataFrame row by row (which copies it on every insert).
    rows = []
    while True:
        response = custom_field_service.getCustomFieldsByStatement(statement.ToStatement())
        if 'results' not in response:
            break
        rows.extend(
            [str(custom_field['id']), str(custom_field['name'])]
            for custom_field in response['results']
        )
        statement.offset += statement.limit

    data_cus_high = pd.DataFrame(rows, columns=['cus_id', 'cus_name'], dtype=object)
    data_cus_high_df = sqlCtx.createDataFrame(data_cus_high)
    data_cus_high_df.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/custom_fields_high_level")
    print("Custom fields high level obtained")
    return start_time

def get_customfields_detailed(field_ids=(8796, 8916)):
    """Export the options of selected DFP custom fields to parquet.

    Pages through ``CustomFieldService.getCustomFieldsByStatement`` until a
    response carries no ``'results'``.  For every custom field whose ``id``
    is in ``field_ids``, one row is stored per entry of its ``options``.
    The rows are written (mode ``overwrite``) to
    ``gs://ds-url-catag/dfp_new/custom_fields_detailed`` through the
    notebook-level ``sqlCtx``.

    Args:
        field_ids: ids of the custom fields whose options are exported.
            Defaults to 8796 and 8916, the two ids that were previously
            hard-coded in the loop.

    Returns:
        An empty tuple.
    """
    print("Obtaining custom fields detailed")
    # Initialize appropriate service.
    client = dfp.DfpClient.LoadFromStorage()
    custom_field_service = client.GetService('CustomFieldService', version='v201708')

    # Create a statement to select custom fields.
    statement = dfp.StatementBuilder()

    # Rows are gathered in a list and turned into a DataFrame once, instead
    # of enlarging the DataFrame row by row (which copies it on every insert).
    rows = []
    while True:
        response = custom_field_service.getCustomFieldsByStatement(statement.ToStatement())
        if 'results' not in response:
            break
        for custom_field in response['results']:
            if custom_field['id'] not in field_ids:
                continue
            for sub_field in custom_field['options']:
                # NOTE(review): the values are stored as (option id, parent
                # field id, display name), so the column labelled
                # 'customFieldId' holds the option id and 'id' holds the
                # parent field id. process_order() joins on 'customFieldId'
                # against the order's option id, so the labels are kept
                # as they are; rename both sides together if this is fixed.
                rows.append([
                    str(sub_field['id']),
                    str(sub_field['customFieldId']),
                    str(sub_field['displayName']),
                ])
        statement.offset += statement.limit

    data_cus_detailed = pd.DataFrame(
        rows, columns=['customFieldId', 'id', 'displayName'], dtype=object)
    print("Custom fields detailed level obtained")
    data_cus_detailed_df = sqlCtx.createDataFrame(data_cus_detailed)
    data_cus_detailed_df.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/custom_fields_detailed")
    return ()
    
    
# Process the order data
def process_order():
    """Turn the stored order rows into one wide row per order and save it.

    Reads the parquet data under ``gs://ds-url-catag/dfp_new/`` (``orders``,
    ``custom_fields_detailed`` and ``custom_fields_high_level``) and then:

    1. left-outer joins the orders to the detailed options on
       ``customFieldId == order_custom_fields_value`` to pick up
       ``displayName`` as ``Values``;
    2. left-outer joins the result to the high-level fields on
       ``cus_id == order_custom_fields`` to pick up ``cus_name``;
    3. falls back to ``order_custom_fields_value`` where ``Values`` is null;
    4. pivots ``cus_name`` into columns per (``order_id``, ``order_name``),
       taking ``first(Values)`` for each cell;
    5. selects and renames the order columns plus seven custom field columns
       (the select expects all seven pivoted columns to exist);
    6. writes the result (mode ``overwrite``) to
       ``gs://ds-url-catag/dfp_new/orders_final`` and
       ``gs://ds-taste-dfp/order_details``.

    Relies on the notebook-level ``sqlCtx``.

    Returns:
        An empty tuple.
    """
    print("processing orders")
    orders_raw = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/orders/")

    # Lookup tables produced by the custom field exports.
    option_lookup = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/custom_fields_detailed/")
    field_lookup = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/custom_fields_high_level/")

    # Resolve drop-down option ids to their display names.
    with_option_names = orders_raw.join(
        option_lookup,
        option_lookup.customFieldId == orders_raw.order_custom_fields_value,
        'left_outer'
    ).select(orders_raw.columns + [option_lookup.displayName.alias('Values')])

    # Resolve custom field ids to their names.
    with_field_names = with_option_names.join(
        field_lookup,
        field_lookup.cus_id == with_option_names.order_custom_fields,
        'left_outer'
    ).select(with_option_names.columns + [field_lookup.cus_name])

    # Rows that matched no option keep their raw value.
    resolved = with_field_names.withColumn(
        'Values',
        when(col('Values').isNull(), col('order_custom_fields_value')).otherwise(col('Values')))

    # Transpose rows to columns: one column per custom field name.
    data_orders_piv = resolved.groupBy("order_id", "order_name").pivot("cus_name").agg(
        expr("coalesce(first(Values))"))

    # (pivoted column, output column) pairs, in output order.
    output_columns = [
        ("order_id", "order_id"),
        ("order_name", "Order_name"),
        ("BBM Performance Category", "BBMPerformanceCategory"),
        ("IAB Tier 1 Categorization", "IABTier1Categorization"),
        ("IO End Date", "IOEndDate"),
        ("IO Start Date", "IOStartDate"),
        ("MAI Campaign Flag", "MAICampaignFlag"),
        ("Net IO Value (After Discount)", "NetIOValue"),
        ("Offer Type", "OfferType"),
    ]
    data_orders_fin = data_orders_piv.select(
        [col(source).alias(target) for source, target in output_columns])

    # The result is written twice. Without caching, Spark would run the joins
    # and the pivot once per write, and because first() depends on row order
    # the two copies could differ. Cache for the writes, then release.
    data_orders_fin.cache()
    try:
        data_orders_fin.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/orders_final")
        data_orders_fin.write.mode('overwrite').parquet("gs://ds-taste-dfp/order_details")
    finally:
        data_orders_fin.unpersist()
    print("Orders processed")

    return ()


# def line_items():
#     print "Obtaining line items"
#     # Initialize appropriate service.
#     client = dfp.DfpClient.LoadFromStorage()
#     line_item_service = client.GetService('LineItemService', version='v201708')

#     # Create a statement to select line items.
#     statement = dfp.StatementBuilder()

#     data_line = pd.DataFrame(columns=['line_item_id','order_id','line_item_name'])
#     count_line = 0
#     # Retrieve a small amount of line items at a time, paging
#     # through until all line items have been retrieved.
#     while True:
#         response_line = line_item_service.getLineItemsByStatement(statement.ToStatement())
#         if 'results' in response_line:
#             for line_item in response_line['results']:
#                 data_line.loc[count_line] = [str(line_item.id),str(line_item.orderId),str(line_item.name)]
#                 count_line += 1
#             statement.offset += statement.limit                
#         else:
#             break

#     data_line_df = sqlCtx.createDataFrame(data_line)
#     data_line_df.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/line_items")
#     print ("line items obtained")
#     return()

# def lica():
#     print "obtaining licas"
#     start_time = time.time()
#     # Initialize appropriate service.
#     client = dfp.DfpClient.LoadFromStorage()
#     lica_service = client.GetService('LineItemCreativeAssociationService', version='v201708')

#     # Create a statement to select line item creative associations.
#     statement = dfp.StatementBuilder()

#     data_line_creative = pd.DataFrame(columns=['line_item_id','creativeid'])
#     count_lica = 0
#     # Retrieve a small amount of line item creative associations at a time, paging
#     # through until all line item creative associations have been retrieved.
#     while True:
#         response = lica_service.getLineItemCreativeAssociationsByStatement(
#         statement.ToStatement())
#         if 'results' in response:
#             for lica in response['results']:
#                 data_line_creative.loc[count_lica] = [lica.lineItemId,lica.creativeId]
#                 count_lica += 1
#             statement.offset += statement.limit
#         else:
#             break
    
#     data_line_creative_df = sqlCtx.createDataFrame(data_line_creative)
#     data_line_creative_df.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/lica")
#     print ("licas obtained")
    
#     return(start_time)


# def creatives():
#     print "obtaining creatives"
#     # Initialize appropriate service.
#     client = dfp.DfpClient.LoadFromStorage()
#     creative_service = client.GetService('CreativeService', version='v201708')

#     # Create a statement to select creatives.
#     statement = dfp.StatementBuilder()

#     #define dataframes to store the crawled data
#     data_temp = pd.DataFrame(columns=['creative_id','creative_name','column_name','column_value'])
#     data_cus = pd.DataFrame(columns=['creative_id','creative_name','column_name','column_value'])
#     count_temp = 0
#     count_cus = 0
#     while True:
#         response_creative = creative_service.getCreativesByStatement(statement.ToStatement())
#         if 'results' in response_creative:
#             for creative in response_creative['results']:
#                 list_values = []
#                 if dfp.DfpClassType(creative) == 'TemplateCreative':
#                     if "creativeTemplateVariableValues" in creative:
#                         for creative_value in creative["creativeTemplateVariableValues"]:
#                             if dfp.DfpClassType(creative_value) == 'StringCreativeTemplateVariableValue':
#                                 if len(creative_value) == 2: 
#                                     data_temp.loc[count_temp] =[creative.id,creative.name.encode('utf-8'),creative_value.uniqueName.encode('utf-8'),creative_value.value.encode('utf-8')]
#                                     count_temp += 1
#                                 else:
#                                     data_temp.loc[count_temp] =[creative.id,creative.name.encode('utf-8'),creative_value.uniqueName.encode('utf-8'),None]
#                                     count_temp += 1
#                 if dfp.DfpClassType(creative) == 'CustomCreative':
#                     data_cus.loc[count_cus]=[str(creative.id),str(creative.name),'htmlsnippet',str(creative.htmlSnippet)]
#                     count_cus += 1

#             statement.offset += statement.limit
#         else:
#             break
#     data_temp_creative_df = sqlCtx.createDataFrame(data_temp)
#     data_temp_creative_df.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/template_creatives")
#     data_cus_creative_df = sqlCtx.createDataFrame(data_cus)
#     data_cus_creative_df.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/cus_creatives")
#     return()

# def process_creatives():
#     print "processing creatives"
#     #preprocess creatives and join
#     data_temp_creatives = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/template_creatives")
#     data_cus_creatives = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/cus_creatives")
#     data_temp_creatives = data_temp_creatives.withColumn('column_name',lower(data_temp_creatives.column_name))

#     #joining all the creative datas
#     creatives_data = data_temp_creatives.union(data_cus_creatives)
#     final_creatives_data = creatives_data.groupBy("creative_id","creative_name").pivot("column_name").agg(expr("coalesce(first(column_value))"))
#     final_creatives_data.write.mode('overwrite').parquet("gs://ds-url-catag/dfp_new/final_creatives")
#     return()


# def preprocess_data():
#     print "processing the entire data"
#     start_time = time.time()
#     from pyspark.sql.functions import col
#     data_line_creative_items = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/lica")
#     final_creatives_data = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/final_creatives")
#     data_line_items = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/line_items")
#     data_orders_fin = sqlCtx.read.parquet("gs://ds-url-catag/dfp_new/orders_final")
    
    
#     #joining all the tables
#     result_one = data_line_creative_items.join(final_creatives_data,data_line_creative_items.creativeid == final_creatives_data.creative_id).select([data_line_creative_items.line_item_id.alias('lineitemid')]+[(xx) for xx in final_creatives_data.columns])
#     result_two = result_one.join(data_line_items,data_line_items.line_item_id == result_one.lineitemid).select([data_line_items.order_id.alias('main_order_id'),data_line_items.line_item_id,data_line_items.line_item_name]+[(xx) for xx in result_one.columns])
#     keep = [data_orders_fin[c] for c in data_orders_fin.columns] + [result_two[c] for c in result_two.columns]


#     result_three= result_two.join(data_orders_fin,data_orders_fin.order_id == result_two.main_order_id,"left").select(*keep)
#     result_three.dropDuplicates().write.mode('overwrite').parquet("gs://ds-taste-dfp/raw_iab_data/31-10-2017/result_dfp")
#     print "processed"
#     return(start_time)

    
if __name__ == "__main__":
    # Run the export steps in order. process_order() must come last because
    # it reads the parquet data that the three fetch steps write.
    for pipeline_step in (orders,
                          get_customfields_highlevel,
                          get_customfields_detailed,
                          process_order):
        pipeline_step()
# #     line_items()
# #     lica()
# #     creatives()
#     process_creatives()
#     preprocess_data()




Obtaining orders
Obtained orders
Obtaining custom fields high level
Custom fields high level obtained
Obtaining custom fields detailed
Custom fields detailed level obtained
processing orders
Orders processed


In [18]:
 # Read back the final order table that process_order() wrote to this path.
 data_orders_new = sqlCtx.read.parquet("gs://ds-taste-dfp/order_details/")


In [19]:
# Show the column names of the table loaded above; they are the aliases that
# process_order() assigns in its final select.
data_orders_new.columns

['order_id',
 'Order_name',
 'BBMPerformanceCategory',
 'IABTier1Categorization',
 'IOEndDate',
 'IOStartDate',
 'MAICampaignFlag',
 'NetIOValue',
 'OfferType']