In [8]:
import pandas as pd
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import os

In [9]:
# BigQuery schema for the hired_employees table, written in the
# comma-separated "column:TYPE" string form.
hem_schema = ", ".join([
    "id:INTEGER",
    "name:STRING",
    "datetime:DATETIME",
    "department_id:INTEGER",
    "job_id:INTEGER",
])
# Destination table, "<project>:globant.hired_employees". The project id is
# read from the GCP_PROJECT environment variable (KeyError if it is unset).
table_spec = "{}:globant.hired_employees".format(os.environ['GCP_PROJECT'])

In [10]:
def coalesce_null_ids(data):
    """Replace empty department/job ids with the sentinel string '-1'.

    Args:
        data: Sequence of field strings for one row; index 3 is the
            department id and index 4 is the job id.

    Returns:
        A new list with the same fields, where positions 3 and 4 hold '-1'
        if they were the empty string. The input is left untouched because
        Beam transforms must not modify their input elements.

    Raises:
        IndexError: If the row has fewer than five fields.
    """
    row = list(data)
    for position in (3, 4):
        if row[position] == "":
            row[position] = '-1'
    return row
    
def fix_timestamp(data):
    """Rewrite the row's timestamp from 'YYYY-MM-DDTHH:MM:SSZ' to 'YYYY-MM-DD HH:MM:SS'.

    Args:
        data: Sequence of field strings for one row; index 2 is the timestamp.

    Returns:
        A new list with the timestamp at index 2 reformatted. A row whose
        timestamp is the empty string is returned as-is. The input is never
        modified because Beam transforms must not mutate their input elements.

    Raises:
        ValueError: If a non-empty timestamp does not match
            '%Y-%m-%dT%H:%M:%SZ'.
        IndexError: If the row has fewer than three fields.
    """
    # Imported inside the function so the name is available wherever the
    # function is executed, not only in the notebook's global namespace.
    import datetime

    raw = data[2]
    if raw == '':
        # NOTE(review): the empty string is passed through unchanged; confirm
        # the downstream DATETIME column accepts it.
        return data
    parsed = datetime.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%SZ")
    row = list(data)
    row[2] = parsed.strftime("%Y-%m-%d %H:%M:%S")
    return row

def create_dict(data):
    """Turn a positional row into a dict keyed by column name.

    Args:
        data: Indexable row whose first five entries are, in order, the id,
            name, datetime, department id and job id.

    Returns:
        A dict with the keys 'id', 'name', 'datetime', 'department_id' and
        'job_id' mapped to the corresponding entries of ``data``.
    """
    column_names = ('id', 'name', 'datetime', 'department_id', 'job_id')
    # Index explicitly (rather than zip) so a short row still raises IndexError.
    return {column: data[position] for position, column in enumerate(column_names)}

In [12]:
# Pipeline settings come from the environment: GCP_PROJECT, GCP_BUCKET (used
# for the temp location) and GCP_REGION must all be set, otherwise this cell
# raises KeyError.
options = PipelineOptions(
    project=os.environ['GCP_PROJECT'],
    temp_location="gs://" + os.environ['GCP_BUCKET'] + "/beam_temp_location",
    region=os.environ['GCP_REGION'],
)

# Leaving the `with` block runs the pipeline and waits for it to finish, so
# there must be no explicit pipe.run() inside it (that would execute the whole
# job a second time).
with beam.Pipeline(options=options) as pipe:
    ip = (
        pipe
        # skip_header_lines is a line count; 0 means every line is read as data.
        | "Read hired employees csv" >> beam.io.ReadFromText('gs://arojasb3-globant-challenge-2023/hired_employees.csv', skip_header_lines=0)
        # NOTE(review): plain split on ',' -- a quoted field that itself
        # contains a comma would be split into extra columns.
        | "Split CSV by commas" >> beam.Map(lambda x: x.split(","))
        | "Parse Timestamps for Bigquery" >> beam.Map(fix_timestamp)
        | "Coalesce empty departments and jobs" >> beam.Map(coalesce_null_ids)
        | "Parse list into dict for BQ" >> beam.Map(create_dict)
        # WRITE_TRUNCATE replaces the table contents on every run;
        # CREATE_IF_NEEDED creates the table from hem_schema when missing.
        | beam.io.WriteToBigQuery(
            table_spec,
            schema=hem_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
        )
    )

