In [13]:
# pyenv global 3.6.13
# python3 -m pip install ipykernel avro_validator uuid kafka pypandoc pyspark pandas pyarrow

import avro_validator
import json
import uuid
import datetime
from kafka import KafkaProducer
import os

In [14]:
# Local data file that will be sent to Kafka row by row, and the JSON file holding its schema
filename_data   = 'routes.dat'
filename_schema = 'data_schema.json'

# Show the on-disk size of both input files
print(f'{filename_data} \t\t{os.path.getsize(filename_data)} bytes')
print(f'{filename_schema} \t{os.path.getsize(filename_schema)} bytes')

routes.dat 		2377148 bytes
data_schema.json 	1670 bytes


In [28]:
# Load the data schema from the external JSON file
# avro schema style source: https://avro.apache.org/docs/current/spec.html
with open(filename_schema) as schema_file:
    data_schema = json.loads(schema_file.read())

# Last expression of the cell: display the loaded schema
data_schema

{'type': 'record',
 'doc': 'This event records routes between airports on airlines.',
 'name': 'AirlineRouteEvent',
 'fields': [{'name': 'id',
   'type': 'string',
   'doc': 'A universally unique identifier that is generated using random numbers'},
  {'name': 'datetime',
   'type': 'string',
   'doc': 'The produced event datetime in UTC format'},
  {'name': 'airline',
   'type': 'string',
   'doc': '2-letter (IATA) or 3-letter (ICAO) code of the airline'},
  {'name': 'airline_id',
   'type': 'int',
   'doc': 'Unique OpenFlights identifier for airline'},
  {'name': 'source_airport',
   'type': 'string',
   'doc': '3-letter (IATA) or 4-letter (ICAO) code of the source airport'},
  {'name': 'source_airport_id',
   'type': 'int',
   'doc': 'Unique OpenFlights identifier for source airport'},
  {'name': 'destination_airport',
   'type': 'string',
   'doc': '3-letter (IATA) or 4-letter (ICAO) code of the destination airport'},
  {'name': 'destination_airport_id',
   'type': 'int',
   'doc': 

In [16]:
def _to_json_bytes(value):
    """Serialize a message value to UTF-8 encoded JSON bytes."""
    return json.dumps(value).encode('utf-8')


# Kafka producer pointed at localhost:9092; every message value is sent as JSON
producer = KafkaProducer(
    value_serializer=_to_json_bytes,
    bootstrap_servers=['localhost:9092'],
)

In [17]:
# Read the whole data file into memory, one list entry per row.
# The context manager closes the file handle as soon as the rows have been read
# (the original left it open for the lifetime of the kernel).
with open(filename_data, 'r') as open_data:
    lines = open_data.readlines()

In [18]:
# Validate data and send row by row to Kafka

def strToBool(codeshare):
    """Map the codeshare column to a boolean.

    Returns True only for the exact string 'Y'; any other value
    (empty string, 'N', lowercase 'y', ...) yields False.
    """
    return codeshare == 'Y'

# Outcome counters: every processed row increments count_total and exactly one of the others
count_error_transform_var    = 0
count_error_invalid_schema   = 0
count_success = 0
count_total = 0

# Number of rows to process; set to None to send the whole file
max_rows = 1000

# The schema is identical for every row, so parse it once instead of once per row
parsed_schema = avro_validator.schema.Schema(json.dumps(data_schema)).parse()


def _percent_of_total(count):
    """Return count as a percentage of count_total, or 0.0 when no row was processed."""
    return count / count_total * 100 if count_total else 0.0


# Mode "w" clears the error log of a previous run; the handle is closed even if the loop raises
with open("error.txt", "w") as error_file:
    for line in lines[:max_rows]:
        count_total += 1
        line = line.rstrip('\n')
        listOfStr = line.strip().split(',')

        # Step 1: cast the raw text fields to the types of the event.
        # Only conversion problems are caught here: int() raises ValueError on
        # non-numeric text, and a row with too few columns raises IndexError.
        try:
            line_dict = {
                'id':                      str(uuid.uuid4()),
                'datetime':                str(datetime.datetime.utcnow()),
                'airline':                 str(listOfStr[0]),
                'airline_id':              int(listOfStr[1]),
                'source_airport':          str(listOfStr[2]),
                'source_airport_id':       int(listOfStr[3]),
                'destination_airport':     str(listOfStr[4]),
                'destination_airport_id':  int(listOfStr[5]),
                'codeshare':               strToBool(listOfStr[6]),
                'stops':                   int(listOfStr[7]),
                'equipment':               str(listOfStr[8]),
            }
        except (ValueError, IndexError):
            error_file.write('A variable type transformation failed: ' + line + '\n')
            count_error_transform_var += 1
            continue

        # Step 2: validate the event against the schema.
        # NOTE(review): avro_validator is documented to raise ValueError for invalid
        # data rather than return False - confirm against the installed version.
        # Both signals are treated as a schema failure here.
        try:
            is_valid = parsed_schema.validate(line_dict)
        except ValueError:
            is_valid = False
        if not is_valid:
            error_file.write('Schema invalid: ' + line + '\n')
            count_error_invalid_schema += 1
            continue

        # Step 3: hand the event to the producer. Errors raised here (for example
        # an unreachable broker) are deliberately not caught: they are not data
        # errors and must not be logged as failed type transformations.
        producer.send("routes_data", value=line_dict)
        count_success += 1

# Block until the queued messages have been sent before reporting them as successful
producer.flush()

print('Total errors by var transform\t' + str(count_error_transform_var) + '\t' + str(_percent_of_total(count_error_transform_var)) + '%')
print('Total errors by invalid schema\t' + str(count_error_invalid_schema) + '\t' + str(_percent_of_total(count_error_invalid_schema)) + '%')
print('Total success \t\t\t' + str(count_success) + '\t' + str(_percent_of_total(count_success)) + '%')
print('Total count\t\t\t' + str(count_total) + '\t' + str(_percent_of_total(count_total)) + '%')

Total errors by var transform	68	6.800000000000001%
Total errors by invalid schema	0	0.0%
Total success 			932	93.2%
Total count			1000	100.0%


In [25]:
# result = [
#     {'Total count': count_total},
#     {'Total count (%)': round(count_total/count_total*100,2)},
#     {'Total success': count_success},
#     {'Total success (%)': round(count_success/count_total*100,2)},
#     {'Total errors by var transform': count_error_transform_var},
#     {'Total errors by var transform (%)': round(count_error_transform_var/count_total*100, 2)},
#     {'Total errors by invalid schema': count_error_invalid_schema},
#     {'Total errors by invalid schema (%)': round(count_error_invalid_schema/count_total*100,2)}
# ]

[{'Total count': 1000},
 {'Total count (%)': 100.0},
 {'Total success': 932},
 {'Total success (%)': 93.2},
 {'Total errors by var transform': 68},
 {'Total errors by var transform (%)': 6.8},
 {'Total errors by invalid schema': 0},
 {'Total errors by invalid schema (%)': 0.0}]