### Week 3
### DSC 650
### Abed Tabbalat

In [2]:
# Mounting drive
import os
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
os.chdir('/content/drive/My Drive/DSC650/Original/dsc650/assignments/assignment03')
!pwd

Mounted at /content/drive
/content/drive/My Drive/DSC650/Original/dsc650/assignments/assignment03


In [3]:
!pip install s3fs
!pip install fastavro
!pip install pygeohash
!pip install snappy
!pip install genson
!pip install python-snappy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting s3fs
  Downloading s3fs-2023.3.0-py3-none-any.whl (27 kB)
Collecting aiobotocore~=2.4.2
  Downloading aiobotocore-2.4.2-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp!=4.0.0a0,!=4.0.0a1
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
Collecting aioitertools>=0.5.1
  Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)
Collecting botocore<1.27.60,>=1.27.59
  Downloading botocore-1.27.59-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-c

In [4]:
import os
import sys
import gzip
import json
from pathlib import Path
import csv

import pandas as pd
import s3fs
import pyarrow as pa
from pyarrow.json import read_json
import pyarrow.parquet as pq
import fastavro
from fastavro import parse_schema
from fastavro import writer
import pygeohash
import jsonschema
from jsonschema.exceptions import ValidationError
from genson import SchemaBuilder
import routes_pb2
import snappy as snappy

endpoint_url='https://storage.budsc.midwest-datascience.com'

current_dir = Path(os.getcwd()).absolute()
schema_dir = current_dir.joinpath('schemas')
results_dir = current_dir.joinpath('results')
results_dir.mkdir(parents=True, exist_ok=True)


def read_jsonl_data():
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    src_data_path = '../../../data/processed/openflights/routes.jsonl.gz'
    #with s3.open(src_data_path, 'rb') as f_gz:
    with gzip.open(src_data_path, 'rb') as f:
        records = [json.loads(line) for line in f.readlines()]
        

    return records

In [5]:
os.getcwd()

'/content/drive/MyDrive/DSC650/Original/dsc650/assignments/assignment03'

In [7]:
records = read_jsonl_data()
records[0:1]

[{'airline': {'airline_id': 410,
   'name': 'Aerocondor',
   'alias': 'ANA All Nippon Airways',
   'iata': '2B',
   'icao': 'ARD',
   'callsign': 'AEROCONDOR',
   'country': 'Portugal',
   'active': True},
  'src_airport': {'airport_id': 2965,
   'name': 'Sochi International Airport',
   'city': 'Sochi',
   'country': 'Russia',
   'iata': 'AER',
   'icao': 'URSS',
   'latitude': 43.449902,
   'longitude': 39.9566,
   'altitude': 89,
   'timezone': 3.0,
   'dst': 'N',
   'tz_id': 'Europe/Moscow',
   'type': 'airport',
   'source': 'OurAirports'},
  'dst_airport': {'airport_id': 2990,
   'name': 'Kazan International Airport',
   'city': 'Kazan',
   'country': 'Russia',
   'iata': 'KZN',
   'icao': 'UWKD',
   'latitude': 55.606201171875,
   'longitude': 49.278701782227,
   'altitude': 411,
   'timezone': 3.0,
   'dst': 'N',
   'tz_id': 'Europe/Moscow',
   'type': 'airport',
   'source': 'OurAirports'},
  'codeshare': False,
  'equipment': ['CR2']}]

## 3.1

### 3.1.a JSON Schema

In [8]:
def create_schema_file(records, schema_file_path):
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {}})
    builder.add_object(records)
    schema = builder.to_json(indent=2)

    with open(schema_file_path, 'w') as f:
        f.write(schema)

    return schema


schema_path = schema_dir.joinpath('routes-schema.json')
schema = create_schema_file(records, schema_path)
print(schema)

{
  "$schema": "http://json-schema.org/schema#",
  "anyOf": [
    {
      "type": "object"
    },
    {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "airline": {
            "type": "object",
            "properties": {
              "airline_id": {
                "type": "integer"
              },
              "name": {
                "type": "string"
              },
              "alias": {
                "type": "string"
              },
              "iata": {
                "type": "string"
              },
              "icao": {
                "type": "string"
              },
              "callsign": {
                "type": "string"
              },
              "country": {
                "type": "string"
              },
              "active": {
                "type": "boolean"
              }
            },
            "required": [
              "active",
              "airline_id",
              "alias",


In [9]:
def create_schema_file(records, schema_file_path):
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {}})
    builder.add_object(records)
    schema = builder.to_json(indent=2)

    with open(schema_file_path, 'w') as f:
        f.write(schema)

    return schema


def validate_jsonl_data(records, schema_path, validation_csv_path):
    with open(schema_path) as f:
        schema = json.load(f)

    with open(validation_csv_path, 'w', newline='') as f:
        for i, record in enumerate(records):
            writer = csv.writer(f)

            try:
                jsonschema.validate(record, schema)
            except ValidationError as e:
                f.write(f"Error: {e.message}; failed validating {e.validator} in schema {e.schema_path}\r\n")
                print(e)


schema_path = schema_dir.joinpath('routes-schema.json')
schema = create_schema_file(records, schema_path)

validation_csv_path = results_dir.joinpath('validation-results.csv')
validate_jsonl_data(records, schema_path, validation_csv_path)

### 3.1.b Avro

In [10]:
def create_avro_dataset(records):
    schema_path = schema_dir.joinpath('routes.avsc')
    data_path = results_dir.joinpath('routes.avro')
    ## TODO: Use fastavro to create Avro dataset
    

    with open(schema_path, 'r') as f:
        schema = json.load(f)


    parsed_schema = parse_schema(schema)


    with open(data_path, 'wb') as out:
        writer(out, parsed_schema, records)
    
        
create_avro_dataset(records)

### 3.1.c Parquet

In [11]:
def create_avro_dataset(records):
    schema_path = schema_dir.joinpath('routes.avsc')
    data_path = results_dir.joinpath('routes.avro')

    with open(schema_path, 'r') as f:
        schema = json.load(f)

    parsed_schema = parse_schema(schema)

    with open(data_path, 'wb') as out:
        writer(out, parsed_schema, records)

create_avro_dataset(records)

### 3.1.d Protocol Buffers

In [12]:
sys.path.insert(0, os.path.abspath('routes_pb2'))

def _airport_to_proto_obj(airport):
    obj = routes_pb2.Airport()
    if airport is None:
        return None
    if airport.get('airport_id') is None:
        return None

    obj.airport_id = airport.get('airport_id')
    if airport.get('name'):
        obj.name = airport.get('name')
    if airport.get('city'):
        obj.city = airport.get('city')
    if airport.get('iata'):
        obj.iata = airport.get('iata')
    if airport.get('icao'):
        obj.icao = airport.get('icao')
    if airport.get('altitude'):
        obj.altitude = airport.get('altitude')
    if airport.get('timezone'):
        obj.timezone = airport.get('timezone')
    if airport.get('dst'):
        obj.dst = airport.get('dst')
    if airport.get('tz_id'):
        obj.tz_id = airport.get('tz_id')
    if airport.get('type'):
        obj.type = airport.get('type')
    if airport.get('source'):
        obj.source = airport.get('source')

    obj.latitude = airport.get('latitude')
    obj.longitude = airport.get('longitude')

    return obj


def _airline_to_proto_obj(airline):
    obj = routes_pb2.Airline()

    if airline is None:
        return None
    if airline.get('airline_id') is None:
        return None

    obj.airline_id = airline.get('airline_id')

    if airline.get('name'):
        obj.name = airline.get('name')
    if airline.get('alias'):
        obj.name = airline.get('alias')
    if airline.get('iata'):
        obj.name = airline.get('iata')
    if airline.get('icao'):
        obj.name = airline.get('icao')
    if airline.get('callsign'):
        obj.name = airline.get('callsign')
    if airline.get('country'):
        obj.name = airline.get('country')
    
    obj.active = airline.get('active') # boolean
    
    return obj



def create_protobuf_dataset(records):
    routes = routes_pb2.Routes()
    for record in records:
        route = routes_pb2.Route()

        airline = _airline_to_proto_obj(record.get('airline'))
        if airline:
            route.airline.CopyFrom(airline)

        src_airport = _airport_to_proto_obj(record.get('src_airport'))
        if src_airport:
            route.src_airport.CopyFrom(src_airport)

        dst_airport = _airport_to_proto_obj(record.get('dst_airport'))
        if dst_airport:
            route.dst_airport.CopyFrom(dst_airport)

        route.codeshare = record.get('codeshare')

        equipment = record.get('equipment')
        for equip in equipment:
            route.equipment.append(equip)

        routes.route.append(route)

    data_path = results_dir.joinpath('routes.pb')

    with open(data_path, 'wb') as f:
        f.write(routes.SerializeToString())

    compressed_path = results_dir.joinpath('routes.pb.snappy')

    with open(compressed_path, 'wb') as f:
        f.write(snappy.compress(routes.SerializeToString()))

create_protobuf_dataset(records)

## 3.2

### 3.2.a Simple Geohash Index

In [13]:
def create_hash_dirs(records):
    geoindex_dir = results_dir.joinpath('geoindex')
    geoindex_dir.mkdir(exist_ok=True, parents=True)
    hashes = []

    for record in records:
        src_airport = record.get('src_airport', {})

        if src_airport:
            lat = src_airport.get('latitude')
            lon = src_airport.get('longitude')
            if lat and lon:
                hashes.append(pygeohash.encode(lat, lon))

    hashes.sort()
    three_char = sorted(list(set([entry[:3] for entry in hashes])))
    hash_index = {value: [] for value in three_char}

    for record in records:
        geohash = record.get('geohash')
        if geohash:
            hash_index[geohash[:3]].append(record)

    for key, values in hash_index.items():
        output_dir = geoindex_dir.joinpath(str(key[:1])).joinpath(str(key[:2]))
        output_dir.mkdir(exist_ok=True, parents=True)
        output_path = output_dir.joinpath(f"{key}.jsonl.gz")

        with gzip.open(output_path, 'w') as f:
            json_output = '\n'.join([json.dumps(value) for value in values])
            f.write(json_output.encode('utf-8'))

create_hash_dirs(records)

### 3.2.b Simple Search Feature

In [14]:
def airport_search(latitude, longitude):
    input_hash = pygeohash.encode(latitude, longitude)

    distance = 0
    name = ''

    for idx, record in enumerate(records):
        src_airport = record.get('src_airport', {})

        if src_airport:
            lat = src_airport.get('latitude')
            lon = src_airport.get('longitude')
            airport_name = src_airport.get('name')

            if lat and lon:
                airport_hash = pygeohash.encode(lat, lon)
                dist_n = pygeohash.geohash_approximate_distance(input_hash, airport_hash)
                
                if idx == 0:
                    distance = dist_n
                else:
                    if distance > dist_n:
                        distance = dist_n
                        name = airport_name
                        
    print(f"Closest airport is {name}")

airport_search(41.1499988, -95.91779)

Closest airport is Eppley Airfield
