## Create testutility.py

In [2]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime
import gc
import re


################
# File Reading #
################

def read_config_file(filepath):
    """Parse the YAML file at ``filepath`` and return its contents.

    Args:
        filepath: Path of the YAML document to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document. If the
        document is not valid YAML, the error is logged and ``None`` is
        returned instead.
    """
    try:
        with open(filepath, 'r') as config_handle:
            parsed = yaml.safe_load(config_handle)
    except yaml.YAMLError as parse_error:
        # Malformed YAML is reported through logging rather than raised.
        logging.error(parse_error)
        return None
    return parsed


def replacer(string, char):
    """Collapse every run of two or more consecutive ``char`` into one.

    Args:
        string: Text to clean up.
        char: Character (or substring) whose repeated runs are collapsed.
            It is matched literally, so regex metacharacters such as ``.``
            or ``+`` are safe to pass.

    Returns:
        ``string`` with each run of repeated ``char`` replaced by a single
        ``char``.
    """
    if not char:
        # Nothing to collapse; an empty pattern would match everywhere.
        return string
    # Escape so ``char`` is matched literally, and group it so the
    # quantifier applies to the whole of a multi-character ``char``.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps backslashes in ``char`` from being
    # interpreted as escape sequences by re.sub.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,table_config):
    '''
    Normalise the column names of ``df`` and compare them with the schema.

    Normalisation lower-cases each name, replaces every non-word character
    with ``_``, strips leading/trailing underscores and collapses runs of
    underscores. The normalised names are written back to ``df.columns``,
    so the caller's frame is renamed in place; its data and column order
    are left untouched.

    The comparison ignores column order and letter case: the normalised
    file columns and the lower-cased ``table_config['columns']`` are both
    sorted before being compared.

    Args:
        df: Frame whose header is validated. Its column labels are processed
            with the ``.str`` accessor, so they need to be strings.
        table_config: Mapping with a ``'columns'`` entry listing the expected
            column names.

    Returns:
        1 if the two column lists match, otherwise 0. On mismatch the
        differences are printed and both lists are logged at INFO level.
    '''
    normalised = (
        df.columns.str.lower()
        .str.replace(r'[^\w]', '_', regex=True)
        .str.strip('_')
        .str.replace(r'_{2,}', '_', regex=True)
    )
    df.columns = list(normalised)

    expected_col = sorted(name.lower() for name in table_config['columns'])
    # Sort the names only; reindexing the frame would copy all of its data
    # and raise if normalisation produced duplicate labels.
    actual_col = sorted(df.columns)

    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Overwriting testutility.py


## Write YAML file

In [3]:
%%writefile file.yaml
# Ingestion config for the flights CSV; the notebook loads it with util.read_config_file.
file_type: csv
dataset_name: flights
file_name: Combined_Flights_2021
# Path of the CSV handed to the read_csv calls in the notebook.
file_path: data/Combined_Flights_2021.csv
# Delimiter of the incoming file; used as the read_csv separator.
inbound_delimiter: ","
# NOTE(review): outbound_delimiter and skip_leading_rows are not read by any
# visible cell - presumably meant for a later write/ingest step; confirm.
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected column names; col_header_val lower-cases and sorts them before
# comparing with the normalised file header.
columns:
    - FlightDate
    - Airline
    - Origin
    - Dest
    - Cancelled
    - Diverted
    - CRSDepTime
    - DepTime
    - DepDelayMinutes
    - DepDelay
    - ArrTime
    - ArrDelayMinutes
    - AirTime
    - CRSElapsedTime
    - ActualElapsedTime
    - Distance
    - Year
    - Quarter
    - Month
    - DayofMonth
    - DayOfWeek
    - Marketing_Airline_Network
    - Operated_or_Branded_Code_Share_Partners
    - DOT_ID_Marketing_Airline
    - IATA_Code_Marketing_Airline
    - Flight_Number_Marketing_Airline
    - Operating_Airline
    - DOT_ID_Operating_Airline
    - IATA_Code_Operating_Airline
    - Tail_Number
    - Flight_Number_Operating_Airline
    - OriginAirportID
    - OriginAirportSeqID
    - OriginCityMarketID
    - OriginCityName
    - OriginState
    - OriginStateFips
    - OriginStateName
    - OriginWac
    - DestAirportID
    - DestAirportSeqID
    - DestCityMarketID
    - DestCityName
    - DestState
    - DestStateFips
    - DestStateName
    - DestWac
    - DepDel15
    - DepartureDelayGroups
    - DepTimeBlk
    - TaxiOut
    - WheelsOff
    - WheelsOn
    - TaxiIn
    - CRSArrTime
    - ArrDelay
    - ArrDel15
    - ArrivalDelayGroups
    - ArrTimeBlk
    - DistanceGroup
    - DivAirportLandings

Overwriting file.yaml


In [4]:
# Read config file
import importlib

import testutility as util

# testutility.py is (re)written by the %%writefile cell earlier in the
# notebook; reload so a re-run in the same kernel picks up the new source
# instead of the module cached by a previous import.
util = importlib.reload(util)

config_data = util.read_config_file("file.yaml")

In [5]:
# Display the parsed config to inspect its contents
config_data

{'file_type': 'csv',
 'dataset_name': 'flights',
 'file_name': 'Combined_Flights_2021',
 'file_path': 'data/Combined_Flights_2021.csv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['FlightDate',
  'Airline',
  'Origin',
  'Dest',
  'Cancelled',
  'Diverted',
  'CRSDepTime',
  'DepTime',
  'DepDelayMinutes',
  'DepDelay',
  'ArrTime',
  'ArrDelayMinutes',
  'AirTime',
  'CRSElapsedTime',
  'ActualElapsedTime',
  'Distance',
  'Year',
  'Quarter',
  'Month',
  'DayofMonth',
  'DayOfWeek',
  'Marketing_Airline_Network',
  'Operated_or_Branded_Code_Share_Partners',
  'DOT_ID_Marketing_Airline',
  'IATA_Code_Marketing_Airline',
  'Flight_Number_Marketing_Airline',
  'Operating_Airline',
  'DOT_ID_Operating_Airline',
  'IATA_Code_Operating_Airline',
  'Tail_Number',
  'Flight_Number_Operating_Airline',
  'OriginAirportID',
  'OriginAirportSeqID',
  'OriginCityMarketID',
  'OriginCityName',
  'OriginState',
  'OriginStateFips',
  'OriginStateNam

## Load data

##### Pandas:

In [6]:
# Read the CSV described by the config file with pandas and time the load.
from datetime import datetime
import pandas as pd

file_type = config_data['file_type']
file_path = config_data['file_path']

t1 = datetime.now()

# Pass the delimiter by keyword: positional arguments after the path are
# deprecated in pandas 1.x and rejected by pandas 2.x.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column that file.yaml
# does not list - if it is a saved index, consider index_col=0; confirm
# against the data.
df = pd.read_csv(file_path, sep=config_data['inbound_delimiter'])

t2 = datetime.now()

time_pandas = t2-t1

df.head()

  df = pd.read_csv(file_path, config_data['inbound_delimiter'])


Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0


In [7]:
# Report the elapsed time measured around the pandas read_csv call.
print(f"It take {time_pandas} to load dataset using Pandas")

It take 0:00:29.122581 to load dataset using Pandas


##### Dask

In [8]:
# Build a Dask DataFrame over the same CSV and time the read_csv call.
import dask.dataframe as dd

t1 = datetime.now()

# NOTE(review): dask's read_csv is lazy, so the interval timed here presumably
# covers only building the task graph, not parsing the file (consistent with
# the millisecond timing reported below) - it is not directly comparable with
# the pandas figure. Confirm before drawing conclusions.
df = dd.read_csv(file_path, delimiter = config_data['inbound_delimiter'])

t2 = datetime.now()

time_dask = t2-t1

# head() runs outside the timed interval.
df.head()

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0


In [9]:
# Report the elapsed time measured around the Dask read_csv call.
print(f"It take {time_dask} to load dataset using Dask")

It take 0:00:00.007772 to load dataset using Dask


##### Modin

In [10]:
# Load the same CSV with Modin (Dask engine) and time the load.
import os

# Select the engine before Modin is imported, as the Modin docs recommend,
# so the choice is already in place whenever Modin reads its configuration.
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask

import modin.pandas

t1 = datetime.now()

df = modin.pandas.read_csv(file_path, delimiter = config_data['inbound_delimiter'])

t2 = datetime.now()

time_modin = t2-t1

df.head()


    from distributed import Client

    client = Client()

2022-11-11 00:31:41,517 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/55/8329xk652qqcmwyx7rj48p6m0000gn/T/dask-worker-space/worker-27ie24qz', purging
2022-11-11 00:31:41,518 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/55/8329xk652qqcmwyx7rj48p6m0000gn/T/dask-worker-space/worker-vwu18sth', purging
2022-11-11 00:31:41,519 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/55/8329xk652qqcmwyx7rj48p6m0000gn/T/dask-worker-space/worker-o3_hlqki', purging
2022-11-11 00:31:41,520 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/55/8329xk652qqcmwyx7rj48p6m0000gn/T/dask-worker-space/worker-2_b5r0mt', purging
2022-11-11 00:31:41,521 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/55/8329xk652qqcmwyx7rj48p6m0000gn/T/dask-worker-space/worker-gj_8toar', purging
2022-

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2021-03-03,SkyWest Airlines Inc.,SGU,PHX,False,False,724,714.0,0.0,-10.0,...,724.0,813.0,5.0,843,-25.0,0.0,-2.0,0800-0859,2,0.0
1,2021-03-03,SkyWest Airlines Inc.,PHX,SGU,False,False,922,917.0,0.0,-5.0,...,940.0,1028.0,3.0,1040,-9.0,0.0,-1.0,1000-1059,2,0.0
2,2021-03-03,SkyWest Airlines Inc.,MHT,ORD,False,False,1330,1321.0,0.0,-9.0,...,1336.0,1445.0,16.0,1530,-29.0,0.0,-2.0,1500-1559,4,0.0
3,2021-03-03,SkyWest Airlines Inc.,DFW,TRI,False,False,1645,1636.0,0.0,-9.0,...,1703.0,1955.0,7.0,2010,-8.0,0.0,-1.0,2000-2059,4,0.0
4,2021-03-03,SkyWest Airlines Inc.,PHX,BFL,False,False,1844,1838.0,0.0,-6.0,...,1851.0,1900.0,3.0,1925,-22.0,0.0,-2.0,1900-1959,2,0.0


In [11]:
# Report the elapsed time measured around the Modin read_csv call.
print(f"It take {time_modin} to load dataset using Modin")

It take 0:00:35.231856 to load dataset using Modin


##### Ray

In [None]:
# Read the same CSV with Ray Data and time the read_csv call.
import ray
# ray.init(dashboard_host="0.0.0.0")

t1 = datetime.now()

# NOTE(review): ray.data.read_csv appears to forward extra keyword arguments
# to pyarrow's CSV reader, which takes the delimiter via
# parse_options=pyarrow.csv.ParseOptions(delimiter=...) rather than a bare
# `delimiter` keyword - verify against the Ray docs.
df = ray.data.read_csv(file_path, delimiter = config_data['inbound_delimiter'])

t2 = datetime.now()

time_ray = t2-t1

# NOTE(review): this rebinds `df`, which the validation cells later pass to
# util.col_header_val. A Ray Dataset is presumably not a pandas-style frame
# (no `.head()` / `.columns` attribute) - confirm, and consider binding the
# Ray result to a separate name.
df.head()

2022-11-11 00:32:17,673	ERROR services.py:1403 -- Failed to start the dashboard: Failed to start the dashboard, return code 1
Failed to read dashboard log: [Errno 2] No such file or directory: '/tmp/ray/session_2022-11-11_00-32-16_236534_26504/logs/dashboard.log'
2022-11-11 00:32:17,675	ERROR services.py:1404 -- Failed to start the dashboard, return code 1
Failed to read dashboard log: [Errno 2] No such file or directory: '/tmp/ray/session_2022-11-11_00-32-16_236534_26504/logs/dashboard.log'
Traceback (most recent call last):
  File "/Users/yujh01/.local/lib/python3.10/site-packages/ray/_private/services.py", line 1375, in start_api_server
    with open(dashboard_log, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ray/session_2022-11-11_00-32-16_236534_26504/logs/dashboard.log'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/yujh01/.local/lib/python3.10/site-packages/ray/_private/services

## Validation

In [None]:
# Validate the file header against the columns listed in the config;
# col_header_val also renames df's columns to their normalised form.
util.col_header_val(df, config_data)

In [None]:
# Show both column lists side by side for manual comparison.
print("columns of files are:", df.columns)
print("columns of YAML are:", config_data['columns'])

In [None]:
# Gate the rest of the pipeline on the header check
# (col_header_val returns 0 on a mismatch, 1 otherwise).
if util.col_header_val(df, config_data) != 0:
    print("col validation passed")
    # write the code to perform further action
    # in the pipeline
else:
    print("validation failed")
    # write code to reject the file