In [34]:
import os
import glob
import json
import pandas as pd
from datetime import datetime
from row import Row

import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, trim
from pyspark.sql.types import IntegerType, LongType, FloatType
from pandas_schema import Column, Schema
import pandas_schema.validation as validation 

from cleaning import validation_schema

# Cleaning

This step removes invalid data from the datasets. An example of invalid data is a negative value in a field representing a distance. For each column of the 4 sub-datasets, assertions are made based on the TLC specifications.

### 1. Identification of the assertions

<img src="img/assertion_fhv_fhvhv.png" width="800" align="left"/>
<img src="img/assertion_green.png" width="800" align="left"/>
<img src="img/assertion_yellow.png" width="800" align="left"/>

### 2. Pandas-schema

A Python module that checks such assertions exists: _pandas_schema_. This module is very easy to use; an example is given for the FHV dataset (the implementation for the other sub-datasets can be found in the _cleaning.py_ file).

In [None]:
from pandas_schema import Column, Schema
import pandas_schema.validation as validation 

# Validation rules for the FHV dataset, as (column name, rule) pairs.
# Every column gets exactly one rule and is allowed to be empty.
fhv_rules = [
    ('dispatching_base_num', validation.MatchesPatternValidation('^B[0-9]{5}$')),
    ('pickup_datetime', validation.DateFormatValidation('%Y-%m-%d %H:%M:%S')),
    ('dropoff_datetime', validation.DateFormatValidation('%Y-%m-%d %H:%M:%S')),
    ('pulocationid', validation.InRangeValidation(1, 266)),
    ('dolocationid', validation.InRangeValidation(1, 266)),
    ('sr_flag', validation.InListValidation([1, None])),
]

# validation schema for fhv dataset
schema_fhv = Schema([
    Column(name, [rule], allow_empty=True)
    for name, rule in fhv_rules
])


def validate(data, schema, validation_schema):
    """
    Check a single row against a validation schema.

    Parameters
    ----------
    data : sequence
        Values of the row, in the same order as ``schema``.
    schema : sequence of str
        Column names used to label the values of ``data``.
    validation_schema : object
        Object exposing a ``validate(dataframe)`` method that returns the
        collection of errors it found.

    Returns
    -------
    bool
        True when the validation reports no error, False otherwise.
    """
    # wrap the row in a one-line dataframe so that the schema can process it
    row_frame = pd.DataFrame([data], columns=schema)
    found_errors = validation_schema.validate(row_frame)
    return len(found_errors) == 0

The _validate_ function is a static method of the _Row_ class.

### 3. Implementation

#### 3.1. Basic cleaning operations

Some records contain errors that are not validity errors but formatting errors. These errors are:

* trailing spaces: "B0005    "
* leading spaces: "    B0005"
* int/float represented as a string: "12"
* empty cell in a numeric column (to be replaced by 0)

These errors are fixed by loading each .csv file into a Spark dataframe and applying the following operations:

* trim: to remove leading and trailing whitespace.
* cast: to convert strings to int or float.
* fillna: to fill the empty cells in the numeric columns.

A dictionary is used to specify on which columns to apply these last two operations.

#### 3.2. Validation step

A version of pandas-schema that runs on PySpark dataframes does not seem to exist at the moment, so the PySpark dataframe has to be loaded on the main (driver) node using the _toPandas()_ method. The validation schema corresponding to the dataset is then applied.

With the errors generated by the validation step, 2 things can be done:

* Rows with errors are deleted.
* Errors are saved to generate statistics on them.

In [3]:
# Environment variables read by Hadoop / PySpark: the Hadoop configuration
# directory and the python configuration (same interpreter for both entries).
_python_bin = "/usr/local/anaconda3/bin/python"
os.environ.update({
    'HADOOP_CONF_DIR': "/etc/hadoop/conf",
    'PYSPARK_PYTHON': _python_bin,
    'PYSPARK_DRIVER_PYTHON': _python_bin,
})

from pyspark.sql import SparkSession
from pyspark import SparkFiles, SQLContext


# Stop any Spark session left over from a previous execution of this cell.
# Only the "name not defined yet" case is silenced; anything else raised while
# stopping is reported, so real problems (and Ctrl-C) are no longer hidden.
try:
    spark
except NameError:
    # first execution in this kernel: there is no session to stop
    pass
else:
    print("Spark application already started. Terminating existing application and starting new one")
    try:
        spark.stop()
    except Exception as exc:
        # best effort: a stale session must not prevent starting a new one
        print("Could not stop the existing Spark session: {}".format(exc))

# Create a new spark session, with YARN as resource manager, requesting
# 4 executors (spark.executor.instances).
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.executor.instances", "4")
    .appName("project_ceci18")
    .getOrCreate()
)

# Create spark context
sc = spark.sparkContext
# NOTE(review): SQLContext is deprecated in recent Spark releases in favour of
# the SparkSession itself; it is kept because the cleaning cell reads through
# `sqlContext`.
sqlContext = SQLContext(sc)

In [5]:
# Root locations of the data: an HDFS URI and a directory on the local
# filesystem.
# NOTE(review): the user name differs between the two paths
# ("hpda000034" vs "hpda00034") — confirm which one is correct.
hdfs_path = 'hdfs://public00:8020/user/hpda000034/infoh600/clean'
local_path = '/home/hpda00034/infoh600/sampled'

In [37]:
# Per-dataset cleaning settings:
#   'cast' maps a target type name to the columns converted to that type,
#   'fill' maps a column name to the value written into its empty cells.
cleaning_conf = dict(
    green=dict(
        cast=dict(int=['pulocationid', 'dolocationid']),
        fill=dict.fromkeys(
            ['extra', 'mta_tax', 'fare_amount', 'ehail_fee', 'tolls_amount'],
            0,
        ),
    ),
    fhv=dict(
        cast=dict(int=['pulocationid', 'dolocationid', 'sr_flag']),
        fill=dict(),
    ),
)


In [52]:
# Clean every file of one dataset: trim and cast the columns with Spark,
# validate the result with pandas-schema, drop the invalid rows and save both
# the cleaned file and the list of validation errors.
dataset = "fhv"

filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

# NOTE(review): `n_trips` and `months` are not used in this cell.
n_trips = []
months = []

cast = cleaning_conf[dataset]['cast']
fill = cleaning_conf[dataset]['fill']

# Spark type for each type name accepted in the 'cast' configuration;
# any other type name is ignored.
spark_types = {'int': IntegerType(), 'float': FloatType()}

# NOTE(review): this hard-coded list overrides the file list built from
# `local_path` above — it looks like a leftover from a run restricted to two
# FHV files; confirm before cleaning another dataset or the full data.
filenames = ['fhv_tripdata_2019-01.csv', 'fhv_tripdata_2019-02.csv']

# validation errors of all the files, one list per error
error_records = []
for filename in filenames:
    print(filename)
    trips = sqlContext.read.csv("./integrated/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True).fillna(fill)

    # remove leading and trailing whitespace in every column
    for column in trips.columns:
        trips = trips.withColumn(column, trim(trips[column]))

    # convert the configured columns to their numeric type
    for type_, cast_columns in cast.items():
        spark_type = spark_types.get(type_)
        if spark_type is None:
            continue
        for column in cast_columns:
            trips = trips.withColumn(column, trips[column].cast(spark_type))

    # the validation works on a pandas dataframe: collect the data locally
    df = trips.toPandas()
    # each error is extended with the name of the file it comes from
    errors = [error + [filename] for error
              in Row.validate(df, validation_schema[dataset])]
    error_records += errors
    # the first item of an error is the label of the faulty row
    rows = [error[0] for error in errors]

    if len(rows) != 0:
        # files that lost rows go to the buffer directory
        df = df.drop(rows, axis=0)
        df.to_csv('../clean/{}/{}'.format('buffer', filename), index=False)
    else:
        # write to the directory of the current dataset (was hard-coded to 'fhv')
        df.to_csv('../clean/{}/{}'.format(dataset, filename), index=False)

columns = ['row', 'column', 'value', 'file']
errors_df = pd.DataFrame(error_records, columns=columns)
errors_df.to_csv('../invalid_data/{}.csv'.format(dataset), index=False)

fhv_tripdata_2019-01.csv
fhv_tripdata_2019-02.csv


In [None]:
# Stop spark
try:
    spark.stop()
except NameError:
    # no Spark session was created in this kernel: nothing to stop
    pass
except Exception as exc:
    # the shutdown stays best-effort, but the failure is reported
    print("Could not stop the Spark session: {}".format(exc))

In [54]:
!hadoop fs -copyFromLocal /home/ceci18/clean/fhv/* ./clean/fhv
!hadoop fs -copyFromLocal /home/ceci18/clean/buffer/* ./clean/fhv
!rm /home/ceci18/clean/buffer/*

### 4. Statistics

In this section we will summarize the validation errors found. For each dataset we will represent the distribution of the errors according to the different files and the distribution of the errors according to the different columns.

In [None]:
# Summarise the validation errors of one dataset as two bar charts
# (errors per column and errors per file) saved in a single figure.
dataset = "green"

path = '/home/ceci18/invalid_data'

invalid_records = pd.read_csv('{}/{}.csv'.format(path, dataset))

if invalid_records.shape[0] == 0:
    print("no error for {}".format(dataset))

else:
    fig, (ax_columns, ax_files) = plt.subplots(2, 1, figsize=(20, 16))
    fig.tight_layout(pad=20.0)

    # (axis, grouping key, title) of each chart: top one first
    charts = (
        (ax_columns, 'column', 'Count of errors by column'),
        (ax_files, 'file', 'Count of errors by file'),
    )
    for axis, key, title in charts:
        error_counts = invalid_records.groupby([key]).size()
        error_counts.plot.bar(ax=axis)
        axis.set_title(title)
        axis.set_ylabel('count')

    plt.savefig('figures/errors_{}.png'.format(dataset))
    plt.close()