In [20]:
import gzip
import json
import os
import csv
import numpy as np
from scipy import stats

In [21]:
# Build the column-name frequency lookup from SchemaPile

In [22]:
# Read the gzip-compressed SchemaPile dump in full, then decode and parse it
# once the file handle has been released.
with gzip.open("../../data/schemapile.json.gz", 'r') as compressed_file:
    raw_bytes = compressed_file.read()
schemapile = json.loads(raw_bytes.decode('utf-8'))

In [24]:
from collections import Counter

# Collect every column name (lower-cased) across all tables of all schemas,
# counting the tables along the way, then tally how often each name occurs.
column_names = []
tables_count = 0
for schema_entry in schemapile.values():
    schema_tables = schema_entry["TABLES"]
    tables_count += len(schema_tables)
    for table_entry in schema_tables.values():
        column_names.extend(name.lower() for name in table_entry["COLUMNS"])
column_names_schemapile = Counter(column_names)

In [None]:
# load ground truth labels

In [25]:
# Root folder of the Pollock files; the loading cell below reads its
# csv/ and parameters/ subfolders.
pollock_folder = "pollock/"

In [26]:
import os
# Map each CSV path under <pollock_folder>csv/ to its ground-truth label:
# the number of header lines plus preamble lines recorded in the matching
# "<file>_parameters.json".
pollock_eval = {}
for file in sorted(os.listdir(pollock_folder+"csv/")):
    # Open via a context manager so the parameters file is closed right away
    # instead of leaking one handle per benchmark file.
    with open(f"{pollock_folder}/parameters/{file}_parameters.json") as params_file:
        pollock_params = json.load(params_file)
    ground_truth = int(pollock_params["header_lines"])+int(pollock_params["preamble_lines"])
    pollock_eval[pollock_folder+"csv/"+file] = ground_truth

In [None]:
# schemapile lookup method

In [28]:
def find_outliers(data):
    """Return the position of the peak value if it dwarfs the entry after it.

    The largest entry of ``data`` is located with ``np.argsort``. It is
    reported as an outlier only when all of the following hold:

    * ``data`` has more than two entries,
    * the peak is not the last entry, and
    * the peak is strictly greater than 10 times the entry that immediately
      follows it in ``data`` (the positional neighbour, not the second
      largest value).

    Args:
        data: Sequence of numeric scores, indexable by integer position.

    Returns:
        A one-element list holding the peak's index (a NumPy integer), or an
        empty list when the conditions above are not met.
    """
    order = np.argsort(data)
    if len(order) <= 2:
        return []

    peak = order[-1]
    follower = peak + 1
    # The bounds check must come first: the peak may be the final entry.
    if follower < len(order) and data[peak] > 10 * data[follower]:
        return [peak]
    return []


def get_header_row_by_lookup(file_content, column_name_counts=None, max_rows=21):
    """Estimate how many leading lines of a CSV text end with the header row.

    Each of the first ``max_rows`` lines is split into cells, and every cell
    is looked up in a table of known column-name frequencies. The per-line
    sums are passed to ``find_outliers``; if it flags a line, that line is
    taken to be the header.

    Args:
        file_content: Raw text of the (beginning of the) CSV file.
        column_name_counts: Mapping from lower-cased column name to its
            occurrence count. Defaults to the notebook-level
            ``column_names_schemapile``.
        max_rows: Number of leading lines to inspect (blank lines included).

    Returns:
        A tuple ``(header_line_count, matches_per_row)``.
        ``header_line_count`` is the 1-based line number of the detected
        header within ``file_content`` (i.e. the number of lines up to and
        including it), or 0 when no header is detected.
        ``matches_per_row`` holds the lookup score of each non-blank
        inspected line, in order.
    """
    if column_name_counts is None:
        # Resolved at call time so the notebook-level lookup is still used.
        column_name_counts = column_names_schemapile

    # Normalise delimiters, quoting and spacing so that a plain split on ","
    # yields cells comparable to the lower-cased lookup keys.
    normalized = (
        file_content.replace(";", ",")
        .replace("\"", "")
        .replace("\t", ",")
        .replace("'", "")
        .replace(" ", "_")
        .lower()
    )

    matches_per_row = []
    line_numbers = []  # 0-based position in the text of every scored line
    for line_number, row in enumerate(normalized.split("\n")[:max_rows]):
        # Text read with newline='' keeps "\r\n"; without this the "\r" would
        # stick to the last cell and prevent it from ever matching.
        row = row.rstrip("\r")
        if not row.split():
            continue

        matches_per_row.append(
            sum(column_name_counts.get(cell, 0) for cell in row.split(","))
        )
        line_numbers.append(line_number)

    potential_headers = find_outliers(matches_per_row)
    if len(potential_headers):
        # Outlier indices refer to the blank-free score list; translate back
        # to the real line number so skipped blank lines are still counted.
        return line_numbers[max(potential_headers)] + 1, matches_per_row
    return 0, matches_per_row

In [29]:
def evaluate(eval_data):
    """Print how often each header-detection approach matches the ground truth.

    Three predictions are compared per file:

    * lookup: the count returned by ``get_header_row_by_lookup``;
    * py_csv: ``csv.Sniffer().has_header`` (a bool, compared to the
      ground-truth count so ``True`` matches 1 and ``False`` matches 0;
      ``None`` when the sniffer raises, which never matches);
    * hybrid: 1 when the sniffer reports a header, otherwise the lookup
      result.

    Args:
        eval_data: Dict mapping a CSV file path to its ground-truth number
            of leading non-data lines.

    Returns:
        None. The three match counts are printed, each out of
        ``len(eval_data)``.
    """
    total_files = len(eval_data)
    matches_lookup = 0
    matches_py_csv = 0
    matches_hybrid = 0

    for csv_file_name, ground_truth in eval_data.items():
        # newline='' keeps the original line terminators in the sample; only
        # the first 10000 characters are read rather than the whole file.
        with open(csv_file_name, newline='') as csv_file:
            csv_file_content = csv_file.read(10000)

        lookup, _ = get_header_row_by_lookup(csv_file_content)

        try:
            pycsv = csv.Sniffer().has_header(csv_file_content)
        except Exception:
            # Best effort: the sniffer cannot handle every sample, and such
            # files simply count as a miss for this approach.
            pycsv = None

        hybrid = 1 if pycsv == 1 else lookup

        if ground_truth == lookup:
            matches_lookup += 1
        if ground_truth == pycsv:
            matches_py_csv += 1
        if ground_truth == hybrid:
            matches_hybrid += 1

    print(f"matches lookup: {matches_lookup}/{total_files}")
    print(f"matches py_csv: {matches_py_csv}/{total_files}")
    print(f"matches hybrid: {matches_hybrid}/{total_files}")

In [None]:
# evaluate approaches

In [None]:
# Print, for each approach, how many files' predictions match the ground truth.
evaluate(pollock_eval)