In [1]:
import csv
import json

In [2]:
def _parsePairs(text):
    """Parse ``key: value; key: value`` text into a dict of stripped strings.

    Only the first colon of each pair separates key from value, so values
    may themselves contain colons. Fragments that are empty or whitespace
    only (for example after a trailing ``;``) are skipped.

    Raises:
        ValueError: if a non-blank fragment contains no colon.
    """
    pairs = {}
    for pair in text.split(";"):
        if not pair.strip():
            continue
        key, value = pair.split(":", 1)
        pairs[key.strip()] = value.strip()
    return pairs


def parseRow(row, headers, datatypes):
    """Convert one CSV row into a dict keyed by column header.

    Each cell is converted according to the data type declared for its
    column (checked in this order):

    * ``unique`` or ``string``      -> the cell text, unchanged
    * contains ``category``         -> the cell text, unchanged
    * ``number``                    -> ``int(cell)``
    * ``list_custom1``              -> list of dicts; records are separated
                                       by ``;;`` and each record is a set of
                                       ``key: value`` pairs separated by ``;``
    * contains ``list``             -> list of stripped strings split on ``;``
    * contains ``map``              -> dict of ``key: value`` pairs separated
                                       by ``;``
    * anything else                 -> the empty string

    Columns whose header is the empty string are left out of the result.

    Args:
        row: list of cell strings for one CSV line.
        headers: list of column names, parallel to ``row``.
        datatypes: list of data type names, parallel to ``row``.

    Returns:
        A dict mapping header to converted value, or ``None`` when ``row``
        is empty.

    Raises:
        ValueError: if a ``number`` cell is not an integer literal, or a
            non-blank pair in a ``list_custom1``/``map`` cell has no colon.
        IndexError: if ``row`` has more cells than ``headers`` or
            ``datatypes``.
    """
    if len(row) == 0:
        return None

    result = {}
    for i, element in enumerate(row):
        dtype = datatypes[i]
        header = headers[i]

        if len(header) == 0:
            continue

        # Fallback for data types not handled below.
        obj = ""

        if dtype == "unique" or dtype == "string":
            obj = element
        elif "category" in dtype:
            obj = element
        elif dtype == "number":
            obj = int(element)
        elif dtype == "list_custom1":
            # Must be tested before the generic "list" substring check below.
            obj = [
                _parsePairs(record)
                for record in element.split(';;')
                if record.strip()
            ]
        elif "list" in dtype:
            obj = []
            if element:
                obj = [entry.strip() for entry in element.split(';')]
        elif "map" in dtype:
            obj = _parsePairs(element)

        result[header] = obj

    return result

In [3]:
# JSON file will be created here
jsonPath = "../data/BedBugProductData.json"

# Source product data file has one row per product
csvPath = "../data/BedBugProductData.csv"


# Source ORPA data file has multiple lines per product, not all products have rows
# 'ORPA' stands for OtherReferencedProductAttributes
orpaPath = "../data/OtherReferencedProductAttributes.csv"
orpaData = {}  # map product id -> list of that product's rows in ORPA file

# Read the ORPA data and group its rows by the value of the "id" column.
# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are interpreted correctly.
with open(orpaPath, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=",", quotechar='"')
    header = next(reader)

    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line
            continue
        row_d = {name: row[i] for i, name in enumerate(header)}
        orpaId = row_d.pop("id")
        # Grouping directly by id keeps each product's rows in file order and
        # needs neither a prior sort nor a sentinel "current id".
        orpaData.setdefault(orpaId, []).append(row_d)

# Read and parse the product data. All parsing happens before the JSON file
# is opened for writing, so a failure part-way through does not leave a
# truncated output file behind.
with open(csvPath, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=",", quotechar='"')

    datatypes = next(reader)  # first line: data type of each column
    headers = next(reader)    # second line: column names
    data = []

    for row in reader:
        parsedRow = parseRow(row, headers, datatypes)
        if parsedRow is None:
            continue

        # Attach the product's ORPA rows, if it has any
        productOrpa = orpaData.get(parsedRow["id"])
        if productOrpa:
            parsedRow["otherReferencedProductAttributes"] = productOrpa

        data.append(parsedRow)

# Store the combined product data as a JSON file
with open(jsonPath, 'w') as jsonfile:
    json.dump(data, jsonfile)

In [7]:
from pathlib import Path

In [8]:
# Create list of PDF files in static directory and save as resource_list.json in data directory
#
# Each file name is split as follows: the ".pdf" suffix is removed, anything
# after the first underscore is dropped, the text before the first "." is the
# product id, and the remainder is split on " - ". The second " - " part (if
# present) is the category and the third is the study kind.
d = {"labAndField": {}, "labAndFieldSummary": {}, "specimenLabel": {}, "safetyDataSheet": {}, "allOtherLinks": {}}

# Sorted so that the result (including which file wins when two files map to
# the same slot) does not depend on directory listing order.
pdfNames = sorted(file.name for file in Path("../static/").glob("*") if file.name.endswith('.pdf'))

for fn in pdfNames:
    stem = fn[:-4].split('_')[0]

    # Split on the first "." only, so a later "." in the name cannot break parsing
    prodId, dot, rest = stem.partition('.')
    if not dot:
        # No "<id>." prefix: report the file, like other unclassifiable entries
        print([stem, fn])
        continue

    # line is [product id, part, part, ..., original file name]
    line = [prodId] + rest.strip().split(' - ') + [fn]
    cat = line[2]

    if cat in ('Lab Study', 'Field Study', 'Lab Study and Field Study'):
        studyCat = line[3]
        if studyCat == 'Clean':
            d["labAndField"][prodId] = fn
        elif studyCat == 'Summary':
            d["labAndFieldSummary"][prodId] = fn
        elif studyCat == 'Marked':
            pass
        else:
            print(line)
    elif cat == 'Label':
        # Exact comparison: `cat in ('Label')` would be a substring test
        d["specimenLabel"][prodId] = fn
    elif cat == 'SDS':
        d["safetyDataSheet"][prodId] = fn
    elif cat != fn:
        d["allOtherLinks"].setdefault(prodId, []).append([cat, fn])
    else:
        # Only one " - " part, so line[2] is the file name itself: no category
        print(line)

# Written last so that a failure above does not leave an empty JSON file
with open("../data/resource_list.json", 'w') as f:
    json.dump(d, f)

['13a', 'MGK Bed Bug Control Tips', '13a. MGK Bed Bug Control Tips.pdf']
['13a', 'MGK Bed Bug Protocol', '13a. MGK Bed Bug Protocol.pdf']
['33', 'Zoecon Protocol for Commercial and Residential Accounts', '33. Zoecon Protocol for Commercial and Residential Accounts.pdf']
['34', 'Zoecon Hotel & Bedroom Protocol', '34. Zoecon Hotel & Bedroom Protocol.pdf']
['35', 'Zoecon Zenprox Network of Control', '35. Zoecon Zenprox Network of Control.pdf']
['3a', 'CB-80', 'Lab Study', '3a. CB-80 - Lab Study.pdf']
['64b', 'Cimi-Shield Website Sheet', '64b. Cimi-Shield Website Sheet.pdf']
