In [135]:
import pandas as pd
from io import StringIO
import dateutil.parser as dp
import airtable
import json

In [136]:
# yk = pd.read_csv("../sample_data/Yukon.csv")

In [137]:
# Fetch the column-mapping table through the project-local `airtable` helper.
# NOTE(review): presumably one row per organization and one column per target
# field, each cell naming the raw CSV column to copy — inferred from how
# get_sample_data() reads it; confirm against airtable.get_df().
col_df = airtable.get_df()

In [138]:
# Index the mapping by organization so a row can be selected by location code
# (e.g. "federal", "bc"). This mutates col_df in place: re-running this cell
# without re-fetching col_df raises KeyError, because "organization" is no
# longer a column.
col_df.set_index("organization", inplace=True)

In [139]:
# Drop the "id" column so it is not treated as a mappable target field; a
# fresh "id" is assigned once the per-source frames have been assembled.
del col_df["id"]

In [172]:
def get_sample_data(col_df, location_code, raw_df):
    """Build a frame in the common schema from one organization's raw export.

    Parameters
    ----------
    col_df : pandas.DataFrame
        Mapping table indexed by organization. Each column is a target field
        and each cell holds the name of the raw column to copy (or NaN).
    location_code : str
        Index label in ``col_df`` selecting the organization's row.
    raw_df : pandas.DataFrame
        Raw data for that organization.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``col_df``. Fields whose mapping is missing,
        or whose mapping lists several comma-separated raw columns, are left
        unfilled.
    """
    mapped = pd.DataFrame(columns=col_df.columns)
    for target in col_df.columns:
        source = col_df[target][location_code]
        if pd.isna(source):
            continue
        source = source.strip()
        # Mappings that combine more than one raw column are skipped for now.
        if "," not in source:
            mapped[target] = raw_df[source].copy()
    return mapped

## Federal

In [179]:
# Load the federal export. The relative path assumes the notebook is run from
# its own directory.
fed = pd.read_csv("../sample_data/Federal.csv")

In [180]:
# Timezone-abbreviation table taken from https://stackoverflow.com/a/4766400
# Each line is "<UTC offset in hours> <abbreviation> <abbreviation> ...".

tz_str = '''-12 Y
-11 X NUT SST
-10 W CKT HAST HST TAHT TKT
-9 V AKST GAMT GIT HADT HNY
-8 U AKDT CIST HAY HNP PST PT
-7 T HAP HNR MST PDT
-6 S CST EAST GALT HAR HNC MDT
-5 R CDT COT EASST ECT EST ET HAC HNE PET
-4 Q AST BOT CLT COST EDT FKT GYT HAE HNA PYT
-3 P ADT ART BRT CLST FKST GFT HAA PMST PYST SRT UYT WGT
-2 O BRST FNT PMDT UYST WGST
-1 N AZOT CVT EGT
0 Z EGST GMT UTC WET WT
1 A CET DFT WAT WEDT WEST
2 B CAT CEDT CEST EET SAST WAST
3 C EAT EEDT EEST IDT MSK
4 D AMT AZT GET GST KUYT MSD MUT RET SAMT SCT
5 E AMST AQTT AZST HMT MAWT MVT PKT TFT TJT TMT UZT YEKT
6 F ALMT BIOT BTT IOT KGT NOVT OMST YEKST
7 G CXT DAVT HOVT ICT KRAT NOVST OMSST THA WIB
8 H ACT AWST BDT BNT CAST HKT IRKT KRAST MYT PHT SGT ULAT WITA WST
9 I AWDT IRKST JST KST PWT TLT WDT WIT YAKT
10 K AEST ChST PGT VLAT YAKST YAPT
11 L AEDT LHDT MAGT NCT PONT SBT VLAST VUT
12 M ANAST ANAT FJT GILT MAGST MHT NZST PETST PETT TVT WFT
13 FJST NZDT
11.5 NFT
10.5 ACDT LHST
9.5 ACST
6.5 CCT MMT
5.75 NPT
5.5 SLT
4.5 AFT IRDT
3.5 IRST
-2.5 HAT NDT
-3.5 HNT NST NT
-4.5 HLV VET
-9.5 MART MIT'''

# abbreviation -> UTC offset in whole seconds, in the form accepted by
# dateutil's `tzinfos` argument. A later line overrides an earlier one if an
# abbreviation is listed twice.
tzd = {}
for line in tz_str.split('\n'):
    hours, *codes = line.split()
    tzd.update(dict.fromkeys(codes, int(float(hours) * 3600)))

In [181]:
def parse_fed_time(x):
    """Parse a federal closing-date string with dateutil.

    The first two space-separated tokens are kept unchanged and the last token
    has its first and last characters removed (presumably the parentheses
    around a timezone abbreviation such as "(EDT)" — TODO confirm against the
    data). Timezone abbreviations are resolved through the module-level
    ``tzd`` table.
    """
    tokens = x.split(" ")
    cleaned = " ".join((tokens[0], tokens[1], tokens[-1][1:-1]))
    return dp.parse(cleaned, tzinfos=tzd)

In [182]:
# Keep only the rows whose "language" column equals "English"; copy so later
# edits do not touch a view of the full frame.
fed = fed[fed["language"] == "English"].copy()

In [183]:
# Project the federal rows onto the common schema using the "federal" row of
# the mapping table.
sample_data_fed = get_sample_data(col_df, "federal", fed)

In [185]:
# Parse the federal closing-date strings, then store the str() form of each
# parsed value.
sample_data_fed["closing_date"] = (
    sample_data_fed["closing_date"].map(parse_fed_time).map(str)
)

In [186]:
# province_territory_of_work has to be parsed into a list of 2-letter codes.
# ref: https://www12.statcan.gc.ca/census-recensement/2011/ref/dict/table-tableau/table-tableau-8-eng.cfm
prov_terr_dict = {
    "Newfoundland and Labrador": "NL",
    "Prince Edward Island": "PE",
    "Nova Scotia": "NS",
    "New Brunswick": "NB",
    "Quebec": "QC",
    "Ontario": "ON",
    "Manitoba": "MB",
    "Saskatchewan": "SK",
    "Alberta": "AB",
    "British Columbia": "BC",
    # The referenced table lists Yukon as "YT" (not "YK").
    "Yukon": "YT",
    "Northwest Territories": "NT",
    "Nunavut": "NU",
    "National Capital Region": "ON, QC",
    "Canada": "NL, PE, NS, NB, QC, ON, MB, SK, AB, BC, YT, NT, NU",
    # will have to think about the locations below and how they will be represented in filtering
    "Aboriginal Lands": "",
    "Foreign": "",
    "United States": "",
    "World": "",
    "Europe": "",
    "Mexico": "",
    "Unspecified": ""
}

def replace_with_acronyms(x):
    """Turn a comma-separated list of place names into a list of codes.

    Every name found in ``prov_terr_dict`` is replaced by its code(s); names
    mapped to "" simply disappear. Text that matches no key is kept as-is.

    Parameters
    ----------
    x : str
        Raw value, e.g. "Ontario, Quebec". Non-string values (such as NaN for
        a missing cell) yield an empty list.

    Returns
    -------
    list of str
        Codes in order of first appearance, without duplicates and without
        empty entries.
    """
    if not isinstance(x, str):
        return []
    for name, code in prov_terr_dict.items():
        x = x.replace(name, code)
    codes = []
    for part in x.split(","):
        # Strip before testing: a removed name leaves " " behind, which must
        # not end up in the result as "".
        part = part.strip()
        # Overlapping names (e.g. "National Capital Region, Ontario") would
        # otherwise repeat a code.
        if part and part not in codes:
            codes.append(part)
    return codes


In [187]:
# Convert each raw place-name string into a list of province/territory codes.
sample_data_fed["province_territory_of_work"] = sample_data_fed["province_territory_of_work"].map(replace_with_acronyms)

In [188]:
def is_capital(letter):
    """Return True when ``letter`` differs from its lower-cased form."""
    return letter.lower() != letter

def get_gsins(field):
    """Extract the GSIN codes from a comma-separated "CODE - description" field.

    An entry is accepted only when it contains " - ", the text before the
    first " - " is at least three characters long, its 2nd character is a
    capital letter or a digit, and its 3rd character is a digit.

    Parameters
    ----------
    field : str
        Raw field value. Non-string values (such as NaN for a missing cell)
        yield an empty list.

    Returns
    -------
    list of str
        Accepted codes, in their original order.
    """
    if not isinstance(field, str):
        return []
    gsins = []
    for entry in field.split(","):
        parts = entry.strip().split(" - ")
        if len(parts) < 2:
            # No "CODE - description" separator; likely the tail of a
            # description that itself contained a comma.
            continue
        maybe_code = parts[0]
        if len(maybe_code) < 3:
            continue
        # 2nd char of code must be a capital letter or number, 3rd char a digit
        second, third = maybe_code[1], maybe_code[2]
        if (is_capital(second) or second.isdigit()) and third.isdigit():
            gsins.append(maybe_code)
    return gsins

In [189]:
# Replace each raw GSIN field with the list of codes extracted by get_gsins().
sample_data_fed["GSIN_code"] = sample_data_fed["GSIN_code"].map(get_gsins)

In [190]:
# Build the full notice URL: base address + raw value with every "$" removed.
# NOTE(review): why the raw values contain "$" is not visible here — confirm
# against the federal export.
fed_url = "https://buyandsell.gc.ca/procurement-data/tender-notice/"
sample_data_fed["URL"] = sample_data_fed["URL"].map(lambda x: fed_url + x.replace("$", "") )

## BC

In [191]:
# Load the British Columbia export (path relative to the notebook directory).
bc = pd.read_csv("../sample_data/British_Columbia.csv")

In [192]:
# Project the BC rows onto the common schema using the "bc" row of the
# mapping table.
sample_data_bc = get_sample_data(col_df, "bc", bc)

In [194]:
# Wrap the single value in a list; the federal section also stores this field
# as a list.
# NOTE(review): assumes the BC export already holds a 2-letter code — confirm.
sample_data_bc["province_territory_of_work"] = sample_data_bc["province_territory_of_work"].map(lambda x: [x])

In [195]:
# Keep only the third line (index 2) of the multi-line raw field.
# NOTE(review): presumably that line holds the buyer's name — confirm. Values
# with fewer than three lines raise IndexError; NaN raises AttributeError.
sample_data_bc["buyer_name"] = sample_data_bc["buyer_name"].map(lambda x: x.split("\n")[2])

In [196]:
# published_date: parse with pandas and keep only the calendar date as text.
sample_data_bc["published_date"] = sample_data_bc["published_date"].map(lambda x: str(pd.to_datetime(str(x)).date()))
# Build "<CLOSING_DATE> <time> <tz>" on the raw frame (this adds a column to
# `bc`). <time> is the first space-separated token of CLOSING_TIME; <tz> is the
# second token translated by the dict below. A token missing from the dict
# maps to NaN, which makes the whole concatenated value NaN.
# NOTE(review): the dict always yields standard-time abbreviations (PST/MST);
# if the source times are local wall-clock times, closings during daylight
# saving time would be off by one hour — confirm.
bc["CLOSING_DATETIME"] = bc["CLOSING_DATE"].map(str) + " " + bc["CLOSING_TIME"].map(lambda x: x.split(" ")[0]) + " " + bc["CLOSING_TIME"].map(lambda x: x.split(" ")[1]).map({"Local": "PST", "Pacific": "PST", "Mountain": "MST"})
# Parse with the tzd abbreviation table and store the str() form; values are
# aligned to sample_data_bc by index.
sample_data_bc["closing_date"] = bc["CLOSING_DATETIME"].map(lambda x: str(dp.parse(x, tzinfos=tzd)))

In [197]:
# Split the whitespace-separated GSIN codes into a list. Missing values become
# an empty list (str(NaN) would otherwise produce the bogus code "nan"), and
# split() without an argument drops empty tokens caused by repeated spaces.
sample_data_bc["GSIN_code"] = sample_data_bc["GSIN_code"].map(lambda x: str(x).split() if pd.notna(x) else [])

## Nunavut

In [198]:
# Load the Nunavut export.
# NOTE(review): the file name is spelled "Nunuvat" — it has to match the file
# on disk, so rename both together if the spelling is corrected.
nu = pd.read_csv("../sample_data/Nunuvat.csv")

In [199]:
# Project the Nunavut rows onto the common schema using the "nu" row of the
# mapping table.
sample_data_nu = get_sample_data(col_df, "nu", nu)

In [201]:
def get_nu_location(_location):
    """Reduce a Nunavut location string to a single code.

    Parameters
    ----------
    _location : str
        Either a bare name found in the lookup table (currently only
        "Nunavut") or a comma-separated value whose last part is used as the
        code.

    Returns
    -------
    str
        The looked-up code for a single-part value, otherwise the last
        comma-separated part. Surrounding whitespace is ignored in both cases.

    Raises
    ------
    KeyError
        If a single-part value is not in the lookup table.
    """
    location_dict = {"Nunavut": "NU"}
    # Strip every part so " Nunavut" is looked up the same way as "Nunavut".
    parts = [part.strip() for part in _location.split(",")]
    if len(parts) == 1:
        return location_dict[parts[0]]

    return parts[-1]

In [202]:
# Reduce each location to a single code via get_nu_location().
# NOTE(review): this stores a plain string, whereas the federal and BC sections
# store this field as a list — confirm which shape downstream code expects.
sample_data_nu["province_territory_of_work"] = sample_data_nu["province_territory_of_work"].map(get_nu_location)

In [203]:
# Keep two raw closing-date values (index labels 0 and 2) for inspection.
# NOTE(review): abc1/abc2 are not used anywhere else in the visible notebook —
# looks like leftover scratch; confirm before removing.
abc1 = sample_data_nu.closing_date[0]
abc2 = sample_data_nu.closing_date[2]

In [204]:
# Values that do not contain "EDT" get " 16:00 EDT" appended; every value is
# then parsed with the tzd abbreviation table and stored as str(datetime).
# NOTE(review): presumably entries without a time close at 16:00 Eastern —
# confirm. A NaN value raises TypeError on the `in` test.
sample_data_nu["closing_date"] = sample_data_nu["closing_date"].map(lambda x: x if "EDT" in x else x + " 16:00 EDT").map(lambda x: str(dp.parse(x, tzinfos=tzd)))

## Alberta

In [162]:
# Load the Alberta export (path relative to the notebook directory).
ab = pd.read_csv("../sample_data/Alberta.csv")

In [213]:
# Project the Alberta rows onto the common schema using the "ab" row of the
# mapping table.
sample_data_ab = get_sample_data(col_df, "ab", ab)

In [223]:
# Entries are separated by two spaces; the code is the text before the first
# single space of each entry. `x == x` is False only for NaN, which becomes [].
sample_data_ab["GSIN_code"] = sample_data_ab["GSIN_code"].map(
    lambda x: [entry.split(" ")[0] for entry in x.split("  ")] if x == x else []
)

In [230]:
# ShortDescription (followed by a newline) + LongDescription, taken from the
# raw frame. `x == x` is False only for NaN, so a missing part contributes "".
sample_data_ab["tender_description"] = ab["ShortDescription"].map(lambda x: x + "\n" if x==x else "") + ab["LongDescription"].map(lambda x: x if x==x else "")

In [238]:
# "<first> <last>\n<phone>" assembled directly from the raw Alberta columns.
# NOTE(review): a NaN in any of the three columns makes the whole value NaN,
# and a non-string ContactPhone column would raise on concatenation — confirm
# against the data.
sample_data_ab["buyer_contact"] =  ab["ContactFirstName"] + " " + ab["ContactLastName"] + "\n" + ab["ContactPhone"]

## Assemble sample data

In [244]:
# Empty frame carrying the target schema (col_df.keys() is its column index).
sample_data = pd.DataFrame(columns=col_df.keys())

In [245]:
# Append the per-source frames one after another; sort=False keeps the column
# order instead of sorting it alphabetically.
# NOTE(review): sample_data_nu is prepared in the Nunavut section but is not
# appended here — confirm whether that omission is intentional.
sample_data = pd.concat([sample_data, sample_data_fed], sort=False)
sample_data = pd.concat([sample_data, sample_data_bc], sort=False)
sample_data = pd.concat([sample_data, sample_data_ab], sort=False)

In [246]:
# concat kept each source's own index labels, so they repeat; replace them
# with 0..n-1.
sample_data.index = range(len(sample_data))

In [247]:
# Sequential id per record (0..n-1).
sample_data["id"] = range(len(sample_data))

## Save sample data

In [248]:
# Write the assembled records as a JSON array with one object per row.
sample_data.to_json("../sample_data/sample_data.json", orient="records")

In [249]:
# Same export, written into the data directory of the prototype project.
# NOTE(review): the relative path assumes that project is checked out two
# directory levels above the notebook — confirm before running elsewhere.
sample_data.to_json("../../single-point-of-access-prototype/data/sample_data.json", orient="records")