In [104]:
import pandas as pd
from io import StringIO
import dateutil.parser as dp
import airtable
import json

In [105]:
# yk = pd.read_csv("../sample_data/Yukon.csv")

In [106]:
col_df = airtable.get_df()

In [107]:
col_df.set_index("organization", inplace=True)

In [108]:
del col_df["id"]

## Federal

In [109]:
fed = pd.read_csv("../sample_data/Federal.csv")

In [110]:
# from https://stackoverflow.com/a/4766400

tz_str = '''-12 Y
-11 X NUT SST
-10 W CKT HAST HST TAHT TKT
-9 V AKST GAMT GIT HADT HNY
-8 U AKDT CIST HAY HNP PST PT
-7 T HAP HNR MST PDT
-6 S CST EAST GALT HAR HNC MDT
-5 R CDT COT EASST ECT EST ET HAC HNE PET
-4 Q AST BOT CLT COST EDT FKT GYT HAE HNA PYT
-3 P ADT ART BRT CLST FKST GFT HAA PMST PYST SRT UYT WGT
-2 O BRST FNT PMDT UYST WGST
-1 N AZOT CVT EGT
0 Z EGST GMT UTC WET WT
1 A CET DFT WAT WEDT WEST
2 B CAT CEDT CEST EET SAST WAST
3 C EAT EEDT EEST IDT MSK
4 D AMT AZT GET GST KUYT MSD MUT RET SAMT SCT
5 E AMST AQTT AZST HMT MAWT MVT PKT TFT TJT TMT UZT YEKT
6 F ALMT BIOT BTT IOT KGT NOVT OMST YEKST
7 G CXT DAVT HOVT ICT KRAT NOVST OMSST THA WIB
8 H ACT AWST BDT BNT CAST HKT IRKT KRAST MYT PHT SGT ULAT WITA WST
9 I AWDT IRKST JST KST PWT TLT WDT WIT YAKT
10 K AEST ChST PGT VLAT YAKST YAPT
11 L AEDT LHDT MAGT NCT PONT SBT VLAST VUT
12 M ANAST ANAT FJT GILT MAGST MHT NZST PETST PETT TVT WFT
13 FJST NZDT
11.5 NFT
10.5 ACDT LHST
9.5 ACST
6.5 CCT MMT
5.75 NPT
5.5 SLT
4.5 AFT IRDT
3.5 IRST
-2.5 HAT NDT
-3.5 HNT NST NT
-4.5 HLV VET
-9.5 MART MIT'''

tzd = {}
for tz_descr in map(str.split, tz_str.split('\n')):
    tz_offset = int(float(tz_descr[0]) * 3600)
    for tz_code in tz_descr[1:]:
        tzd[tz_code] = tz_offset

In [111]:
def parse_fed_time(x):
    y = x.split(" ")
    clean_str = y[0] + " " + y[1] + " " + y[-1][1:-1]
    return dp.parse(clean_str, tzinfos=tzd)

In [112]:
sample_data_fed = pd.DataFrame(columns=col_df.columns)
for col in col_df.columns:
    fed_col = col_df[col]["federal"]
    if fed_col == fed_col:
        sample_data_fed[col] = fed.query("language == 'English'")[fed_col].copy()        

In [113]:
sample_data_fed["closing_date"] = sample_data_fed["closing_date"].map(parse_fed_time)
sample_data_fed["closing_date"] = sample_data_fed["closing_date"].map(lambda x: str(x))

In [114]:
# I need to parse province_territory_of_work into a list of 2 letter codes
# ref: https://www12.statcan.gc.ca/census-recensement/2011/ref/dict/table-tableau/table-tableau-8-eng.cfm
prov_terr_dict = {
    "Newfoundland and Labrador": "NL",
    "Prince Edward Island": "PE",
    "Nova Scotia": "NS",
    "New Brunswick": "NB",
    "Quebec": "QC",
    "Ontario": "ON",
    "Manitoba": "MB",
    "Saskatchewan": "SK",
    "Alberta": "AB",
    "British Columbia": "BC",
    "Yukon": "YK",
    "Northwest Territories": "NT",
    "Nunavut": "NU",
    "National Capital Region": "ON, QC",
    "Canada": "NL, PE, NS, NB, QC, ON, MB, SK, AB, BC, YK, NT, NU",
    # will have to think about the locations below and how they will be represented in filtering
    "Aboriginal Lands": "",
    "Foreign": "",
    "United States": "",
    "World": "",
    "Europe": "",
    "Mexico": "",
    "Unspecified": ""
}

def replace_with_acronyms(x):
    for k in prov_terr_dict:
        x = x.replace(k, prov_terr_dict[k])
    return [y.strip() for y in x.split(",") if y]


In [115]:
sample_data_fed["province_territory_of_work"] = sample_data_fed["province_territory_of_work"].map(replace_with_acronyms)

In [116]:
def is_capital(letter):
    return letter.lower() != letter

def get_gsins(field):
    gsins = []
    for x in field.split(","):
        parts = x.strip().split(" - ")
        if len(parts) < 2:
            continue
        maybe_code = parts[0]
        if len(maybe_code) < 3:
            continue
        # 2nd char of code must be a capital letter or number, 3rd char a digit
        char_1 = maybe_code[1]
        char_2 = maybe_code[2]
        if is_capital(char_1) or char_1.isdigit():
            gsins.append(parts[0])
    return gsins

In [68]:
sample_data_fed["GSIN_code"] = sample_data_fed["GSIN_code"].map(get_gsins)

In [118]:
fed_url = "https://buyandsell.gc.ca/procurement-data/tender-notice/"
sample_data_fed["URL"].map(lambda x: fed_url + x.replace("$", "") )

0       https://buyandsell.gc.ca/procurement-data/tend...
2       https://buyandsell.gc.ca/procurement-data/tend...
4       https://buyandsell.gc.ca/procurement-data/tend...
6       https://buyandsell.gc.ca/procurement-data/tend...
8       https://buyandsell.gc.ca/procurement-data/tend...
                              ...                        
1512    https://buyandsell.gc.ca/procurement-data/tend...
1514    https://buyandsell.gc.ca/procurement-data/tend...
1516    https://buyandsell.gc.ca/procurement-data/tend...
1518    https://buyandsell.gc.ca/procurement-data/tend...
1520    https://buyandsell.gc.ca/procurement-data/tend...
Name: URL, Length: 761, dtype: object

## BC

In [None]:
bc = pd.read_csv("../sample_data/British_Columbia.csv")

In [70]:
sample_data_bc = pd.DataFrame(columns=col_df.columns)
for col in col_df.columns:
    bc_col = col_df[col]["bc"]
    if pd.isna(bc_col):
        continue
    bc_col = bc_col.strip()
    if "," in bc_col:
        # we need to map more than 1 column, so skip for now
        continue
    sample_data_bc[col] = bc[bc_col].copy()        

In [71]:
sample_data_bc["province_territory_of_work"] = sample_data_bc["province_territory_of_work"].map(lambda x: [x])

In [72]:
sample_data_bc["buyer_name"] = sample_data_bc["buyer_name"].map(lambda x: x.split("\n")[2])

In [73]:
sample_data_bc["published_date"] = sample_data_bc["published_date"].map(lambda x: str(pd.to_datetime(str(x)).date()))
bc["CLOSING_DATETIME"] = bc["CLOSING_DATE"].map(str) + " " + bc["CLOSING_TIME"].map(lambda x: x.split(" ")[0]) + " " + bc["CLOSING_TIME"].map(lambda x: x.split(" ")[1]).map({"Local": "PST", "Pacific": "PST", "Mountain": "MST"})
sample_data_bc["closing_date"] = bc["CLOSING_DATETIME"].map(lambda x: str(dp.parse(x, tzinfos=tzd)))

In [74]:
sample_data_bc["GSIN_code"] = sample_data_bc["GSIN_code"].map(lambda x: str(x).split(" "))

## Nunavut

In [None]:
nu = pd.read_csv("../sample_data/Nunuvat.csv")

In [77]:
sample_data_nu = pd.DataFrame(columns=col_df.columns)
for col in col_df.columns:
    if pd.isna(col_df[col]["nu"]):
        continue
    nu_col = col_df[col]["nu"].strip()
    if "," in nu_col:
        # we need to map more than 1 column, so skip for now
        continue
    if nu_col == nu_col:
        sample_data_nu[col] = nu[nu_col].copy()        

In [78]:
def get_nu_location(_location):
    location_dict = {"Nunavut": "NU"}
    parts = _location.split(",")
    if len(parts) == 1:
        return location_dict[parts[0]]
    
    return parts[-1].strip()

In [79]:
sample_data_nu["province_territory_of_work"] = sample_data_nu["province_territory_of_work"].map(get_nu_location)

In [80]:
abc1 = sample_data_nu.closing_date[0]
abc2 = sample_data_nu.closing_date[2]

In [81]:
sample_data_nu["closing_date"] = sample_data_nu["closing_date"].map(lambda x: x if "EDT" in x else x + " 16:00 EDT").map(lambda x: str(dp.parse(x, tzinfos=tzd)))

## Alberta

In [None]:
ab = pd.read_csv("../sample_data/Alberta.csv")

## Assemble sample data

In [91]:
sample_data = pd.DataFrame(columns=col_df.keys())

In [92]:
sample_data = pd.concat([sample_data, sample_data_fed], sort=False)
sample_data = pd.concat([sample_data, sample_data_bc], sort=False)

In [93]:
sample_data.index = range(len(sample_data))

In [94]:
sample_data["id"] = range(len(sample_data))

## Save sample data

In [87]:
sample_data.to_json("../sample_data/sample_data.json", orient="records")

In [88]:
sample_data.to_json("../../single-point-of-access-prototype/data/sample_data.json", orient="records")

In [90]:
sample_data.head()

Unnamed: 0,GSIN_code,URL,buyer_contact,buyer_name,closing_date,location_of_work,procurement_rules,province_territory_of_work,published_date,selection_criteria,tender_description,title,trade_agreement,id
0,[JX1990A],,"DeBlois, Vincent, vincent.deblois@tpsgc-pwgsc....",Fisheries and Oceans Canada,2019-09-12 14:00:00-04:00,Quebec,All interested suppliers may submit a bid,[QC],2019-08-16,Lowest/Lower Bid,Trade Agreement: Canadian Free Trade Agreement...,CCGS Amundsen -Dry Dock and refit-Fall 2019 (F...,Canadian Free Trade Agreement (CFTA),0
1,"[E111A, JX1990H, N4235, N5430D]",,"Richards, Shazia, shazia.khan@tpsgc-pwgsc.gc.c...",Fisheries and Oceans Canada,2019-09-12 14:00:00-04:00,"British Columbia, Newfoundland and Labrador, N...",All interested suppliers may submit a bid,"[BC, NL, NS, ON, QC]",2019-06-21,Lowest/Lower Bid,Trade Agreement: NAFTA / CFTA / FTAs with Peru...,EREP: Self-Propelled Advancing Skimmer (F7047-...,"Canada-Panama Free Trade Agreement, Canada-Kor...",1
2,[T004KA],,"Westall, Susan, susan.westall@tpsgc-pwgsc.gc.c...",Public Works and Government Services Canada,2020-01-31 14:00:00-05:00,"Alberta, British Columbia, Manitoba, National ...",The bidder must supply Canadian goods and/or s...,"[AB, BC, MB, ON, QC, NB, NL, NT, NS, NU, ON, P...",2014-01-20,Subsequent/Follow-on Contracts,Trade Agreement: Canadian Free Trade Agreement...,Social Media Monitoring (EN578-141760/B),Canadian Free Trade Agreement (CFTA),2
3,[N7030],,"Niyonambaza (SMS div), Audace, audace.niyonamb...",Public Works and Government Services Canada,2023-10-02 14:00:00-04:00,National Capital Region,All interested suppliers may submit a bid,"[ON, QC]",2013-07-03,Subsequent/Follow-on Contracts,Trade Agreement: NONE Tendering Procedures: Al...,Software Licensing Supply Arrangement (EN578-1...,"None, None",3
4,"[T002AA, T002AB, T002AJ]",,"Kalp, Lynn, 613-938-5803, 866-246-6893, 111 Wa...",Parks Canada,2019-10-01 14:00:00-04:00,Canada,Open,"[NL, PE, NS, NB, QC, ON, MB, SK, AB, BC, YK, N...",2014-02-19,,"Request for Supply Arrangement, Parks Canada A...",RFSA -Exhibit Design - Parks Canada - National...,World Trade Organization-Agreement on Governme...,4
