Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# third-party dependencies
import pandas as pd
from autocorrect import Speller

# standard library; sys and os are used just below to extend the import path
import sys
import os
import re
# insert the parent of sys.path[0] at position 1 so that `src` can be imported
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# project-local modules; `reload` is used in a later cell to re-import
# q34 and preprocess
from importlib import reload
from src import q34, connect, preprocess

import warnings
# NOTE: this silences every warning category for the rest of the session
warnings.simplefilter(action='ignore')

In [None]:
# PARAMETERS

# Path to the credentials file handed to connect.create_connection
# (database access is IDIR restricted).
CRED_PATH = '../credentials.txt'

# Database tables used by this notebook:
#   RESPONSE_TABLE - the actual open-text responses
#   CODE_TABLE     - the codes that responses are matched against
#   RESULTS_TABLE  - the table the results are saved to in the final cell
RESPONSE_TABLE = 'dbo.AQ34CULTURE'
CODE_TABLE = 'dbo.AQ34CULTURE_Codes'
RESULTS_TABLE = 'dbo.AQ34CULTURE_RESULTS'

# Which cycle to work on: only response rows whose `cycle` column equals
# this value are processed.
CYCLE = 1

In [None]:
# Re-import the project modules so that source changes made since the first
# import are picked up without restarting the kernel.
reload(q34)
reload(preprocess)

In [None]:
# Load every input table needed for the matching step.
connection = connect.create_connection(CRED_PATH)

# df_open  <- the actual responses (RESPONSE_TABLE)
# df_codes <- the codes to match against (CODE_TABLE)
# The tables are fetched in that order over the same connection.
df_open, df_codes = (
    connect.fetch_table(table_name, connection)
    for table_name in (RESPONSE_TABLE, CODE_TABLE)
)

In [None]:
# Preview the first rows of the raw response table.
df_open.head()

In [None]:
# Preview the first rows of the code table.
df_codes.head()

In [None]:
# Array of the code descriptions (the qc_desc column of the code table).
# It is used to seed the spell checker's vocabulary and is passed to
# q34.do_the_things for matching. The trailing bare expression displays it.
code_list = df_codes.qc_desc.values
code_list

In [None]:
# Restrict the responses to the cycle selected in the parameters cell, then
# hand them to q34.melt_df, which (per the original note) reshapes the frame
# to one row per response and drops all empty rows.
is_current_cycle = df_open.cycle == CYCLE
df = q34.melt_df(df_open[is_current_cycle])

In [None]:
# Preview the reshaped responses for the selected cycle.
df.head()

In [None]:
spell = Speller()

# Collect every individual word that appears in the code descriptions so the
# spell checker does not try to "correct" them.
all_words = []
for description in code_list:
    # Split on the word "and" (surrounded by whitespace) and on runs of
    # commas, semicolons, parentheses, slashes and whitespace.
    tokens = re.split(r'\sand\s|[,;()/\r\n\s]+', description)
    # re.split can yield empty strings at the edges; drop them.
    all_words.extend(token for token in tokens if token)

# Additional valid words that do not appear in the code descriptions.
words = [
    'Salish',
    'Collectivist',
    'Konger',
    'Kong',
    'Slavic',
    'Colonizer',
    'Tagalog',
    'Syilx',
    'Panasian',
    'Malayali',
    'Jatt',
    'Metis',
    'Romani',
    'Slovak',
    'Ilocano',
]

# Register each word in its original, upper-case and lower-case spelling.
# Only spellings missing from the speller's word-frequency table are added
# (with a frequency of 100); entries that already exist keep their frequency.
# Previously, a single missing variant caused all three spellings to be
# overwritten with 100, so the "already known" check had no effect.
for word in all_words + words:
    for variant in (word, word.upper(), word.lower()):
        if variant not in spell.nlp_data:
            spell.nlp_data[variant] = 100

In [None]:
# Column-oriented accumulator: one list per output column, one entry appended
# per processed response. It is turned into a DataFrame in the next cell.
clean_dict = {
    'response': [],
    'cleaned': [],
    'translated': [],
    'translation_code': [],
    'exact_match': [],
    'partial_match': [],
    'exact_match_codes': [],
    'partial_match_codes': [],
    'likely_match_codes': [],
}

n_rows = df.shape[0]

# `position` is a 1-based running count of processed rows. The DataFrame's own
# index labels are deliberately not used for the progress display: after
# filtering by cycle and reshaping they are not guaranteed to run 0..n_rows-1,
# which made the counter and the progress bar wrong (or wider than 100 columns).
for position, (_, row) in enumerate(df.iterrows(), start=1):

    x = row.response
    # q34.do_the_things returns nine values for a single response
    (response,
     cleaned, translated, response_code,
     has_exact, has_partial,
     exact_match_codes, partial_match_codes, likely_match_codes) = q34.do_the_things(x, spell, code_list)

    clean_dict['response'].append(response)
    clean_dict['cleaned'].append(cleaned)
    clean_dict['translated'].append(translated)
    clean_dict['translation_code'].append(response_code)
    clean_dict['exact_match'].append(has_exact)
    clean_dict['partial_match'].append(has_partial)
    clean_dict['exact_match_codes'].append(exact_match_codes)
    clean_dict['partial_match_codes'].append(partial_match_codes)
    clean_dict['likely_match_codes'].append(likely_match_codes)

    # single-line progress bar, redrawn in place via the carriage return
    pct_done = int(round(100 * position / n_rows))
    print_line = f'{position:07,}/{n_rows:07,}   |' + '-' * pct_done + '>' + ' ' * (100 - pct_done) + '|'
    print(print_line, end='\r')

In [None]:
# Build a DataFrame from the accumulated lists: one column per key of
# clean_dict, one row per processed response.
clean_df = pd.DataFrame(clean_dict)

In [None]:
# Display the cleaned/matched responses.
clean_df

In [None]:
# Summarise match coverage:
#   - exact matches, as a share of all responses
#   - partial matches among the responses that have no exact match
#   - whatever is left over, as a share of all responses
without_exact = clean_df[~clean_df.exact_match]

n_rows = len(clean_df)
n_rows_no_exact = len(without_exact)
exact = clean_df.exact_match.sum()
partial_no_exact = without_exact.partial_match.sum()

print(f'Exact Matches: {exact:,}/{n_rows:,} ({exact/n_rows:.0%})')
print(f'Partial Matches: {partial_no_exact:,}/{n_rows_no_exact:,} ({partial_no_exact/n_rows_no_exact:.0%})')
print(f'Leftover: {n_rows - exact - partial_no_exact:,}/{n_rows:,} ({(n_rows - exact - partial_no_exact)/n_rows:.0%})')

In [None]:
# Open a second connection, this time requesting the SQLAlchemy flavour, and
# save the results to RESULTS_TABLE with how='append'
# (presumably adds rows to the existing table - confirm in connect.save_table).
# NOTE(review): `results_df` is not defined in any cell shown above; the only
# result frame built is `clean_df`. As written this line raises NameError
# unless `results_df` is created elsewhere - confirm which frame (and which
# columns) should be written before running.
engine = connect.create_connection(CRED_PATH, sqlalchemy=True)
connect.save_table(results_df, RESULTS_TABLE, engine, how='append')