Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# Third-party packages.
# NOTE(review): Speller is not referenced in the cells visible here.
import pandas as pd
from autocorrect import Speller

# add our stuff to the path
import sys
import os
import re
# Insert the parent of sys.path[0] at position 1 of the module search path so
# that the `src` import below can be resolved. This must run before that import.
# Presumably the notebook sits one level below the project root - TODO confirm.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# import our stuff
from importlib import reload
from src import matching, connect

import warnings
# Install an "ignore" filter for every warning category, so no warnings are
# shown by anything that runs after this line.
warnings.simplefilter(action='ignore')

In [None]:
# PARAMETERS

# Credentials file handed to connect.create_connection (database access is
# IDIR restricted).
CRED_PATH = "../credentials.txt"

# Database tables used by the pipeline.
RESPONSE_TABLE = "dbo.AQ34CULTURE"
CODE_TABLE = "dbo.AQ34CULTURE_Codes"
# Per the original author's note, this table is written back under the
# author's own IDIR.
RESULTS_TABLE = "dbo.AQ34CULTURE_RESULTS"
MASTER_RESULTS_TABLE = "dbo.AQ34CULTURE_RESULTS_DONE"

## Full Model Pipeline

1. Read in data from database (IDIR restricted)
2. Filter to only the new IDs
3. Clean html codes, remove trailing spaces
4. Run a basic spell check, adding the code words and the unusual words from our word list to the checker's vocabulary so that they are not mistakenly "corrected"
5. Translate a subset of the responses. Translation takes a long time, so only responses that start with & (and are therefore most likely in a different language) are translated.
6. Determine if the translated response is an exact or partial match to any of the codes in the code list
7. For partial responses, remove as a possibility if it is just a subset of an exact response
8. Tabulate all information for each response
9. Get list of most frequent un-coded words
10. Send results back to database (IDIR restricted)

In [None]:
# Full Model Pipeline

######################
#                    #
#    READ IN DATA    #
#                    #
######################

print('Reading in data... ', end='')

# Open one database connection and pull every table the pipeline needs,
# in the same order as before: responses, completed results, codes.
# NOTE(review): the connection is never closed in this cell - confirm whether
# `connect` expects the caller to release it.
connection = connect.create_connection(CRED_PATH)
df_open, df_done, df_codes = [
    connect.fetch_table(table_name, connection)
    for table_name in (
        RESPONSE_TABLE,        # actual responses
        MASTER_RESULTS_TABLE,  # responses that have been done already
        CODE_TABLE,            # codes to match
    )
]

print('Done.')

######################
#                    #
#   GET DATA READY   #
#                    #
######################

print('Filtering to new IDs...', end='')
# Every id already present in the completed-results table is skipped, so only
# responses that have not been processed yet remain.
completed_ids = df_done['id'].unique()
df = df_open.loc[~df_open['id'].isin(completed_ids)].reset_index(drop=True)

# Per the original author's note, melt_df reshapes to one row per response
# and drops the empty rows.
df = matching.melt_df(df)
print('Done.')


print('Getting code list...', end='')
# Codes are handled as strings from here on.
df_codes['q_code'] = df_codes['q_code'].astype(str)
# Expand the code table with all variants of each code, then keep just the
# descriptions as the list to match against.
df_codes_updated = matching.update_codes_culture(df_codes)
code_list = df_codes_updated['qc_desc'].values
print('Done.')

print('Creating spell checker...', end='')
# The speller is given every code word plus the extra valid words below, so
# that none of them is mistakenly "corrected".
words = [
    'Salish', 'Collectivist', 'Konger', 'Kong', 'Slavic',
    'Colonizer', 'Tagalog', 'Syilx', 'Panasian', 'Malayali',
    'Jatt', 'Metis', 'Romani', 'Slovak', 'Ilocano',
]
spell = matching.create_speller(code_list, words)
print('Done.')

######################
#                    #
#   TRANSFORM DATA   #
#                    #
######################

print('Transforming data...')

# Run the transformation over the new responses using the expanded code table
# and the custom speller.
clean_df = matching.transform_cultures(df, df_codes_updated, spell)

# Blank line, completion message, blank line - same output as three prints.
print('\nDone.\n')
matching.print_stats(clean_df)
print()

print('Getting un-used word frequencies...')
# Translated text of the responses that matched no code, neither exactly nor
# partially.
leftover = clean_df[~clean_df.exact_match & ~clean_df.partial_match].translated.values
# Word counts over those responses. The result is treated as a mapping of
# word -> count (the original read it through keys()/values()).
frequencies = matching.tokenize_and_count_word_frequencies(leftover)
# One row per (word, count) pair. Building the frame from the pairs with named
# columns keeps `frequency` numeric; stacking keys and values as two rows and
# transposing left both columns as object dtype.
df_freq = pd.DataFrame(list(frequencies.items()), columns=['word', 'frequency'])
# Most frequent un-coded words first.
df_freq = df_freq.sort_values(by='frequency', ascending=False)
# `display` is not imported in this file; presumably it is supplied by the
# notebook session - confirm before running this as a plain script.
display(df_freq.head(10))
print('Done.')


######################
#                    #
#    SAVE RESULTS    #
#                    #
######################

# Destination table for the un-coded word frequencies. Named once so the log
# message and the actual write cannot drift apart.
FREQUENCY_TABLE = 'dbo.AQ34CULTURE_FREQUENCIES'

print(f'Sending results to table {RESULTS_TABLE}... ')
# send the coded responses back to the warehouse
engine = connect.create_connection(CRED_PATH, sqlalchemy=True)
connect.save_table(clean_df, RESULTS_TABLE, engine, how='replace')

# send frequencies back for possible inclusion into codes
print(f'Sending results to table {FREQUENCY_TABLE}... ')
# NOTE(review): a second engine is created here exactly as before; the first
# one could presumably be reused - confirm against connect.save_table.
engine = connect.create_connection(CRED_PATH, sqlalchemy=True)
connect.save_table(df_freq, FREQUENCY_TABLE, engine, how='replace')
print('Done.')
