Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# add our stuff to the path
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# other stuff
from autocorrect import Speller
import pandas as pd
import re
import time
import warnings
warnings.simplefilter(action='ignore')

# import our stuff
from importlib import reload
from src import connect, preprocess, synthetic, model, matching

In [None]:
# PARAMETERS
# Notebook-wide settings; the cells below read these names directly.

# path to the credentials file (database access is IDIR restricted);
# passed to connect.create_connection and model.load_model
CRED_PATH = '../credentials.txt'

# where model is stored. requires credentials.txt for full path
MODEL_BASE_PATH = 'Model/Q22'

# which tables to access
# RESPONSE_TABLE is fetched as the raw responses, MASTER_RESULTS_TABLE as the
# final results used for comparison.
# NOTE(review): RESULTS_TABLE is not referenced by the cells visible here -
# confirm whether it is still needed.
RESPONSE_TABLE = 'dbo.AQ22ANCES'
RESULTS_TABLE = 'AQ22ANCES_RESULTS'
MASTER_RESULTS_TABLE = 'dbo.AQ22ANCES_RESULTS_Done'

# which column to use/create
# RESPONSE_COLUMN is created by the preprocessing step (spell-checked text).
# OUTPUT_COLUMNS is the prefix of the category columns and N_COLUMNS how many
# there are (the comparison cell reads q22ances_c01 ... q22ances_c05).
RESPONSE_COLUMN = 'aq22_cleaned'
OUTPUT_COLUMNS = 'q22ances_c'
N_COLUMNS = 5

# delimiter to send back with concatenated results
DELIMITER = 'μ'

# threshold for accepting as a flagged category
THRESHOLD = 0.5

# upper and lower thresholds for flagging as a possible category
TENTATIVE_UPPER = 0.75
TENTATIVE_LOWER = 0.25

## Full Pipeline

1. Read in data from database (IDIR restricted)
2. Load in model (from LAN)
3. Preprocess data (code stored on GitHub)
4. Create predictions based on word scores
5. Re-incorporate multiple-choice responses
6. Pull the final values from the database, i.e. the values that were actually used after all modeling/QA/manual work was done

   * Note that this whole workflow is required because of the way the data is stored in the database: the model results are no longer available, only the final results (after modeling and manual coding) for this question.

7. Create metrics to compare:

    * how much the model added
    * how much it differed from the final outputs that were used 

In [None]:
# FULL MODEL PIPELINE

# ------------------------------------------------------------------
# READ IN DATA
# ------------------------------------------------------------------
# Open a database connection using the credentials file and fetch the
# table of responses.
print('Reading in data... ', end='')

connection = connect.create_connection(CRED_PATH)
df_open = connect.fetch_table(RESPONSE_TABLE, connection)

# Responses that have been done already are not fetched at this point;
# the call is kept here for reference:
#   df_done = connect.fetch_table(MASTER_RESULTS_TABLE, connection)

print('Done.')

# ------------------------------------------------------------------
# LOAD MODEL
# ------------------------------------------------------------------
# model.load_model returns two objects: the classifier handed to
# model.produce_results below, and a code table whose `description`
# column is used during preprocessing.
print('Loading model from file... ', end='')

clf, code_df_long = model.load_model(CRED_PATH, MODEL_BASE_PATH)

print('Done.')

######################
#                    #
#  PREPROCESS DATA   #
#                    #
######################
print('Preprocessing data... ', end = '')

# get new ids
# NOTE: filtering out already-completed ids is currently disabled, so every
# fetched response is processed. The original filter is kept for reference:
#completed_ids = df_done.id.unique()
#df_filtered = df_open[~df_open.id.isin(completed_ids)].reset_index(drop=True)
df_filtered = df_open.copy()

# reshape dataframe
df = preprocess.reshape_df(df_filtered)

# clean column
# the code descriptions supply vocabulary the spell checker must accept
code_list = code_df_long.description.values

# create a spell checker
spell = Speller()

# get a list of all words that appear directly in the code descriptions,
# splitting on the word "and", on punctuation and on whitespace
token_pattern = re.compile(r'\sand\s|[,;()/\r\n\s]+')
all_words = []
for description in code_list:
    all_words.extend(token for token in token_pattern.split(description) if token)

# additional valid words that should not be "corrected" by the spell checker
words = [
    'Inuit',
    'Wsanec',
    'Tongo',
    'Levant',
    'Berber',
    'Guinea-Bissau',
    'Guinea',
    'Bissau',
    'Goan',
    'Dessie',
    'Chilean',
    'Burundi',
    'Burmese',
    'Hongkonger',
    'Konger'
]

# Register every word with the spell checker as written, upper-cased and
# lower-cased. Only variants that are missing from the dictionary are added:
# a variant the dictionary already knows keeps its existing frequency instead
# of being overwritten with the placeholder value.
for word in all_words + words:
    for variant in (word, word.upper(), word.lower()):
        if variant not in spell.nlp_data:
            spell.nlp_data[variant] = 100


# spellcheck responses; the cleaned text is stored as strings in RESPONSE_COLUMN
df[RESPONSE_COLUMN] = df.aq22ances.apply(lambda x: preprocess.correct_spelling(x, spell=spell))
df[RESPONSE_COLUMN] = df[RESPONSE_COLUMN].astype(str)

# translate, only those that start with & (currently disabled)
#df.loc[df.aq22ances.str[0] == '&', RESPONSE_COLUMN] = df[df.aq22ances.str[0] == '&'][RESPONSE_COLUMN].apply(lambda x: matching.get_translation(x, skip=False)[0])
print('Done.')

# Build the model inputs from the cleaned response column.
print('Creating model inputs... ', end='')
# the column ids produced for a dummy response define the input headers
headers = list(preprocess.get_scores('test', code_df_long, as_df=True).col_id.values)
input_df = preprocess.get_scores_from_df(df, RESPONSE_COLUMN, code_df_long, headers=headers)
print('Done.')

# Converting everything at once ran into memory issues, so the inputs are
# converted in fixed-size row chunks and stitched back together afterwards.
chunk_size = 1000
chunk_starts = range(0, len(input_df), chunk_size)
n_chunks = len(chunk_starts)
processed_chunks_input = []
print_str = 'Converting chunks... '
print(print_str, end='\r')
for chunk_no, start in enumerate(chunk_starts, start=1):
    processed_chunks_input.append(
        preprocess.convert_input(input_df.iloc[start:start + chunk_size])
    )
    # progress indicator, overwritten in place via the carriage return
    print_str_2 = print_str + f'{chunk_no:02}/{n_chunks}'
    print(print_str_2, end='\r')
input_df = pd.concat(processed_chunks_input, ignore_index=True)
print(print_str_2 + ' Done.')

# outputs of model (for selected responses to be included)
print('Creating selected response outputs... ', end='')
output_df = preprocess.get_outputs_wide(
    df, RESPONSE_COLUMN, code_df_long, OUTPUT_COLUMNS, N_COLUMNS
)
print('Done.')

# ------------------------------------------------------------------
# RUN MODEL
# ------------------------------------------------------------------
print('Running model and extracting results... ')

# keyword options forwarded to model.produce_results
run_options = {
    'question': 'Q22',
    'threshold': THRESHOLD,
    'tentative_lower': TENTATIVE_LOWER,
    'tentative_upper': TENTATIVE_UPPER,
    'delimiter': DELIMITER,
}
results_df = model.produce_results(
    df, input_df, output_df,
    clf,
    OUTPUT_COLUMNS,
    N_COLUMNS,
    **run_options
)

print('\nDone.')

# ------------------------------------------------------------------
# GET FINAL RESULTS
# ------------------------------------------------------------------
# A fresh connection is opened before fetching the master results table.
print('Getting final results... ', end='')

connection = connect.create_connection(CRED_PATH)
final_df = connect.fetch_table(MASTER_RESULTS_TABLE, connection)

print('Done')

In [None]:
# some basic model metrics
# columns needed for the summary, cast to float before aggregating
metric_columns = [
    'id',
    'match',
    'original_matched',
    'extra_categories',
    'n_original_categories',
    'n_model_categories',
]
# aggregation applied to each summarised column
summary_spec = {
    'id': 'count',
    'match': 'sum',
    'original_matched': 'mean',
    'extra_categories': 'mean',
    'n_model_categories': 'mean',
    'added_categories': 'sum',
}

numeric_metrics = results_df[metric_columns].astype('float')
# 1 where the model contributed at least one extra category, else 0
numeric_metrics = numeric_metrics.assign(
    added_categories=(numeric_metrics['extra_categories'] > 0) * 1
)
numeric_metrics.agg(summary_spec)

In [None]:
# number of responses that have at least one tentative category recorded
no_tentative = pd.isnull(results_df.tentative_categories)
len(results_df[~no_tentative])

In [None]:
# compare the set of categories produced by the model to the set of final categories
def make_set(row, n_max=5, prefix='q22ances_c'):
    """Collect the non-missing category values of one record into a set.

    Args:
        row: mapping-like record (e.g. a DataFrame row or a dict) that holds
            the category columns ``<prefix>01`` ... ``<prefix>NN``.
        n_max: number of category columns to read.
        prefix: common prefix of the category column names.

    Returns:
        set: the values found in those columns, with missing values
        (None, NaN, NaT, pd.NA) left out.

    Raises:
        KeyError: if one of the expected columns is absent from ``row``.
    """
    full_set = set()
    for ii in range(1, n_max + 1):
        val = row[f'{prefix}{ii:02d}']
        # NaN must be excluded as well as None: two sets that each contain a
        # NaN do not compare equal, which would turn otherwise identical
        # category sets into spurious mismatches.
        if pd.notna(val):
            full_set.add(val)
    return full_set

# model side: set of predicted categories plus the original comment text
results_df['model_set'] = results_df.apply(make_set, axis=1)
results_df['model_comment'] = results_df['aq22ances']

# final side: set of categories actually used plus the original comment text
final_df['final_set'] = final_df.apply(make_set, axis=1)
final_df['final_comment'] = final_df['aq22ances']

In [None]:
# Line up final and model results per id; the left join keeps every final row.
final_view = final_df[['id', 'final_set', 'final_comment']]
model_view = results_df[['id', 'model_set', 'model_comment']]
test_df = final_view.merge(model_view, how='left', on='id')

# True where the model's category set is exactly the final category set
test_df['matches'] = test_df.apply(
    lambda row: row['final_set'] == row['model_set'],
    axis=1,
)

In [None]:
# headline numbers: rows compared, exact matches, and the match rate
n_total = test_df.shape[0]
n_matched = test_df.matches.sum()
print(f'Total:   {n_total:,}')
print(f'Matched: {n_matched:,}')
print(f'Percent: {n_matched/n_total:0.0%}')

In [None]:
# rows where the model's set differs from the final set, for manual inspection
mismatch_columns = ['final_comment', 'final_set', 'model_set']
test_df.loc[~test_df.matches, mismatch_columns]