Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# add our stuff to the path
# The parent directory of the first sys.path entry is inserted at position 1
# so that the `src` package imported below can be resolved (presumably `src`
# lives one level above this notebook - confirm against the repo layout).
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# other stuff
import time
import warnings
# Silence FutureWarning messages so they do not clutter the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# import our stuff
# `reload` is available for re-importing the `src` modules after editing them.
from importlib import reload
from src import connect, preprocess, synthetic, model

In [None]:
# PARAMETERS
# Every setting used by the cells below is defined here.

# --- access -----------------------------------------------------------------
# file holding the database credentials (IDIR restricted)
CRED_PATH = '../credentials.txt'

# location of the stored model; the full path is resolved with the help of
# credentials.txt
MODEL_BASE_PATH = 'Model/Q22'

# --- database tables --------------------------------------------------------
# table the responses are read from, and table the results are written to
RESPONSE_TABLE = 'dbo.AQ22ANCES'
RESULTS_TABLE = 'dbo.AQ22ANCES_RESULTS'

# --- columns ----------------------------------------------------------------
# name of the cleaned response column, the output column name, and the number
# of output columns
RESPONSE_COLUMN = 'aq22_cleaned'
OUTPUT_COLUMNS = 'q22ances_c'
N_COLUMNS = 5

# --- result formatting ------------------------------------------------------
# separator placed between values when results are concatenated
DELIMITER = 'μ'

# --- probability cut-offs ---------------------------------------------------
# probability threshold for accepting a category as flagged
THRESHOLD = 0.5

# bounds of the range in which a category is flagged as only a possible match
TENTATIVE_LOWER, TENTATIVE_UPPER = 0.25, 0.75

In [None]:
# Load the two model objects, `clf` and `code_df_long`, that the cells below
# pass to the `model` and `preprocess` helpers.
clf, code_df_long = model.load_model(CRED_PATH, MODEL_BASE_PATH)

In [None]:
from countryinfo import CountryInfo

In [None]:
# Exploratory lookup: fetch the info record for one country and show its
# alternative spellings.
country = 'Ukraine'
data= CountryInfo(country).info()
# Last expression of the cell, so the notebook displays it; an empty list is
# shown when the record has no 'altSpellings' key.
data.get('altSpellings', [])

In [None]:
# Manual spot check: run the loaded model on one hand-written response and
# display what `model.list_classes` returns for it.
# NOTE(review): `min_pct = 0` presumably lists every class regardless of its
# score - confirm against `model.list_classes`.
sentence = 'Scotland, Britain, Ireland and Canada'
model.list_classes(sentence, code_df_long, clf, truncate_inputs=True, min_pct = 0)

## Full Model Pipeline

1. Read in data from database (IDIR restricted)
2. Load in model (from LAN)
3. Preprocess data (code stored on GitHub)
    
    * Limit to cycle of interest
    * Lower-case the responses and correct spelling where possible
    * Turn responses into word scores (how close is each response to every word in the vocabulary code base). <br>
    <br>

4. Create predictions based on word scores
5. Re-incorporate multiple-choice responses
6. Add flag(s) for unusual model outputs
    
    * Was no category predicted
    * Are there model outputs in an 'iffy' probability range <br>
    <br>

7. Send results back to database (IDIR restricted)

In [None]:
# FULL MODEL PIPELINE
#
# Runs every step for a single survey cycle: read the responses, load the
# model, clean the free-text column, build the model inputs, run the model and
# append the results to RESULTS_TABLE.
# Requires the imports cell and the PARAMETERS cell to have been run first.

# survey cycle to process; only rows with this `cycle` value are kept
cycle = 2

######################
#                    #
#    READ IN DATA    #
#                    #
######################
print(f'Reading in data for cycle {cycle}... ', end = '')

connection = connect.create_connection(CRED_PATH)
df_open = connect.fetch_table(RESPONSE_TABLE, connection)

print('Done.')

######################
#                    #
#    LOAD MODEL      #
#                    #
######################
print('Loading model from file... ', end = '')

clf, code_df_long = model.load_model(CRED_PATH, MODEL_BASE_PATH)
print('Done.')

######################
#                    #
#  PREPROCESS DATA   #
#                    #
######################
print('Preprocessing data... ', end = '')

# Reshaping the dataframe
df_reshaped = preprocess.reshape_df(df_open)

# get cycle
df = df_reshaped[df_reshaped.cycle == cycle].reset_index(drop=True)

# clean column
# Build RESPONSE_COLUMN from the raw `aq22ances` text: lower-cased, with
# leading/trailing spaces removed. Missing responses are kept as missing here
# so that the spelling correction below skips them.
# Vectorised string methods are used instead of a row-wise `apply` so that no
# numpy import is needed for the missing-value marker and so that a cycle
# with zero rows still yields a (empty) column rather than an empty frame.
has_response = df['aq22ances'].notna()
df[RESPONSE_COLUMN] = (
    df['aq22ances'].astype(str).str.lower().str.strip(' ').where(has_response)
)

# Now only apply correct_spelling on non-null values
mask = df[RESPONSE_COLUMN].notnull()
df.loc[mask, RESPONSE_COLUMN] = df.loc[mask, RESPONSE_COLUMN].apply(preprocess.correct_spelling)
# The steps below receive the column as strings.
# NOTE(review): this cast turns missing responses into the literal text 'nan';
# confirm that the scoring helpers are meant to see that value.
df[RESPONSE_COLUMN] = df[RESPONSE_COLUMN].astype(str)
print('Done.')

print('Creating model inputs... ', end = '')
# inputs to model
# Score a dummy response ('test') once just to obtain the `col_id` values,
# which are then passed as the headers for the real score table.
headers = list(preprocess.get_scores('test', code_df_long, as_df = True).col_id.values)
input_df = preprocess.get_scores_from_df(df, RESPONSE_COLUMN, code_df_long, headers=headers)
input_df = preprocess.convert_input(input_df)
print('Done.')

# outputs of model (for selected responses to be included)
print('Creating selected response outputs... ', end = '')
output_df = preprocess.get_outputs_wide(df, RESPONSE_COLUMN, code_df_long, OUTPUT_COLUMNS, N_COLUMNS)
print('Done.')

######################
#                    #
#     RUN MODEL      #
#                    #
######################
print('Running model and extracting results... ')
results_df = model.produce_results(
    df, input_df, output_df,
    clf,
    OUTPUT_COLUMNS,
    N_COLUMNS,
    question = 'Q22',
    threshold=THRESHOLD,
    tentative_lower = TENTATIVE_LOWER,
    tentative_upper = TENTATIVE_UPPER,
    delimiter = DELIMITER
)
print('\nDone.')

######################
#                    #
#    SAVE RESULTS    #
#                    #
######################
print(f'Sending results to table {RESULTS_TABLE}... ', end = '')
# save back to database
# A second connection is created with sqlalchemy=True for the write.
engine = connect.create_connection(CRED_PATH, sqlalchemy=True)
connect.save_table(results_df, RESULTS_TABLE, engine, how='append') # be careful when appending that you aren't doubling data
print('Done')