In [None]:
# add our stuff to the path
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# other stuff
import time

# import our stuff
from importlib import reload
from src import connect, preprocess, synthetic, model

In [None]:
# ---------------------------------------------------------------
# Run parameters, grouped by what they configure.
# ---------------------------------------------------------------

# Files on disk: credentials handed to connect.create_connection and the
# location handed to model.load_model.
CRED_PATH = '../credentials.txt'
MODEL_FILE_PATH = '../models/Q32'

# Database tables: responses are read from the first and results are
# appended to the second.
RESPONSE_TABLE = 'dbo.AQ32RACE'
RESULTS_TABLE = 'AQ32RACE_RESULTS'

# Name of the derived column that holds the text passed to preprocessing.
RESPONSE_COLUMN = 'aq32race_combined'

# Not referenced by the pipeline cell in this notebook; presumably used by
# the synthetic-data tooling -- confirm before removing.
N_SYNTHETIC_MIXED = 50_000

# Values forwarded unchanged to model.produce_results.
THRESHOLD = 0.5
TENTATIVE_LOWER = 0.25
TENTATIVE_UPPER = 0.75

# NOTE(review): this two-character value looks like a mis-decoded Greek
# letter mu (UTF-8 bytes read as Latin-1). Confirm which delimiter the
# consumers of RESULTS_TABLE expect before changing it.
DELIMITER = 'Î¼'

In [None]:
# FULL MODEL PIPELINE
#
# Reads RESPONSE_TABLE, keeps the rows whose `cycle` column equals `cycle`,
# builds the model inputs/outputs with `preprocess`, runs the model loaded
# from MODEL_FILE_PATH and appends the results to RESULTS_TABLE.
# Relies on the parameter constants and the `src` imports defined in the
# earlier cells.

cycle = 2


def _combined_response(row):
    """Return the lower-cased response text to use for one row.

    The value of ``aq32race_cleaned`` is used unless it is ``None`` or the
    string ``'105'``; in those two cases the raw ``aq32race`` value is used
    instead.
    """
    cleaned = row.aq32race_cleaned
    # Identity check for the None singleton (PEP 8) instead of `== None`.
    if cleaned is None or cleaned == '105':
        return row.aq32race.lower()
    return cleaned.lower()


######################
#                    #
#    READ IN DATA    #
#                    #
######################
# flush=True so the progress message is shown before the slow step starts,
# even when stdout is buffered (no newline is written until 'Done.').
print(f'Reading in data for cycle {cycle}... ', end='', flush=True)

connection = connect.create_connection(CRED_PATH)
df_open = connect.fetch_table(RESPONSE_TABLE, connection)

print('Done.')

######################
#                    #
#    LOAD MODEL      #
#                    #
######################
print('Loading model from file... ', end='', flush=True)

# load_model returns the classifier together with the code table that the
# preprocessing helpers below take as an argument.
clf, code_df_long = model.load_model(MODEL_FILE_PATH)
print('Done.')

######################
#                    #
#  PREPROCESS DATA   #
#                    #
######################
print('Preprocessing data... ', end='', flush=True)

# get cycle
df = df_open[df_open.cycle == cycle].reset_index(drop=True)

# clean column: pick cleaned vs. raw text row by row
df.loc[:, RESPONSE_COLUMN] = df.apply(_combined_response, axis=1)

# fix spelling
df.loc[:, RESPONSE_COLUMN] = df[RESPONSE_COLUMN].apply(preprocess.correct_spelling)
print('Done.')

print('Creating model inputs... ', end='', flush=True)
# inputs to model
# The column ids are taken from scoring the dummy string 'test';
# presumably this fixes the set/order of input columns -- confirm in
# preprocess.get_scores.
headers = list(preprocess.get_scores('test', code_df_long, as_df=True).col_id.values)
input_df = preprocess.get_scores_from_df(df, RESPONSE_COLUMN, code_df_long, headers=headers)
print('Done.')

# outputs of model (for selected responses to be included)
print('Creating selected response outputs... ', end='', flush=True)
output_df = preprocess.get_outputs_wide(df, RESPONSE_COLUMN, code_df_long)
print('Done.')

######################
#                    #
#     RUN MODEL      #
#                    #
######################
print('Running model and extracting results... ')
results_df = model.produce_results(
    df, input_df, output_df,
    clf,
    threshold=THRESHOLD,
    tentative_lower=TENTATIVE_LOWER,
    tentative_upper=TENTATIVE_UPPER,
    delimiter=DELIMITER,
)
print('\nDone.')

######################
#                    #
#    SAVE RESULTS    #
#                    #
######################
print(f'Sending results to table {RESULTS_TABLE}... ', end='', flush=True)
# save back to database
# A second connection is created with sqlalchemy=True for the write.
engine = connect.create_connection(CRED_PATH, sqlalchemy=True)
# be careful when appending that you aren't doubling data: re-running this
# cell for the same cycle appends the same results again.
connect.save_table(results_df, RESULTS_TABLE, engine, how='append')
print('Done')