In [None]:
# standard library
import os
import sys
import time
from importlib import reload

# third-party
import pandas as pd

# add our stuff to the path (must run before the `src` import below)
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# import our stuff
from src import connect, preprocess, synthetic, model

In [None]:
# parameters -- every tunable value used by the cells below lives here

# database access
CRED_PATH = '../credentials.txt'            # passed to connect.create_connection
RESPONSE_TABLE = 'dbo.AQ32RACE'             # source of the open responses (df_open)
CODE_TABLE = 'dbo.AQ32RACE_Codes'           # source of the codes to match (code_df)
CLOSED_TABLE = 'dbo.Q32RACEMultiResponse'   # source of the closed responses (df_closed)
RESULTS_TABLE = 'AQ32RACE_RESULTS'          # table the results are saved to

# name of the combined text column built on the training frame
RESPONSE_COLUMN = 'aq32race_combined'

# count handed to synthetic.create_multi_phrase_synthetic
N_SYNTHETIC_MIXED = 50_000

# settings forwarded to model.produce_results
DELIMITER = 'μ'
THRESHOLD = 0.5
TENTATIVE_LOWER = 0.25
TENTATIVE_UPPER = 0.75

# location used by model.save_model / model.load_model
MODEL_FILE_PATH = '../models/Q32'

In [None]:
# Read in all data required to build the model, using a single connection
connection = connect.create_connection(CRED_PATH)

# Tables are fetched in this order:
#   df_open   - actual responses
#   code_df   - codes to match
#   df_closed - closed responses (for multi response frequencies)
df_open, code_df, df_closed = (
    connect.fetch_table(table_name, connection)
    for table_name in (RESPONSE_TABLE, CODE_TABLE, CLOSED_TABLE)
)

In [None]:
# training only uses the open responses recorded for cycle 1
is_first_cycle = df_open['cycle'] == 1
df = df_open.loc[is_first_cycle].reset_index(drop=True)
df.head()

In [None]:
# Build the text column sent to the model: use the cleaned comment when there is
# one, otherwise fall back to the raw comment. The literal '105' in the cleaned
# column is treated the same as a missing value.
# question : will the cleaned column always be available? or add in extra preprocessing here
cleaned_text = df['aq32race_cleaned']

# isna() catches NaN as well as None. The previous `== None` comparison let NaN
# through, which then failed on `.lower()` with an AttributeError.
use_raw_text = cleaned_text.isna() | cleaned_text.eq('105')

# Vectorised pick + lower-case; also behaves on an empty frame, where a row-wise
# apply(axis=1) would have returned a DataFrame instead of a Series.
# NOTE(review): rows where the chosen text is itself missing now stay missing here
# instead of raising -- confirm preprocess.correct_spelling copes with that.
df[RESPONSE_COLUMN] = df['aq32race'].where(use_raw_text, cleaned_text).str.lower()

# fix the spelling to send to the model
# note this takes a long time to do, so don't add it to the get_scores function
df[RESPONSE_COLUMN] = df[RESPONSE_COLUMN].apply(preprocess.correct_spelling)

df.head()

In [None]:
# get long form table of codes
# code_df_long is reused by the cells below: scoring, output encoding,
# synthetic data generation and saving the model
code_df_long = preprocess.get_long_form_codes(code_df)
code_df_long.head()

In [None]:
# Training dataset, part one: the actual data
# (synthetic rows are generated and appended in the cells that follow)

# INPUTS TO MODEL
# score a placeholder string and read its col_id values to use as headers
probe_scores = preprocess.get_scores('test', code_df_long, as_df=True)
headers = list(probe_scores.col_id.values)

input_df = preprocess.get_scores_from_df(
    df,
    RESPONSE_COLUMN,
    code_df_long,
    headers=headers,
)
display(input_df.head())

# OUTPUTS OF MODEL
# the coded columns in wide form: a 1/0 binary response for every option
output_df = preprocess.get_outputs_wide(df, RESPONSE_COLUMN, code_df_long)
output_df.head()

In [None]:
# Synthetic data, single category:
# rows that match exactly one category, based on the available phrases

# column lists are kept as names because the multi-category cell reuses them
input_columns, output_columns = list(input_df.columns), list(output_df.columns)

extra_input_df, extra_output_df = synthetic.create_single_phrase_synthetic(
    output_df, input_columns, output_columns, code_df_long
)

In [None]:
# create synthetic data
# this section will create synthetic data that matches multiple categories
# df_closed (the closed responses) and N_SYNTHETIC_MIXED are passed in alongside
# the same column lists and codes used for the single-category synthetic data.
# NOTE(review): no random seed is set anywhere in this notebook -- if the synthetic
# module samples randomly, results will differ between runs; confirm and seed if so.
mixed_input_df, mixed_output_df = synthetic.create_multi_phrase_synthetic(
    output_df,
    df_closed,
    input_columns,
    output_columns,
    code_df_long,
    N_SYNTHETIC_MIXED
)

In [None]:
# Concatenate the actual data with both synthetic sets (single- and multi-category).
# `pd` comes from the import cell at the top of the notebook; without that import
# this cell raises NameError on a fresh kernel.
# The 'response' column is dropped and everything that remains is cast to int.
final_input_df = (
    pd.concat([input_df, extra_input_df, mixed_input_df], ignore_index=True)
    .drop(columns='response')
    .astype(int)
)
final_output_df = (
    pd.concat([output_df, extra_output_df, mixed_output_df], ignore_index=True)
    .drop(columns='response')
    .astype(int)
)

In [None]:
# build the model from the combined (actual + synthetic) inputs and outputs
clf = model.create_model(final_input_df, final_output_df)

In [None]:
# save the model together with the long-form codes under MODEL_FILE_PATH
model.save_model(MODEL_FILE_PATH, clf, code_df_long)

In [None]:
# load the model and codes back from MODEL_FILE_PATH
# note: this rebinds both clf and code_df_long for every cell below
clf, code_df_long = model.load_model(MODEL_FILE_PATH)

In [None]:
# quick manual check: list the classes for a hand-written response, with spellcheck on
sentence = 'chinese, japanese, korean'
model.list_classes(sentence, code_df_long, clf, spellcheck=True)

In [None]:
# Produce results for the cycle 1 frame and its input/output frames,
# using the threshold, tentative bounds and delimiter from the parameters cell
results_df = model.produce_results(
    df,
    input_df,
    output_df,
    clf,
    threshold=THRESHOLD,
    tentative_lower=TENTATIVE_LOWER,
    tentative_upper=TENTATIVE_UPPER,
    delimiter=DELIMITER,
)

In [None]:
# save back to database
# a second connection is created with sqlalchemy=True and handed to save_table as the engine
engine = connect.create_connection(CRED_PATH, sqlalchemy=True)

# for initial save of cycle 1, always replace. any subsequent inputs should be appended
# NOTE(review): how='replace' presumably overwrites RESULTS_TABLE on every run of this
# cell -- confirm before re-running once later cycles have been appended.
connect.save_table(results_df, RESULTS_TABLE, engine, how='replace')