Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# system stuff
import re
import os
import random

# connection stuff
import pyodbc

# standard stuff
import pandas as pd
import numpy as np

# nlp stuff
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ml stuff
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fix the global random seed through Keras.
# NOTE(review): `keras` is not imported in any cell visible here (later cells
# also use `tf` and `keras_nlp` without an import) -- confirm the imports
# exist elsewhere, otherwise this line raises NameError.
keras.utils.set_random_seed(42)

In [None]:
# Text file holding the connection string handed to pyodbc.
cred_path = '../credentials.txt'

# Concatenate every line of the file, newline characters removed, into a
# single connection string.
with open(cred_path) as infile:
    connection_str = ''.join(raw_line.strip('\n') for raw_line in infile)

In [None]:
# Open the database connection with the string read from the credentials file.
connection = pyodbc.connect(connection_str)

In [None]:
# read in data: the cycle-1 survey rows and the code lookup table
responses_query = 'SELECT * FROM dbo.AQ32RACE WHERE Cycle=1'
codes_query = 'SELECT * FROM dbo.AQ32RACE_Codes'

df = pd.read_sql(responses_query, connection)
code_df = pd.read_sql(codes_query, connection)

In [None]:
# clean data headers
def clean_headers(df):
    """Normalise the column names of `df` in place.

    Every column name is lower-cased and its spaces are replaced with
    underscores. The frame is modified directly; nothing is returned.
    """
    normalised = []
    for header in df.columns:
        normalised.append(header.lower().replace(' ', '_'))
    df.columns = normalised



In [None]:
# Normalise the column names of both frames in place.
clean_headers(df)
clean_headers(code_df)

In [None]:
# get a cleaned up column to use (mix of actual comment column and cleaned)
def _combined_response(row):
    """Return the lower-cased text to model for one survey row.

    The cleaned comment is preferred. The raw comment is used instead when
    the cleaned value is missing or equals the string '105'.
    """
    cleaned = row.aq32race_cleaned
    # pd.isna covers None as well as NaN/NA, which the previous `== None`
    # comparison let through to `.lower()` and crashed on.
    if pd.isna(cleaned) or cleaned == '105':
        return row.aq32race.lower()
    return cleaned.lower()


df['aq32race_combined'] = df.apply(_combined_response, axis=1)

df.head()

In [None]:
# Delimiters between descriptors: the words "and"/"or" surrounded by
# whitespace, or any run of commas, brackets and line breaks.
_DESCRIPTION_DELIMITERS = re.compile(r'\sand\s|\sor\s|[,()\r\n]+')


def split_description(description):
    """Split a free-text code description into cleaned descriptors.

    Parameters
    ----------
    description : str or None
        Raw description text. Missing values (None, NaN, pd.NA) are accepted.

    Returns
    -------
    list of str
        Lower-cased descriptors with double quotes and underscores removed
        and surrounding spaces stripped, in the order they appear. Empty
        pieces are dropped; a missing description yields an empty list.
    """
    # check for NULLs -- these can arrive as None or as NaN/NA depending on
    # how the column was loaded
    if pd.isna(description):
        return []

    # split string based on comma delimiters, as well as words in brackets
    pieces = _DESCRIPTION_DELIMITERS.split(description)

    # lower case, remove extra characters and remove spaces
    cleaned = (
        piece.lower().replace('"', '').replace('_', '').strip(' ')
        for piece in pieces
    )

    # remove descriptors that are empty
    return [piece for piece in cleaned if piece != '']

In [None]:
# long form of all possible descriptors used: one row per (code, descriptor)

code_dict_long = { 'code': [], 'description': [] }

for _, row in code_df.iterrows():
    # descriptors can come from any of the three free-text columns
    all_desc = (
        split_description(row.qc_desc)
        + split_description(row.qc_desc_notes)
        + split_description(row.additional_notes)
    )

    # remove duplicates while keeping first-seen order. A set() would also
    # de-duplicate, but its ordering of strings varies between Python
    # sessions, which makes anything sampled from this table later
    # irreproducible.
    all_desc = list(dict.fromkeys(all_desc))

    # append to dictionary (a code without descriptors adds nothing)
    code_dict_long['code'].extend([row.q_code] * len(all_desc))
    code_dict_long['description'].extend(all_desc)

code_df_long = pd.DataFrame(code_dict_long)



In [None]:
# Inspect the descriptors collected for code '105'.
code_df_long[code_df_long.code=='105']

In [None]:
# Find descriptors that still contain a slash ('/' is not one of the
# delimiters split_description splits on).
code_df_long[code_df_long.description.str.contains('/')]

In [None]:
# Display the full long-form code/descriptor table.
code_df_long

In [None]:
# create testing df
# converts the coded columns into wide form 1/0 binary responses for every option
code_list = code_df_long.code.unique()
output_length = len(code_list)

# position of every code inside code_list, so each lookup is a dict access
# rather than an np.where scan
code_position = {code: pos for pos, code in enumerate(code_list)}

# the coded-answer columns q32race_c01 .. q32race_c16
code_columns = [f'q32race_c{ii:02}' for ii in range(1, 17)]

# Rows are collected in a plain list and the frame is built once at the end:
# concatenating a one-row frame per response copies the whole table every
# iteration, and routing the row through np.array turned the 0/1 flags into
# strings.
test_rows = []
for idx, row in df.iterrows():
    response = row.aq32race_combined
    code_vals = [0] * output_length
    for column in code_columns:
        possible_code = row[column]
        if pd.isna(possible_code):
            continue
        # the synthetic-data step strips space padding from these codes
        # before matching; do the same here so padded codes are not dropped
        if isinstance(possible_code, str):
            possible_code = possible_code.strip(' ')
        position = code_position.get(possible_code)
        if position is not None:
            code_vals[position] = 1

    test_rows.append([response] + code_vals)

test_df = pd.DataFrame(test_rows, columns=['response'] + list(code_list))
test_df

In [None]:
# create synthetic data
# this section will create snythetic data that matches a single category based on available phrases 
code_counts = df.q32race_c01.value_counts()
max_counts = code_counts.values[0]
extra_test_df = pd.DataFrame(columns = test_df.columns)

for idx, val in code_counts.items():
    print()
    print_string = f'Code: {idx} -- Observations: {val}'
    print(print_string, end='\r')

    # don't add any more to biggest class 
    if val == max_counts:
        continue
    else:
        if idx=='Human':
            continue
        idx = idx.strip(' ')
        # find all words associated with that index
        desc_list = code_df_long[code_df_long.code==idx].description.values
        code_vals = [0]*len(code_list)
        code_idx = np.where(code_list==idx)[0]
        if len(code_idx) == 0:
            continue
            
        code_vals[code_idx[0]] = 1

        n_more_counts = max_counts - val

        # create extra responses for each category
        for ii in range(n_more_counts):
            print_string = f'Code: {idx} -- Observations: {val} + {ii:04}'
            print(print_string, end='\r')

            # choose from list at random - should choose uniformly from options 
            description = random.choice(desc_list)
            tmp_test = pd.DataFrame(np.array([response] + code_vals).reshape(1, -1), columns = test_df.columns)

            tmp_test.iloc[:, 1:] = tmp_test.iloc[:, 1:].astype(int)

            extra_test_df = pd.concat([extra_test_df, tmp_test])

        print_string = f'Code: {idx} -- Observations: {val} + {ii:04}. Done.'
        print(print_string, end='\r')
            

In [None]:
# Peek at the first synthetic rows.
extra_test_df.head()

In [None]:
# Label matrix only: real plus synthetic rows, text column dropped, cast to int.
train_with_copies_y = pd.concat([test_df, extra_test_df]).drop('response', axis=1).astype(int)

In [None]:
# NOTE(review): `train_with_copies_x` is not defined in any cell visible here
# (only `train_with_copies_y` is) -- as written this raises NameError;
# confirm what was meant to be inspected.
train_with_copies_x.iloc[0, 4]

In [None]:
# keras playground
# Count the distinct space-separated words across all combined responses.
# A dict keeps first-seen order like the list did, but membership checks are
# O(1) instead of a scan of every word collected so far.
seen_words = {}
for response in df['aq32race_combined']:
    for word in response.lower().split(' '):
        # consecutive spaces produce empty pieces; skip them
        if word:
            seen_words.setdefault(word, None)

test = list(seen_words)

len(test)

In [None]:
# hyperparameters
BATCH_SIZE = 64  # rows per batch when the dataset is batched
EPOCHS = 3  # number of epochs passed to fit()
MAX_SEQUENCE_LENGTH = 256 # tokenizer / embedding sequence length; actual max 216
VOCAB_SIZE = 15000  # size requested for the WordPiece vocabulary and used for the embedding

EMBED_DIM = 128  # embedding_dim of the token-and-position embedding
INTERMEDIATE_DIM = 512  # intermediate_dim of every FNetEncoder block

# Tokens passed as reserved_tokens when the WordPiece vocabulary is computed.
reserved_tokens = ["[PAD]", "[UNK]"]

In [None]:
# tokenize data
def train_word_piece(ds, vocab_size, reserved_tokens):
    """Compute a WordPiece vocabulary from the text half of `ds`.

    `ds` is a batched dataset of (text, label) pairs. It is unbatched, the
    labels are dropped, and the text is re-batched in groups of 1000 before
    being handed to keras_nlp. Returns whatever
    `compute_word_piece_vocabulary` returns for the requested `vocab_size`
    and `reserved_tokens`.
    """
    def _text_only(text, _label):
        return text

    text_batches = ds.unbatch().map(_text_only).batch(1000).prefetch(2)
    return keras_nlp.tokenizers.compute_word_piece_vocabulary(
        text_batches,
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )

In [None]:
# Full training table: the real responses followed by the synthetic rows.
df_with_copies = pd.concat([test_df, extra_test_df])

In [None]:
# Wrap the text column and the label columns (cast to int) as two tf.data datasets.
X = tf.data.Dataset.from_tensor_slices(df_with_copies['response'].values)
Y = tf.data.Dataset.from_tensor_slices(df_with_copies.drop('response', axis=1).values.astype(int))

In [None]:
# Inspect the label dataset.
Y

In [None]:
# Inspect the text dataset.
X

In [None]:
# Pair each text with its label vector and batch the pairs; the final,
# possibly smaller batch is kept (drop_remainder=False).
# NOTE(review): nothing in the visible cells shuffles the rows, so batches
# follow the concat order (real rows, then synthetic rows) -- confirm this is
# intended before training.
ds = tf.data.Dataset.zip((X, Y))
ds = ds.batch(BATCH_SIZE, drop_remainder=False)
ds

In [None]:
# Print up to three (text, label) examples from the first batch.
for text_batch, label_batch in ds.take(1):
    # convert each tensor once instead of once per printed row
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    # slicing keeps this safe when the batch holds fewer than three rows
    for text, label in zip(texts[:3], labels[:3]):
        print(text)
        print(label)

In [None]:
# Print the text and label shapes of every batch.
for a, b in ds:
    print(a.shape, b.shape)

In [None]:
# Learn the WordPiece vocabulary from the batched training text.
vocab = train_word_piece(ds, VOCAB_SIZE, reserved_tokens)

In [None]:
# Size of the learned vocabulary.
len(vocab)

In [None]:
# Sample ten vocabulary entries.
vocab[100:110]

In [None]:
# WordPiece tokenizer over the learned vocabulary, with a fixed output length
# of MAX_SEQUENCE_LENGTH. Lower-casing is switched off here; the response
# text and the code descriptors are already lower-cased when they are built.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=False,
    sequence_length=MAX_SEQUENCE_LENGTH
)

In [None]:
# Round-trip check on one example: take the first text of the first batch,
# tokenize it, then detokenize the result.
input_sentence_ex = ds.take(1).get_single_element()[0][0]
input_tokens_ex = tokenizer(input_sentence_ex)

print("Sentence: ", input_sentence_ex)
print("Tokens: ", input_tokens_ex)
print("Recovered text after detokenizing: ", tokenizer.detokenize(input_tokens_ex))


In [None]:
# Inspect the batched (text, label) dataset.
ds

In [None]:
# format dataset
def format_dataset(sentence, label):
    """Tokenize the text of one dataset element and pass its label through.

    Uses the module-level `tokenizer`. Returns a (tokens, label) tuple.
    """
    token_ids = tokenizer(sentence)
    return token_ids, label

def make_dataset(dataset):
    """Return `dataset` with `format_dataset` mapped over every element.

    The mapping runs with tf.data.AUTOTUNE parallelism.
    """
    return dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Tokenize the batched dataset; this is what the model is trained on.
train_ds = make_dataset(ds)

In [None]:
# Inspect the tokenized text of the first batch.
train_ds.take(1).get_single_element()[0]

In [None]:
# build model
# Token ids in, one sigmoid score per code out.
input_ids = keras.Input(shape=(None, ), dtype="int64")

embedding = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding(input_ids)

# three stacked FNet encoder blocks
for _ in range(3):
    x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)

# pool over the sequence, regularise, then score every code
x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.Dropout(0.1)(x)
outputs = keras.layers.Dense(output_length, activation="sigmoid")(x)

fnet_classifier = keras.Model(input_ids, outputs, name="fnet_classifier")

In [None]:
# Print the layer-by-layer summary of the model.
fnet_classifier.summary()

In [None]:
# List the devices TensorFlow can see, and the GPUs specifically.
# NOTE(review): `tf` is not imported in any cell visible here -- confirm.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.config.list_physical_devices('GPU'))

In [None]:
# The model ends in one independent sigmoid per code and the targets are
# multi-hot (a response can carry several codes), so every output is its own
# yes/no problem. Binary cross-entropy is the loss that matches this setup;
# categorical cross-entropy assumes exactly one true class per row.
# The metric is spelled out as element-wise binary accuracy and keeps the
# name "accuracy" so the training logs use the same key as before.
fnet_classifier.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
)

In [None]:
# Train for EPOCHS epochs on the tokenized dataset.
fnet_classifier.fit(train_ds, epochs=EPOCHS)

In [None]:
# Score the same dataset the model was trained on.
out = fnet_classifier.predict(train_ds)

In [None]:
# Inspect the raw predictions.
out

In [None]:
# First row of test_df as an array: the response text followed by the code flags.
test_df.head().values[0, :]

In [None]:
# Look up which code sits at position 17 of code_list.
code_list[17]

In [None]:
# Tokenize a hand-written example response.
test_input = tokenizer(['canadian, french, afro-american'])
test_input

In [None]:
# Predict the per-code scores for the example.
test_out = fnet_classifier.predict(test_input)

In [None]:
# Code positions ordered from lowest to highest score (np.argsort sorts ascending).
np.argsort(test_out[0])

In [None]:
def list_classes_fnet(sentence, code_list, code_df, top_n = 10):
    """Print the `top_n` highest-scoring codes for one sentence.

    The sentence is tokenized with the module-level `tokenizer` and scored
    with the module-level `fnet_classifier`. For each of the best-scoring
    positions, the score is printed as a percentage followed by the
    `qc_desc` of the first `code_df` row whose `q_code` equals the code at
    that position of `code_list`. Nothing is returned.
    """
    scores = fnet_classifier.predict(tokenizer([sentence]))[0]

    # highest score first
    ranking = np.argsort(scores)[::-1]

    print()
    print(f'TOP MATCHES FOR: {sentence}')
    print()

    for class_idx in ranking[:max(top_n, 0)]:
        matching_rows = code_df[code_df['q_code'] == code_list[class_idx]]
        print(f'{scores[class_idx]:0.2%}')
        print(matching_rows['qc_desc'].values[0])
        print()

In [None]:
# Try the helper on a single example phrase.
sentence = 'afro-canadian'
list_classes_fnet(sentence, code_list, code_df)