Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [1]:
# system stuff
import re
import os
import random

# connection stuff
import pyodbc

# standard stuff
import pandas as pd
import numpy as np

# nlp stuff
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ml stuff
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import keras
keras.utils.set_random_seed(42)

In [2]:
cred_path = '../../credentials.txt'

connection_str = ''
with open(cred_path) as infile:
    for line in infile:
        connection_str += line.strip('\n')

In [3]:
connection = pyodbc.connect(connection_str)

In [4]:
# read in data
df = pd.read_sql(
    'SELECT * FROM dbo.AQ32RACE WHERE Cycle=1', 
    connection
)

code_df = pd.read_sql(
    'SELECT * FROM dbo.AQ32RACE_Codes', 
    connection
)

  df = pd.read_sql(
  code_df = pd.read_sql(


In [5]:
# clean data headers 
def clean_headers(df):
    df.columns = [x.lower().replace(' ','_') for x in df.columns]



In [6]:
clean_headers(df)
clean_headers(code_df)

In [7]:
# get a cleaned up column to use (mix of actual comment column and cleaned)
df['aq32race_combined'] = df.apply(
    lambda x: x.aq32race.lower() if x.aq32race_cleaned == None or x.aq32race_cleaned=='105' else x.aq32race_cleaned.lower(), 
    axis=1
)

df.head()

Unnamed: 0,id,q32race,aq32race,aq32race_cleaned,coding_comment,q32race_c01,q32race_c02,q32race_c03,q32race_c04,q32race_c05,...,q32race_c09,q32race_c10,q32race_c11,q32race_c12,q32race_c13,q32race_c14,q32race_c15,q32race_c16,cycle,aq32race_combined
0,37231,105µ97,Second generation Canadian,,,105,97.0,,,,...,,,,,,,,,1,second generation canadian
1,37247,97,Caucasion,,,97,,,,,...,,,,,,,,,1,caucasion
2,37261,97,Canadian,,,97,,,,,...,,,,,,,,,1,canadian
3,37282,105µ97,Brasileiro,,,105,97.0,,,,...,,,,,,,,,1,brasileiro
4,37287,97,Canadian- Spanish,,,97,,,,,...,,,,,,,,,1,canadian- spanish


In [8]:
def split_description(description):

    # check for NULLs
    if description is None:
        return []
        
    # split string based on comma delimiters, as well as words in brackets
    desc_list = re.split(r'\sand\s|\sor\s|[,()\r\n]+', description)

    # lower case, remove extra characters and remove spaces
    desc_list = [x.lower().replace('"', '').replace('_', '').strip(' ') for x in desc_list]

    # remove descriptors that are empty
    desc_list = [x for x in desc_list if x!='']

    return desc_list

In [9]:
# long form of all possible descriptors used

code_dict_long = { 'code': [], 'description': [] }

for idx, row in code_df.iterrows():
    code = row.q_code

    qc_desc = split_description(row.qc_desc)
    qc_desc_notes = split_description(row.qc_desc_notes)
    additional_notes = split_description(row.additional_notes)

    all_desc = qc_desc + qc_desc_notes + additional_notes

    # remove duplicates 
    all_desc = [*set(all_desc)]
    
    n_desc = len(all_desc)

    if n_desc==0:
        continue

    # append to dictionary
    code_dict_long['code'].extend([code]*n_desc)
    code_dict_long['description'].extend(all_desc)

code_df_long = pd.DataFrame(code_dict_long)



In [10]:
code_df_long[code_df_long.code=='105']

Unnamed: 0,code,description
34,105,uk
35,105,balkan
36,105,french
37,105,australian
38,105,anglo-saxon
39,105,eastern european
40,105,irish
41,105,italian
42,105,sapmi
43,105,western european


In [11]:
code_df_long[code_df_long.description.str.contains('/')]

Unnamed: 0,code,description
157,88,i don't know/ i am unsure


In [12]:
code_df_long

Unnamed: 0,code,description
0,10000,indeterminate
1,101,afro-canadian
2,101,jamaican
3,101,african
4,101,nigerian
...,...,...
187,99,none of the above
188,99,prefer not to answer
189,99,i don't have a race
190,99,comment


In [13]:
# create testing df
# converts the coded columns into wide form 1/0 binary responses for every option 
code_list = code_df_long.code.unique()
output_length = len(code_list)

test_df = pd.DataFrame(columns = ['response'] + list(code_list))

for idx, row in df.iterrows():
    response = row.aq32race_combined
    code_vals = [0]*len(code_list)
    for ii in range(1,17):
        column = f'q32race_c{ii:02}'
        possible_code = row[column]
        if possible_code is None:
            continue
        else:
            idx_option = np.where(code_list==possible_code)[0]
            if len(idx_option)>0:
                code_vals[idx_option[0]] = 1

    tmp_df = pd.DataFrame(np.array([response] + code_vals).reshape(1, -1), columns = ['response'] + list(code_list))
    test_df = pd.concat([test_df, tmp_df]).reset_index(drop=True)

test_df.iloc[:, 1:] = test_df.iloc[:, 1:].astype(int)
test_df

Unnamed: 0,response,10000,101,102,103,1041,1042,1043,1044,105,...,90004,90005,90006,90007,90008,90009,90010,90011,97,99
0,second generation canadian,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,caucasion,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,canadian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,brasileiro,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,canadian- spanish,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6626,sri lankan burgher,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
6627,canadian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6628,second generation canadian,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
6629,english,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [14]:
# create synthetic data
# this section will create snythetic data that matches a single category based on available phrases 
code_counts = df.q32race_c01.value_counts()
max_counts = code_counts.values[0]
extra_test_df = pd.DataFrame(columns = test_df.columns)

for idx, val in code_counts.items():
    print()
    print_string = f'Code: {idx} -- Observations: {val}'
    print(print_string, end='\r')

    # don't add any more to biggest class 
    if val == max_counts:
        continue
    else:
        if idx=='Human':
            continue
        idx = idx.strip(' ')
        # find all words associated with that index
        desc_list = code_df_long[code_df_long.code==idx].description.values
        code_vals = [0]*len(code_list)
        code_idx = np.where(code_list==idx)[0]
        if len(code_idx) == 0:
            continue
            
        code_vals[code_idx[0]] = 1

        n_more_counts = max_counts - val

        # create extra responses for each category
        for ii in range(n_more_counts):
            print_string = f'Code: {idx} -- Observations: {val} + {ii:04}'
            print(print_string, end='\r')

            # choose from list at random - should choose uniformly from options 
            description = random.choice(desc_list)
            tmp_test = pd.DataFrame(np.array([response] + code_vals).reshape(1, -1), columns = test_df.columns)

            tmp_test.iloc[:, 1:] = tmp_test.iloc[:, 1:].astype(int)

            extra_test_df = pd.concat([extra_test_df, tmp_test])

        print_string = f'Code: {idx} -- Observations: {val} + {ii:04}. Done.'
        print(print_string, end='\r')
            


Code: 105 -- Observations: 2082
Code: 20001 -- Observations: 2055 + 0026. Done.
Code: 97 -- Observations: 1099 + 0982. Done.
Code: 110 -- Observations: 254 + 1827. Done.
Code: 1044 -- Observations: 181 + 1900. Done.
Code: 112 -- Observations: 141 + 1940. Done.
Code: 1041 -- Observations: 72 + 2009. Done.
Code: 30000 -- Observations: 71 + 2010. Done.
Code: 60000 -- Observations: 65 + 2016. Done.
Code: 10000 -- Observations: 61 + 2020. Done.
Code: 101 -- Observations: 52 + 2029. Done.
Code: 106 -- Observations: 51 + 2030. Done.
Code: 1112 -- Observations: 44 + 2037. Done.
Code: 108 -- Observations: 44 + 2037. Done.
Code: 40000 -- Observations: 43 + 2038. Done.
Code: 99 -- Observations: 41 + 2040. Done.
Code: 20000 -- Observations: 34 + 2047. Done.
Code: 20002 -- Observations: 34 + 2047. Done.
Code: 109 -- Observations: 28 + 2053. Done.
Code: 102 -- Observations: 22 + 2059. Done.
Code: 1042 -- Observations: 22 + 2059. Done.
Code: 1111 -- Observations: 17 + 2064. Done.
Code: 103 -- Observ

In [15]:
extra_test_df.head()

Unnamed: 0,response,10000,101,102,103,1041,1042,1043,1044,105,...,90004,90005,90006,90007,90008,90009,90010,90011,97,99
0,canadian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,canadian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,canadian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,canadian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,canadian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
train_with_copies_y = pd.concat([test_df, extra_test_df]).drop('response', axis=1).astype(int)

In [17]:
train_with_copies_x.iloc[0, 4]

NameError: name 'train_with_copies_x' is not defined

In [18]:
# keras playground
test = []
for idx, row in df.iterrows():
    response = row.aq32race_combined
    words = response.lower().split(' ')
    for word in words:
        word = word.strip(' ')
        if word == '':
            continue
        if word not in test:
            test.append(word)

len(test)

2268

In [19]:
# hyperparameters
BATCH_SIZE = 64
EPOCHS = 3
MAX_SEQUENCE_LENGTH = 256 # actual max 216
VOCAB_SIZE = 15000

EMBED_DIM = 128
INTERMEDIATE_DIM = 512

reserved_tokens = ["[PAD]", "[UNK]"]

In [31]:
import keras_nlp

Using TensorFlow backend


In [32]:
# tokenize data
def train_word_piece(ds, vocab_size, reserved_tokens):
    word_piece_ds = ds.unbatch().map(lambda x, y: x)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )
    return vocab

In [21]:
df_with_copies = pd.concat([test_df, extra_test_df])

In [23]:
import tensorflow as tf

In [24]:
X = tf.data.Dataset.from_tensor_slices(df_with_copies['response'].values)
Y = tf.data.Dataset.from_tensor_slices(df_with_copies.drop('response', axis=1).values.astype(int))

In [25]:
Y

<TensorSliceDataset element_spec=TensorSpec(shape=(61,), dtype=tf.int32, name=None)>

In [26]:
X

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [27]:
ds = tf.data.Dataset.zip((X, Y))
ds = ds.batch(BATCH_SIZE, drop_remainder=False)
ds

<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 61), dtype=tf.int32, name=None))>

In [28]:
for text_batch, label_batch in ds.take(1):
    for i in range(3):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])

b'second generation canadian'
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
b'caucasion'
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
b'canadian'
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]


In [29]:
for a, b in ds:
    print(a.shape, b.shape)

(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64, 61)
(64,) (64,

In [33]:
vocab = train_word_piece(ds, VOCAB_SIZE, reserved_tokens)

In [34]:
len(vocab)

380

In [35]:
vocab[100:110]

['anglo',
 'hong',
 'iranian',
 '##ic',
 '##ing',
 'but',
 '##ed',
 '##a',
 'canada',
 '##o']

In [36]:
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=False,
    sequence_length=MAX_SEQUENCE_LENGTH
)

In [37]:
input_sentence_ex = ds.take(1).get_single_element()[0][0]
input_tokens_ex = tokenizer(input_sentence_ex)

print("Sentence: ", input_sentence_ex)
print("Tokens: ", input_tokens_ex)
print("Recovered text after detokenizing: ", tokenizer.detokenize(input_tokens_ex))


Sentence:  tf.Tensor(b'second generation canadian', shape=(), dtype=string)
Tokens:  tf.Tensor(
[ 46  68 281 332  88 122  58   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0

In [38]:
ds

<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 61), dtype=tf.int32, name=None))>

In [39]:
# format dataset
def format_dataset(sentence, label):
    sentence = tokenizer(sentence)
    return (sentence, label)

def make_dataset(dataset):
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset

In [40]:
train_ds = make_dataset(ds)

In [41]:
train_ds.take(1).get_single_element()[0]

<tf.Tensor: shape=(64, 256), dtype=int32, numpy=
array([[ 46,  68, 281, ...,   0,   0,   0],
       [115,   0,   0, ...,   0,   0,   0],
       [ 58,   0,   0, ...,   0,   0,   0],
       ...,
       [ 58,   0,   0, ...,   0,   0,   0],
       [ 58,   0,   0, ...,   0,   0,   0],
       [ 69,  11,  58, ...,   0,   0,   0]])>

In [42]:
# build model
input_ids = keras.Input(shape=(None, ), dtype="int64")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(input_ids)

x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)

x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.Dropout(0.1)(x)
outputs = keras.layers.Dense(output_length, activation="sigmoid")(x)

fnet_classifier = keras.Model(input_ids, outputs, name="fnet_classifier")

In [43]:
fnet_classifier.summary()

Model: "fnet_classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 128)        1952768   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 f_net_encoder (FNetEncoder)  (None, None, 128)        132224    
                                                                 
 f_net_encoder_1 (FNetEncode  (None, None, 128)        132224    
 r)                                                              
                                                                 
 f_net_encoder_2 (FNetEncode  (None, None, 128)        132224    
 r)                                                

In [44]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.config.list_physical_devices('GPU'))

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12166739713216212592
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5836374016
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9350135564655422580
physical_device_desc: "device: 0, name: NVIDIA RTX A2000 8GB Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [45]:
fnet_classifier.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [46]:
fnet_classifier.fit(train_ds, epochs=EPOCHS)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1d8b3d32ec0>

In [47]:
out = fnet_classifier.predict(train_ds)



In [48]:
out

array([[0.576226  , 0.49136236, 0.23476495, ..., 0.21153957, 0.786703  ,
        0.5846038 ],
       [0.57925856, 0.4924781 , 0.23093706, ..., 0.21222655, 0.78635716,
        0.5883668 ],
       [0.5778354 , 0.49123242, 0.2312302 , ..., 0.21290135, 0.7852449 ,
        0.58655673],
       ...,
       [0.5778297 , 0.4912404 , 0.23122998, ..., 0.21279384, 0.7852368 ,
        0.5865586 ],
       [0.5778297 , 0.4912404 , 0.23122998, ..., 0.21279384, 0.7852368 ,
        0.5865586 ],
       [0.5778297 , 0.4912404 , 0.23122998, ..., 0.21279384, 0.7852368 ,
        0.5865586 ]], dtype=float32)

In [49]:
test_df.head().values[0, :]

array(['second generation canadian', 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0], dtype=object)

In [50]:
code_list[17]

'20000'

In [51]:
test_input = tokenizer(['canadian, french, afro-american'])
test_input

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[ 58,   8,  91,   8,  28, 167, 136, 109,   9,  69,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
   

In [52]:
test_out = fnet_classifier.predict(test_input)



In [53]:
np.argsort(test_out[0])

array([38, 36, 26, 56, 31, 33, 37, 39, 18, 57, 29, 28, 42, 24, 58, 40, 41,
       43, 32,  2, 35,  5, 54,  9, 12, 10, 15,  4, 45, 11, 16, 14, 47, 22,
       19, 44, 51,  1, 53,  7, 20, 50, 46, 23, 27, 49,  0,  3, 60, 13,  6,
       48, 21, 17, 25, 52, 55,  8, 59, 34, 30], dtype=int64)

In [54]:
def list_classes_fnet(sentence, code_list, code_df, top_n = 10):

    test_input = tokenizer([sentence])
    test_out = fnet_classifier.predict(test_input)

    predictions = test_out[0]
    ordered_idx = np.argsort(predictions)[::-1]
    print()
    print(f'TOP MATCHES FOR: {sentence}')
    print()
    for counter, idx in enumerate(ordered_idx):
        if counter>=top_n:
            break
        else:
            prob = predictions[idx]
            code = code_list[idx]
            desc = code_df.loc[code_df['q_code'] == code, 'qc_desc'].values[0]
            print(f'{prob:0.2%}')
            print(desc)
            print()

In [55]:
sentence = 'afro-canadian'
list_classes_fnet(sentence, code_list, code_df)


TOP MATCHES FOR: afro-canadian

99.92%
Hawaiian

97.34%
Indigenous American, Native American

79.46%
Prefer to self-describe

78.82%
European

67.29%
Indigenous n.i.e. and n.o.s

66.18%
Australiasian and Australian

64.03%
North American

63.75%
Melanesia

61.89%
Mennonite

60.36%
Middle-Eastern

