Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
# system stuff
import re
import os
import random

# connection stuff
from sqlalchemy import create_engine
import pyodbc

# standard stuff
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)

# nlp stuff
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ml stuff
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Show how each fuzzywuzzy scorer rates a short response against a longer,
# hyphenated label.
match = 'afro-canadian'
test = 'canadian'

for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(match, test))

In [None]:
# Text file holding the connection string (may be wrapped over several lines).
cred_path = '../credentials.txt'

# Newline characters are removed from every line and the pieces are glued
# back together into one connection string.
connection_str = ''
with open(cred_path) as infile:
    connection_str = ''.join(piece.strip('\n') for piece in infile)

In [None]:
# Open the ODBC connection that the pd.read_sql calls in this notebook use.
connection = pyodbc.connect(connection_str)

In [None]:
# read in data

# Actual open responses
open_responses_sql = 'SELECT * FROM dbo.AQ32RACE WHERE Cycle=1'
df = pd.read_sql(open_responses_sql, connection)

# Codes to match to
codes_sql = 'SELECT * FROM dbo.AQ32RACE_Codes'
code_df = pd.read_sql(codes_sql, connection)

# Closed responses to get which multi response answers are most frequent
closed_responses_sql = 'SELECT * FROM dbo.Q32RACEMultiResponse'
df_closed = pd.read_sql(closed_responses_sql, connection)

In [None]:
# clean data headers 
def clean_headers(df):
    """Normalise the column labels of ``df`` in place.

    Each label is lower-cased and its spaces are replaced by underscores.
    The frame is modified directly; nothing is returned.
    """
    normalised = []
    for label in df.columns:
        normalised.append(label.lower().replace(' ', '_'))
    df.columns = normalised

In [None]:
# Give all three frames lower-case, underscore-separated column names.
for frame in (df, code_df, df_closed):
    clean_headers(frame)
df

In [None]:
# get a cleaned up column to use (mix of actual comment column and cleaned)
def _combined_response(row):
    """Return the lower-cased text to use for one response row.

    The cleaned text is preferred. The raw comment is used instead when the
    cleaned value is missing or is the literal string '105'.
    """
    cleaned = row.aq32race_cleaned
    # pd.isna covers None as well as NaN/NA; a plain `== None` test lets a
    # NaN through to `.lower()`, which then raises AttributeError.
    if pd.isna(cleaned) or cleaned == '105':
        return row.aq32race.lower()
    return cleaned.lower()


df['aq32race_combined'] = df.apply(_combined_response, axis=1)

df.head()

In [None]:
code_df

In [None]:
def split_description(description):
    """Split a free-text code description into normalised descriptor phrases.

    The text is cut on the words " and " / " or " (surrounded by whitespace),
    on commas, on round brackets and on line breaks. Each piece is then
    lower-cased, stripped of double quotes and underscores, and trimmed of
    surrounding spaces. Empty pieces are discarded.

    Parameters
    ----------
    description : str or None
        Raw description text. Any non-string value (None, NaN, ...) is
        treated as "no description".

    Returns
    -------
    list of str
        The cleaned descriptor phrases, in their original order.
    """
    # check for NULLs: a missing value may arrive as None or as a float NaN,
    # and neither can be passed to re.split.
    if not isinstance(description, str):
        return []

    # split string based on comma delimiters, as well as words in brackets
    pieces = re.split(r'\sand\s|\sor\s|[,()\r\n]+', description)

    # lower case, remove extra characters and remove spaces
    cleaned = (
        piece.lower().replace('"', '').replace('_', '').strip(' ')
        for piece in pieces
    )

    # remove descriptors that are empty
    return [piece for piece in cleaned if piece != '']

In [None]:
# long form of all possible descriptors used
# One row per (code, descriptor phrase), built from the three description
# columns of code_df.

code_dict_long = { 'code': [], 'description': [] }

for _, row in code_df.iterrows():
    all_desc = (
        split_description(row.qc_desc)
        + split_description(row.qc_desc_notes)
        + split_description(row.additional_notes)
    )

    # remove duplicates while keeping first-seen order; going through a set
    # would reorder the string descriptors differently on every run, and the
    # row order of code_df_long determines the order of the score columns
    # produced by get_scores.
    all_desc = list(dict.fromkeys(all_desc))

    # codes without any usable descriptor contribute no rows
    if not all_desc:
        continue

    # append to dictionary
    code_dict_long['code'].extend([row.q_code] * len(all_desc))
    code_dict_long['description'].extend(all_desc)

code_df_long = pd.DataFrame(code_dict_long)



In [None]:
code_df_long[code_df_long.code=='105']

In [None]:
code_df_long[code_df_long.description.str.contains('/')]

In [None]:
code_df_long.groupby('description').count().reset_index().sort_values(by='code', ascending=False)

In [None]:
code_df_long[code_df_long.description=='comment']

In [None]:
# Scratch version of the scoring done in get_scores: one column per
# fuzzywuzzy scorer for a single example response.
response = 'canadian'.lower()

scorers = {
    'ratio': fuzz.ratio,
    'partial': fuzz.partial_ratio,
    'sort': fuzz.token_sort_ratio,
    'set': fuzz.token_set_ratio,
}

tmp = code_df_long.copy()
for score_name, scorer in scorers.items():
    tmp[score_name] = code_df_long.description.apply(lambda text: scorer(text, response))
tmp['id'] = code_df_long.code + '_' + code_df_long.description

tmp

In [None]:
def get_scores(response, code_df_long, as_df = False):
    """Score one response against every descriptor in ``code_df_long``.

    Four fuzzywuzzy scores (ratio, partial_ratio, token_sort_ratio and
    token_set_ratio) are computed between the lower-cased response and each
    description. The scores are then stacked into long form, all 'ratio'
    rows first, followed by 'partial', 'sort' and 'set'.

    Parameters
    ----------
    response : str
        Text to score.
    code_df_long : pandas.DataFrame
        Frame with string columns ``code`` and ``description``.
    as_df : bool, default False
        If True return a frame with columns ``col_id`` and ``value``, where
        ``col_id`` is "<code>_<description>_<scorer>". Otherwise return only
        the ``value`` column as a Series.
    """
    response = response.lower()
    descriptions = code_df_long.description

    scorers = (
        ('ratio', fuzz.ratio),
        ('partial', fuzz.partial_ratio),
        ('sort', fuzz.token_sort_ratio),
        ('set', fuzz.token_set_ratio),
    )

    scored = code_df_long.copy()
    for score_name, scorer in scorers:
        scored[score_name] = descriptions.apply(lambda text, fn=scorer: fn(text, response))
    scored['id'] = code_df_long.code + '_' + descriptions

    # wide -> long: one row per (code/description id, scorer)
    long_scores = pd.melt(
        scored,
        id_vars = ['id'],
        value_vars = [score_name for score_name, _ in scorers],
    )
    long_scores['col_id'] = long_scores.id + '_' + long_scores.variable
    long_scores = long_scores[['col_id', 'value']]

    if not as_df:
        return long_scores.value
    return long_scores

In [None]:
# With as_df left at its default, this is the Series of score values only.
test = get_scores('canadian', code_df_long)
test

In [None]:
#%%timeit
# Score the first ten combined responses; each response yields one row of
# score values.
df.iloc[0:10, :].aq32race_combined.apply(lambda x: get_scores(x, code_df_long, as_df=False))

In [None]:
df.head()

In [None]:
get_scores('Canadian', code_df_long, as_df = True)

In [None]:
# Small dry run of the feature build on the first ten responses.
sample_responses = df.iloc[0:10,:].aq32race_combined

bb = sample_responses.apply(lambda text: get_scores(text, code_df_long, as_df=False))

# column labels come from the col_id values of any scored response
headers = list(get_scores('test', code_df_long, as_df = True).col_id.values)
bb.columns = headers

# put the response text in front of its scores
bb['response'] = sample_responses
new_cols = ['response'] + headers
bb = bb[new_cols]
bb.shape

In [None]:
import time

start = time.time()

# score every combined response against every descriptor
all_responses = df.aq32race_combined
train_df = all_responses.apply(lambda text: get_scores(text, code_df_long, as_df=False))

# get headers for input data
headers = list(get_scores('test', code_df_long, as_df = True).col_id.values)
train_df.columns = headers

# response text first, then its score columns
train_df['response'] = all_responses
train_df = train_df[['response'] + headers]
display(train_df.head())

# elapsed time in minutes
end = time.time()
print((end - start)/60)

In [None]:
df.shape

In [None]:
# create testing df
# converts the coded columns into wide form 1/0 binary responses for every option
code_list = code_df_long.code.unique()
output_length = len(code_list)

# position of every code in code_list, so each coded value is a dict lookup
code_position = {code: pos for pos, code in enumerate(code_list)}
coded_columns = [f'q32race_c{ii:02}' for ii in range(1, 17)]

# Rows are collected in a list and the frame is built once at the end;
# concatenating a one-row frame per response copies the whole frame every
# iteration. Values stay integers throughout instead of passing through a
# string array.
target_rows = []
for _, row in df.iterrows():
    code_vals = [0] * output_length
    for column in coded_columns:
        # missing values and codes that are not in code_list are ignored
        pos = code_position.get(row[column])
        if pos is not None:
            code_vals[pos] = 1
    target_rows.append([row.aq32race_combined] + code_vals)

test_df = pd.DataFrame(target_rows, columns = ['response'] + list(code_list))
test_df

In [None]:
test_df.drop('response', axis=1).sum().sort_values(ascending=False)

In [None]:
train_df.columns[1:]

In [None]:
headers

In [None]:
# Sanity check: list any label that appears in train_df's score columns but
# not in headers, then (after the '-' separator) the reverse.
score_columns = train_df.columns[1:]

only_in_train = [label for label in score_columns if label not in headers]
for label in only_in_train:
    print(label)

print('-')

only_in_headers = [label for label in headers if label not in train_df.columns[1:]]
for label in only_in_headers:
    print(label)
        

In [None]:
# create synthetic data
# this section will create synthetic data that matches a single category based on available phrases
# For every code except the most frequent one, descriptor phrases of that code
# are drawn at random (with replacement) until the code has as many positive
# rows as the most frequent code.
code_counts = test_df.drop('response', axis=1).sum().sort_values(ascending=False)
n_codes = len(code_counts)
# counts are sorted in descending order, so the first value is the largest
max_counts = code_counts.values[0]
extra_test_df = pd.DataFrame(columns = test_df.columns)
extra_train_df = pd.DataFrame(columns = train_df.columns)

for idx, val in code_counts.items():
    print()
    print_string = f'Code: {idx} -- Observations: {val}'
    print(print_string, end='\r')

    # don't add any more to biggest class 
    if val == max_counts:
        continue
        
    else:
        # NOTE(review): a column labelled 'Human' is skipped explicitly; the
        # reason is not visible here -- confirm it is still needed.
        if idx=='Human':
            continue
        idx = idx.strip(' ')
        
        # find all words associated with that index
        desc_list = code_df_long[code_df_long.code==idx].description.values
        # this list is not used; code_vals is rebuilt as an array below
        code_vals = [0]*len(code_list)

        # locate index of this code in code list 
        code_idx = np.where(code_list==idx)[0]
        
        # codes that are not in code_list cannot be placed in an output column
        if len(code_idx) == 0:
            continue

        # number of synthetic rows needed to reach the size of the largest class
        n_more_counts = max_counts - val

        # randomly select synthetic data
        random_df = pd.DataFrame(columns = ['response'], data = random.choices(desc_list, k=n_more_counts))

        # create outputs
        # all-zero target rows with only this code's column set to 1
        code_vals = np.zeros((n_more_counts, n_codes))
        code_vals[:, code_idx] = 1
        output_df = pd.DataFrame(columns = list(code_list), data= code_vals).astype(int)
        output_df = random_df.merge(output_df, left_index=True, right_index=True)

        # create inputs
        # same scoring and column layout as train_df
        input_df = random_df.response.apply(lambda x: get_scores(x, code_df_long, as_df=False))
        input_df.columns = headers
        input_df['response'] = random_df.response
        input_df = input_df[['response'] + headers]

        # append to extra synthetic df
        extra_test_df = pd.concat([extra_test_df, output_df]).reset_index(drop=True)
        extra_train_df = pd.concat([extra_train_df, input_df]).reset_index(drop=True)

        print_string = f'Code: {idx} -- Observations: {val} + {n_more_counts}. Done.'
        print(print_string, end='\r')
            

In [None]:
extra_test_df.head()

In [None]:
extra_test_df.shape

In [None]:
extra_train_df.head()

In [None]:
extra_train_df.shape

In [None]:
test_df

In [None]:
# Extract existing combinations from test_df
# (rows with more than one code set; each becomes a tuple of its code labels)
code_columns = test_df.iloc[:, 1:]
multi_response_freq_test = test_df[code_columns.sum(axis=1) > 1].drop('response', axis=1).apply(lambda x: tuple(x.index[x == 1]), axis=1)
multi_response_freq_test = multi_response_freq_test.value_counts().reset_index()
multi_response_freq_test.columns = ['combination', 'frequency']

# Extract combinations from df_closed
# (codes in q32race are separated by the 'µ' character)
df_closed['combination'] = df_closed['q32race'].apply(lambda x: tuple(x.split('µ')))
multi_response_freq_closed = df_closed['combination'].value_counts().reset_index()
multi_response_freq_closed.columns = ['combination', 'frequency']

# Merge the frequency distributions
multi_response_freq = pd.concat([multi_response_freq_test, multi_response_freq_closed])
multi_response_freq = multi_response_freq.groupby('combination').sum().reset_index()

# Normalize frequency for probability
multi_response_freq['frequency'] /= multi_response_freq['frequency'].sum()

# Initialize dataframes
mixed_train_df = pd.DataFrame(columns=train_df.columns)

# Define parameters
n_mixed = 50_000

# Lookups built once instead of filtering code_df_long on every draw:
# column position of each code, and the descriptor phrases of each code in
# their original order.
code_position = {code: pos for pos, code in enumerate(code_list)}
descriptions_by_code = {
    code: group.values
    for code, group in code_df_long.groupby('code', sort=False).description
}

# Iterate to create mixed synthetic data. Rows are collected in a list and the
# frame is built once afterwards; appending with .loc copies the frame on
# every one of the n_mixed iterations.
mixed_rows = []
for jj in range(n_mixed):
    pct_done = int(100*(jj+1)/n_mixed)
    print_str = f'{jj+1:05}/{n_mixed}' + '  |' + '-'*pct_done + '>' + ' '*(100-pct_done-1) + '|'
    print(print_str, end='\r')
    # Choose a random combination based on frequency
    combination = np.random.choice(multi_response_freq['combination'], p=multi_response_freq['frequency'])
    code_vals = [0] * len(code_list)
    phrase_list = []

    for code in combination:
        # codes that are not in code_list have no output column and no phrases
        pos = code_position.get(code)
        if pos is None:
            continue

        code_vals[pos] = 1
        phrase_list.append(random.choice(descriptions_by_code[code]))

    # one random phrase per code, joined into a single synthetic response
    mixed_rows.append([' '.join(phrase_list)] + code_vals)

# code_vals are plain ints, so the code columns need no later conversion
mixed_test_df = pd.DataFrame(mixed_rows, columns=test_df.columns)

In [None]:
# Get training values using existing get_scores function
start = time.time()

mixed_responses = mixed_test_df.response
mixed_train_df = mixed_responses.apply(lambda text: get_scores(text, code_df_long, as_df=False))

# same layout as train_df: response text followed by the score columns
mixed_train_df.columns = headers
mixed_train_df['response'] = mixed_responses
mixed_train_df = mixed_train_df[['response'] + headers]

# elapsed time in minutes
end = time.time()
print((end - start) / 60)

In [None]:
multi_response_freq_closed

In [None]:
multi_response_freq_test

In [None]:
# Concatenate with existing data
final_train_data = pd.concat([train_df, extra_train_df, mixed_train_df], ignore_index=True).drop('response', axis=1).astype(int)
final_test_data = pd.concat([test_df, extra_test_df, mixed_test_df], ignore_index=True).drop('response', axis=1).astype(int)

In [None]:
final_test_data.head()

In [None]:
final_train_data.head()

In [None]:
final_test_data.shape

In [None]:
final_train_data.shape

In [None]:
# Fit a random forest on the score features, with one 0/1 output column per code.
clf = RandomForestClassifier(random_state = 0, verbose=1)
clf.fit(final_train_data, final_test_data)

In [None]:
def list_classes(sentence, code_list, code_df, code_df_long, top_n = 10, min_pct = 0.05):
    """Print the most probable codes for ``sentence`` with their descriptions.

    Uses the module-level classifier ``clf``.

    Parameters
    ----------
    sentence : str
        Response text to classify.
    code_list : sequence
        Codes in the same order as the classifier's output columns.
    code_df : pandas.DataFrame
        Frame with columns ``q_code`` and ``qc_desc``; supplies the text
        printed for each code.
    code_df_long : pandas.DataFrame
        Long (code, description) frame handed to ``get_scores``.
    top_n : int, default 10
        Maximum number of codes printed.
    min_pct : float, default 0.05
        Printing stops at the first code whose probability is below this.
    """
    # get_scores returns one row per feature. The classifier needs a single
    # sample (one row) whose columns are those features, labelled by col_id.
    scores = get_scores(sentence, code_df_long, as_df=True)
    test_input = pd.DataFrame(
        [scores['value'].to_numpy()],
        columns=scores['col_id'].to_numpy(),
    )

    # one probability array per output column
    test_out = clf.predict_proba(test_input)

    # Probability of the second class for each output. An output whose
    # probability array has a single column is given probability 0.
    predictions = np.array([
        proba[0, 1] if proba.shape[1] > 1 else 0.0
        for proba in test_out
    ])

    ordered_idx = np.argsort(predictions)[::-1]
    print()
    print(f'TOP MATCHES FOR: {sentence}')
    print()
    for idx in ordered_idx[:top_n]:
        prob = predictions[idx]

        # probabilities are in descending order, so nothing later qualifies
        if prob < min_pct:
            break

        code = code_list[idx]
        desc = code_df.loc[code_df['q_code'] == code, 'qc_desc'].values[0]
        print(f'{prob:0.2%}')
        print(desc)
        print()

In [None]:
# typo autocorrection packages
# NOTE(review): later cells in this file call SymSpell(...) and TextBlob(...),
# but both imports below are commented out; those cells need the matching
# import enabled before they can run.
#from symspellpy import SymSpell
from autocorrect import Speller
#from textblob import TextBlob

In [None]:
# Use the autocorrect package to correct typos
def correct_spelling(sentence):
    """Return ``sentence`` after passing it through a fresh autocorrect ``Speller``."""
    return Speller()(sentence)

In [None]:
def get_scores_from_df(response_df, response_column, code_df_long, headers=None):
    """Score every response in a frame and return the scores in wide form.

    Parameters
    ----------
    response_df : pandas.DataFrame
        Frame holding the responses.
    response_column : str
        Name of the column in ``response_df`` containing the response text.
    code_df_long : pandas.DataFrame
        Long (code, description) frame handed to ``get_scores``.
    headers : iterable of str, optional
        Labels for the score columns (list, tuple, pandas Index, ...). A
        leading 'response' entry is ignored. When omitted, the labels are
        the ``col_id`` values produced by ``get_scores``.

    Returns
    -------
    pandas.DataFrame
        A 'response' column followed by one column per score.
    """
    if headers is None:
        headers = list(get_scores('test', code_df_long, as_df = True).col_id.values)
    else:
        # Force a plain list: with a pandas Index, `['response'] + headers`
        # below would add element-wise instead of prepending a label.
        headers = list(headers)
        # only want the non 'response' columns from an input list of headers
        if headers and headers[0] == 'response':
            headers = headers[1:]

    responses = response_df[response_column]
    scores = responses.apply(lambda x: get_scores(x, code_df_long, as_df=False))
    scores.columns = headers
    scores['response'] = responses

    return scores[['response'] + headers]

In [None]:
get_scores_from_df(pd.DataFrame({'response': [sentence]}), 'response', code_df_long, headers=final_train_data.columns)

In [None]:
# Misspelled example: correct it with correct_spelling, then print the top matches.
sentence = 'canadien, americn, britsh'
corrected_sentence = correct_spelling(sentence)
list_classes(corrected_sentence, code_list, code_df, code_df_long)

In [None]:
#try using textblob to autocorrect typos
# NOTE(review): TextBlob appears in this file only as a commented-out import,
# so that import has to be enabled before this cell can run.
sentence = "Blanche canadienne"
corrected_sentence = str(TextBlob(sentence).correct())
list_classes(corrected_sentence, code_list, code_df, code_df_long)

In [None]:
#try self-defining dictionary for correct label names
from collections import Counter

# The file holds one "<term> <count>" line per distinct word of the descriptors.
# NOTE(review): the dictionary is loaded with term_index=0 / count_index=1 and,
# presumably, a single space as separator. A multi-word descriptor written on
# one line would then put its second word in the count column, so descriptors
# are written word by word here -- confirm against the symspellpy loader.
term_counts = Counter(
    word
    for category in code_df_long['description']
    for word in category.split()
)

with open('custom_dictionary.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{term} {count}\n" for term, count in term_counts.items())

In [None]:
# Build the SymSpell instance and load the terms from 'custom_dictionary.txt'
# (term in the first field, count in the second).
# NOTE(review): SymSpell appears in this file only as a commented-out import,
# so that import has to be enabled before this cell can run.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary('custom_dictionary.txt', term_index=0, count_index=1)


In [None]:
# Correct a multi-part example with sym_spell.lookup_compound (taking the
# first suggestion), then print the top matches for the corrected text.
sentence = 'Canadian, white, scot/Irish back ground'
corrected_sentence = sym_spell.lookup_compound(sentence, max_edit_distance=2)[0].term
list_classes(corrected_sentence, code_list, code_df, code_df_long)

In [None]:
# Use raw data to test the model
# The column names of this frame are left exactly as the query returns them.
df_validation = pd.read_sql(
    'SELECT * FROM dbo.AQ32RACE_TEST WHERE Cycle=1', 
    connection
)

In [None]:
df_validation

In [None]:
def list_classes_code(sentence, code_list, code_df, code_df_long, top_n=10, min_pct=0.05):
    """Return the most probable codes for ``sentence`` with their scores.

    Uses the module-level classifier ``clf``.

    Parameters
    ----------
    sentence : str
        Response text to classify.
    code_list : sequence
        Codes in the same order as the classifier's output columns.
    code_df : pandas.DataFrame
        Not used for the result; kept so the signature matches
        ``list_classes``.
    code_df_long : pandas.DataFrame
        Long (code, description) frame handed to ``get_scores``.
    top_n : int, default 10
        Maximum number of codes returned.
    min_pct : float, default 0.05
        Codes with a probability below this are left out.

    Returns
    -------
    dict
        Code -> probability as a percentage (0-100), most probable first.
    """
    # get_scores returns one row per feature. The classifier needs a single
    # sample (one row) whose columns are those features, labelled by col_id.
    scores = get_scores(sentence, code_df_long, as_df=True)
    test_input = pd.DataFrame(
        [scores['value'].to_numpy()],
        columns=scores['col_id'].to_numpy(),
    )

    # one probability array per output column
    test_out = clf.predict_proba(test_input)

    # Probability of the second class for each output. An output whose
    # probability array has a single column is given probability 0.
    predictions = np.array([
        proba[0, 1] if proba.shape[1] > 1 else 0.0
        for proba in test_out
    ])

    results = {}
    for idx in np.argsort(predictions)[::-1][:top_n]:
        prob = predictions[idx]

        # probabilities are in descending order, so nothing later qualifies
        if prob < min_pct:
            break

        # keyed by the code itself rather than its description
        results[code_list[idx]] = prob * 100  # Storing the result as a percentage

    return results


In [None]:
# Set a score threshold
# (list_classes_code reports scores as percentages)
score_threshold = 50

# Create a DataFrame to store the results
column_names = ['ID'] + ['Q32RACE_C' + str(i).zfill(2) for i in range(1, 17)]
n_code_slots = len(column_names) - 1

# Rows are collected in a list and the frame is built once afterwards;
# appending with .loc copies the frame on every iteration.
result_rows = []

# Iterate through df_validation
for index, row in df_validation.iterrows():
    sentence = row['AQ32RACE']

    # a missing response cannot be spell-corrected or scored
    if not isinstance(sentence, str):
        continue

    corrected_sentence = sym_spell.lookup_compound(sentence, max_edit_distance=2)[0].term
    categories = list_classes_code(corrected_sentence, code_list, code_df, code_df_long)

    # Filter by score and append to results
    # (the slice keeps a row from ever exceeding the available code columns)
    filtered_categories = [cat for cat, score in categories.items() if score > score_threshold][:n_code_slots]
    if filtered_categories:
        padding = [None] * (n_code_slots - len(filtered_categories))
        result_rows.append([row['ID']] + filtered_categories + padding)

results_df = pd.DataFrame(result_rows, columns=column_names)

In [None]:
results_df.head(100)

In [None]:
# Merge the DataFrames on the ID column
# df's headers were lower-cased by clean_headers, while results_df uses
# upper-case names; upper-case a copy of df's headers so that 'ID' and the
# Q32RACE_Cxx columns line up between the two frames.
reference_df = df.rename(columns=str.upper)
combined_df = pd.merge(results_df, reference_df, on='ID', suffixes=('_results', '_df'))

# Initialize a DataFrame to store differences
comparison = pd.DataFrame(index=combined_df.index)

# Iterate through the columns and compare
for i in range(1, 17):
    col_name = f'Q32RACE_C{i:02d}'
    predicted = combined_df[col_name + '_results']
    coded = combined_df[col_name + '_df']
    # two empty slots are a match, not a difference (missing != missing would
    # otherwise evaluate to True)
    both_missing = predicted.isna() & coded.isna()
    comparison[col_name] = predicted.ne(coded) & ~both_missing

# Optional: Filter to rows with differences
# The row mask is taken from the comparison columns only, before ID is added:
# a non-zero ID is truthy and would make every row look different.
has_difference = comparison.any(axis=1)
differences = comparison[has_difference].copy()
differences['ID'] = combined_df.loc[has_difference, 'ID']

# The differences DataFrame now contains a True/False value for each comparison, with True indicating a difference.

# The differences DataFrame now contains a True/False value for each comparison, with True indicating a difference.


In [None]:
# Share of the rows in `differences` whose first code column is False,
# i.e. whose Q32RACE_C01 comparison did not flag a difference.
false_count = (differences['Q32RACE_C01'] == False).sum()
total_count = differences['Q32RACE_C01'].count()

ratio = false_count / total_count
ratio

In [None]:
# Return the results to database

from sqlalchemy import create_engine
# `import urllib` alone does not load the `parse` submodule; import it
# explicitly so urllib.parse.quote_plus does not depend on another library
# having imported it first.
import urllib.parse

# Read the credentials from your file
# (newlines are dropped so a wrapped connection string is rejoined)
with open(cred_path) as infile:
    connection_str = ''.join(line.strip('\n') for line in infile)

# Create a URL for SQLAlchemy's engine
# (the raw ODBC connection string is URL-quoted and passed as odbc_connect)
params = urllib.parse.quote_plus(connection_str)
engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)

# Write DataFrame back to the database 
results_df.to_sql('AQ32RACE_Result', con=engine, if_exists='replace', index=False) # if_exists can be 'append' if want to add to an existing table