In [None]:
# system stuff
import re
import os
import random

# connection stuff
import pyodbc

# standard stuff
import pandas as pd
import numpy as np

# nlp stuff
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ml stuff
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# compare how each fuzzywuzzy scorer rates the same pair of strings
match = 'afro-canadian'
test = 'canadian'

for scorer in (
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.token_sort_ratio,
    fuzz.token_set_ratio,
):
    print(scorer(match, test))

In [None]:
cred_path = '../credentials.txt'

# the file may spread the connection string over several lines:
# drop the newline characters and glue the pieces back together
connection_str = ''
with open(cred_path) as infile:
    connection_str = ''.join(piece.strip('\n') for piece in infile)

In [None]:
# open the ODBC connection with the string assembled from the credentials file
connection = pyodbc.connect(connection_str)

In [None]:
# read in data
# df      -> rows of dbo.AQ32RACE restricted to Cycle=1
# code_df -> every row of dbo.AQ32RACE_Codes
responses_query = 'SELECT * FROM dbo.AQ32RACE WHERE Cycle=1'
codes_query = 'SELECT * FROM dbo.AQ32RACE_Codes'

df = pd.read_sql(responses_query, connection)
code_df = pd.read_sql(codes_query, connection)

In [None]:
# clean data headers
def clean_headers(df):
    """Normalise the column names of `df` in place.

    Every column name is lower-cased and its spaces are replaced with
    underscores. The frame is modified directly; nothing is returned.
    """
    def _tidy(name):
        return name.lower().replace(' ', '_')

    df.columns = [_tidy(name) for name in df.columns]



In [None]:
# both frames are renamed in place (clean_headers returns nothing)
clean_headers(df)
clean_headers(code_df)

In [None]:
# get a cleaned up column to use (mix of actual comment column and cleaned)
def _combine_race_response(row):
    """Pick the text to score for one survey row, lower-cased.

    Uses `aq32race_cleaned` when it holds real text. Falls back to the raw
    `aq32race` comment when the cleaned value is missing (None or NaN) or is
    the literal '105'.
    """
    cleaned = row.aq32race_cleaned
    # pd.isna covers both None and NaN; a plain `== None` test lets NaN
    # through, which then fails on `.lower()`.
    # NOTE(review): '105' looks like a code stored in the cleaned column
    # instead of text -- confirm against the data dictionary.
    if pd.isna(cleaned) or cleaned == '105':
        return row.aq32race.lower()
    return cleaned.lower()


df['aq32race_combined'] = df.apply(_combine_race_response, axis=1)

df.head()

In [None]:
def split_description(description):
    """Break a free-text code description into individual lower-cased phrases.

    The text is split on commas, parentheses, line breaks and on the words
    'and' / 'or' when surrounded by whitespace. Splitting happens before
    lower-casing, so only lower-case 'and' / 'or' act as separators. Each
    piece then has double quotes and underscores removed and surrounding
    spaces stripped; empty pieces are dropped.

    Parameters
    ----------
    description : str or None
        Raw description text. Anything that is not a string (None, NaN, ...)
        is treated as "no description".

    Returns
    -------
    list of str
        The cleaned phrases in the order they appear; [] when there are none.
    """
    # check for NULLs: they normally arrive as None, but pandas can also hand
    # over NaN (a float), which would make re.split raise a TypeError
    if not isinstance(description, str):
        return []

    # split string based on comma delimiters, as well as words in brackets
    pieces = re.split(r'\sand\s|\sor\s|[,()\r\n]+', description)

    cleaned = []
    for piece in pieces:
        # lower case, remove extra characters and remove spaces
        phrase = piece.lower().replace('"', '').replace('_', '').strip(' ')
        # remove descriptors that are empty
        if phrase != '':
            cleaned.append(phrase)

    return cleaned

In [None]:
# long form of all possible descriptors used
# one row per (code, descriptor phrase) pair, gathered from the three
# free-text columns of code_df

code_dict_long = { 'code': [], 'description': [] }

for idx, row in code_df.iterrows():
    code = row.q_code

    all_desc = (
        split_description(row.qc_desc)
        + split_description(row.qc_desc_notes)
        + split_description(row.additional_notes)
    )

    # remove duplicates while keeping first-seen order. A set would hand the
    # phrases back in an order that changes between kernel starts (string
    # hashing is randomised per process), so the rows of code_df_long would
    # not be reproducible from run to run.
    all_desc = list(dict.fromkeys(all_desc))

    # append to dictionary (extending with empty lists is a no-op, so codes
    # without any descriptor simply add nothing)
    code_dict_long['code'].extend([code] * len(all_desc))
    code_dict_long['description'].extend(all_desc)

code_df_long = pd.DataFrame(code_dict_long)



In [None]:
code_df_long[code_df_long.code=='105']

In [None]:
code_df_long[code_df_long.description.str.contains('/')]

In [None]:
code_df_long

In [None]:
def get_scores(response, code_df_long, n_limit=16):
    """Score one response against every descriptor phrase with four fuzzywuzzy metrics.

    Parameters
    ----------
    response : str
        Free-text answer; it is lower-cased before scoring.
    code_df_long : pandas.DataFrame
        Long-form lookup with a `description` column of phrases.
    n_limit : int, optional
        Accepted but not used by the function body.

    Returns
    -------
    pandas.DataFrame
        A `response` column (the lower-cased text) followed by one column per
        (description, metric) pair, named '<description>_<metric>'.
    """
    response = response.lower()

    metrics = {
        'ratio': fuzz.ratio,
        'partial': fuzz.partial_ratio,
        'sort': fuzz.token_sort_ratio,
        'set': fuzz.token_set_ratio,
    }

    # one score column per metric, aligned with the rows of code_df_long
    scored = code_df_long.copy()
    for metric_name, metric in metrics.items():
        scored[metric_name] = code_df_long.description.apply(
            lambda phrase, metric=metric: metric(phrase, response)
        )

    # long form (description, variable, value), then spread into a wide frame;
    # pivot_table's default aggregation collapses phrases that appear more than once
    long_scores = pd.melt(scored, id_vars = ['description'], value_vars=list(metrics))
    wide_scores = pd.pivot_table(
        long_scores, columns=['description', 'variable'], values=['value']
    ).reset_index(drop=True)

    # flatten the (description, metric) column pairs into single names
    wide_scores.columns = ['_'.join(pair) for pair in wide_scores.columns]
    wide_scores = wide_scores.rename_axis(None, axis=1)

    # put the response text in front of the score columns
    score_columns = list(wide_scores.columns)
    wide_scores['response'] = response
    return wide_scores[['response'] + score_columns]

In [None]:
# smoke test: score a single response against every descriptor phrase
test = get_scores('canadian', code_df_long)
test

In [None]:
# score each distinct response once and reuse the result for repeated answers.
# get_scores depends only on its arguments, so this produces the same rows, in
# the same order, as scoring every survey row separately -- without repeating
# the fuzzy comparisons whenever an answer occurs more than once
scores_by_response = {
    response_text: get_scores(response_text, code_df_long)
    for response_text in df.aq32race_combined.unique()
}
train_df = pd.concat(
    [scores_by_response[response_text] for response_text in df.aq32race_combined]
).reset_index(drop=True)
train_df

In [None]:
# create testing df
# converts the coded columns into wide form 1/0 binary responses for every option
code_list = code_df_long.code.unique()
output_length = len(code_list)

# position of every code inside code_list, so each lookup is a dict access
# instead of a scan over the whole array
code_position = {code: pos for pos, code in enumerate(code_list)}

# the coded answers are read from q32race_c01 ... q32race_c16
coded_columns = [f'q32race_c{ii:02}' for ii in range(1, 17)]

# collect one record per survey row and build the frame once at the end;
# concatenating inside the loop re-copies the whole frame on every iteration
label_rows = []
for _, row in df.iterrows():
    flags = [0] * output_length
    for column in coded_columns:
        possible_code = row[column]
        # empty slots (None / NaN) carry no code
        if pd.isna(possible_code):
            continue
        pos = code_position.get(possible_code)
        # codes that are not in code_list are ignored
        if pos is not None:
            flags[pos] = 1
    label_rows.append([row.aq32race_combined] + flags)

# the flags stay integers throughout, so no string round-trip or later cast is needed
test_df = pd.DataFrame(label_rows, columns = ['response'] + list(code_list))
test_df

In [None]:
test_df.drop('response', axis=1).sum().sort_values(ascending=False)

In [None]:
# create synthetic data
# this section will create synthetic data that matches a single category based on available phrases
code_counts = test_df.drop('response', axis=1).sum().sort_values(ascending=False)
n_codes = len(code_counts)
max_counts = code_counts.values[0]

# private, seeded generator: the sampled phrases are identical on every run
# and the global `random` state is left untouched
synthetic_rng = random.Random(0)

# per-code pieces are collected here and concatenated once after the loop
extra_test_parts = []
extra_train_parts = []

for idx, val in code_counts.items():
    print()
    print_string = f'Code: {idx} -- Observations: {val}'
    print(print_string, end='\r')

    # don't add any more to biggest class
    if val == max_counts:
        continue

    # NOTE(review): 'Human' is skipped explicitly; the reason is not visible
    # in this notebook -- confirm
    if idx=='Human':
        continue
    idx = idx.strip(' ')

    # locate index of this code in code list
    code_idx = np.where(code_list==idx)[0]
    if len(code_idx) == 0:
        continue

    # find all words associated with that index
    desc_list = code_df_long[code_df_long.code==idx].description.values

    # top this code up to the size of the biggest class
    n_more_counts = max_counts - val

    # randomly select synthetic data (sampling with replacement)
    sampled_phrases = synthetic_rng.choices(desc_list, k=n_more_counts)
    random_df = pd.DataFrame(columns = ['response'], data = sampled_phrases)

    # create outputs: every synthetic row is labelled with this code only
    code_vals = np.zeros((n_more_counts, n_codes))
    code_vals[:, code_idx] = 1
    output_df = pd.DataFrame(columns = list(code_list), data= code_vals).astype(int)
    output_df = random_df.merge(output_df, left_index=True, right_index=True)

    # create inputs: the sample repeats a small set of phrases many times, so
    # score each distinct phrase once and reuse the resulting row
    phrase_scores = {
        phrase: get_scores(phrase, code_df_long) for phrase in set(sampled_phrases)
    }
    input_df = pd.concat(
        [phrase_scores[phrase] for phrase in sampled_phrases]
    ).reset_index(drop=True)

    # keep the pieces for the final synthetic frames
    extra_test_parts.append(output_df)
    extra_train_parts.append(input_df)

    print_string = f'Code: {idx} -- Observations: {val} + {n_more_counts}. Done.'
    print(print_string, end='\r')

# fall back to empty frames with the expected columns when nothing was generated
if extra_test_parts:
    extra_test_df = pd.concat(extra_test_parts).reset_index(drop=True)
    extra_train_df = pd.concat(extra_train_parts).reset_index(drop=True)
else:
    extra_test_df = pd.DataFrame(columns = test_df.columns)
    extra_train_df = pd.DataFrame(columns = train_df.columns)
            

In [None]:
extra_test_df.head()

In [None]:
extra_test_df.shape

In [None]:
extra_train_df.head()

In [None]:
extra_train_df.shape

In [None]:
def _stack_without_response(real_rows, synthetic_rows):
    """Stack real and synthetic rows, drop the text column and cast the rest to int."""
    stacked = pd.concat([real_rows, synthetic_rows])
    return stacked.drop(columns='response').astype(int)


# model inputs (fuzzy scores) and targets (0/1 per code), real rows followed by synthetic ones
train_with_copies_x = _stack_without_response(train_df, extra_train_df)
train_with_copies_y = _stack_without_response(test_df, extra_test_df)

In [None]:
train_with_copies_y.sum()

In [None]:
train_with_copies_x.iloc[0, 4]

In [None]:
train_with_copies_y.shape

In [None]:
# fit a random forest on the fuzzy-score features; the target has one 0/1 column per code
# (random_state is fixed so the fitted forest is the same on every run)
clf = RandomForestClassifier(random_state = 0).fit(train_with_copies_x, train_with_copies_y)

In [None]:
def list_classes(sentence, code_list, code_df, code_df_long, top_n = 10, min_pct = 0.05, model = None):
    """Print the most likely codes for a free-text response.

    The sentence is scored with `get_scores`, passed through the classifier,
    and the codes are printed from most to least likely as a percentage
    followed by the code's `qc_desc` text.

    Parameters
    ----------
    sentence : str
        Free-text response to classify.
    code_list : sequence
        Codes in the same order as the classifier's output columns.
    code_df : pandas.DataFrame
        Lookup with `q_code` and `qc_desc` columns.
    code_df_long : pandas.DataFrame
        Long-form descriptor table forwarded to `get_scores`.
    top_n : int, optional
        Maximum number of codes to print.
    min_pct : float, optional
        Printing stops at the first code whose probability is below this value.
    model : optional
        Fitted multi-output classifier exposing `predict_proba` and `classes_`.
        Defaults to the notebook-level `clf`.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # fall back to the notebook-level classifier when none is supplied
    if model is None:
        model = clf

    test_input = get_scores(sentence, code_df_long).drop('response', axis=1)
    per_code_proba = model.predict_proba(test_input)

    # probability of label 1 for each code. The column that holds label 1 is
    # looked up in classes_ instead of assuming it is column 1: a code whose
    # training labels were all identical has a single probability column,
    # which belongs to label 0 or to label 1 depending on what was seen.
    predictions = np.zeros(len(per_code_proba))
    for pos, (proba, classes) in enumerate(zip(per_code_proba, model.classes_)):
        positive_col = np.where(np.asarray(classes) == 1)[0]
        if len(positive_col) > 0:
            predictions[pos] = proba[0, positive_col[0]]

    ordered_idx = np.argsort(predictions)[::-1]
    print()
    print(f'TOP MATCHES FOR: {sentence}')
    print()
    for idx in ordered_idx[:max(top_n, 0)]:
        prob = predictions[idx]

        # sorted descending, so everything after this is below the cut-off too
        if prob < min_pct:
            break

        code = code_list[idx]
        matches = code_df.loc[code_df['q_code'] == code, 'qc_desc'].values
        # a code without a row in code_df no longer raises IndexError
        desc = matches[0] if len(matches) > 0 else f'(no description found for code {code!r})'
        print(f'{prob:0.2%}')
        print(desc)
        print()

In [None]:
# example with several comma-separated parts
# NOTE(review): 'canadien' looks like a deliberate misspelling to exercise the fuzzy matching -- confirm
sentence = 'canadien, british, african american'
list_classes(sentence, code_list, code_df, code_df_long)