Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

## Imports and Parameters

In [1]:
# add our stuff to the path
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# other stuff
import time
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', None)

# import our stuff
from importlib import reload
from src import connect, preprocess, synthetic, model

In [2]:
# PARAMETERS
# Notebook-wide settings; every later cell reads these names.

# info to gain access to database, IDIR restricted
# (passed to connect.create_connection and to model.save_model / model.load_model)
CRED_PATH = '../credentials.txt'

# where model is stored. requires credentials.txt for full path
# (handed to model.save_model / model.load_model together with CRED_PATH)
MODEL_BASE_PATH = 'Model/Q22'

# which tables to access
RESPONSE_TABLE = 'dbo.AQ22ANCES'            # open responses, loaded into df_open
CODE_TABLE = 'dbo.AQ22ANCES_Codes'          # code list, loaded into code_df
CLOSED_TABLE = 'dbo.Q22ANCESMultiResponse'  # closed responses, loaded into df_closed
RESULTS_TABLE = 'dbo.AQ22ANCES_RESULTS'     # target table for results_df in the final cell

# which column to use/create
# RESPONSE_COLUMN is the lower-cased, spell-corrected text column added to df.
# OUTPUT_COLUMNS / N_COLUMNS are passed to preprocess.get_outputs_wide and
# model.produce_results; presumably the prefix and count of the coded columns
# (q22ances_c01 .. q22ances_c05 in the displayed frames) -- TODO confirm.
RESPONSE_COLUMN = 'aq22_cleaned'
OUTPUT_COLUMNS = 'q22ances_c'
N_COLUMNS = 5

# delimiter to send back with concatenated results
# NOTE(review): the q22ances values displayed further down (e.g. '24µ97') appear
# to be joined with the micro sign (U+00B5), whereas this literal appears to be
# the Greek small letter mu (U+03BC) -- confirm both are meant to be the same
# character before results are written back.
DELIMITER = 'μ' 

# amount of synthetic data to produce
N_PER_CATEGORY = 10_000       # n_per_category for synthetic.create_single_phrase_synthetic
N_SYNTHETIC_MIXED = 200_000   # row count for synthetic.create_multi_phrase_synthetic

# threshold for accepting as a flagged category
THRESHOLD = 0.5

# upper and lower thresholds for flagging as a possible category
TENTATIVE_UPPER = 0.75
TENTATIVE_LOWER = 0.25

## Read from Database

In [3]:
# Read in all data required to build model
# NOTE(review): the warning fragment in this cell's output points at the
# pd.read_sql call behind fetch_table -- presumably pandas' warning about
# non-SQLAlchemy connections. The final save cell builds its connection with
# create_connection(..., sqlalchemy=True); confirm whether that could be used here too.
connection = connect.create_connection(CRED_PATH)

# actual responses
df_open = connect.fetch_table(RESPONSE_TABLE, connection)

# codes to match
code_df = connect.fetch_table(CODE_TABLE, connection)

# closed responses (for multi response frequencies)
df_closed = connect.fetch_table(CLOSED_TABLE, connection)

  df = pd.read_sql(f'SELECT * FROM {table_name}', connection)


In [4]:
df_open.head()

Unnamed: 0,id,q22ances1,aq22ances1,q22ances_c01,q22ances_c02,q22ances_c03,q22ances_c04,q22ances_c05,q22ances2,aq22ances2,...,q22ances_c26,q22ances_c27,q22ances_c28,cycle,coding_comment,aq22ances1_cleaned,aq22ances2_cleaned,aq22ances3_cleaned,aq22ances4_cleaned,aq22ances5_cleaned
0,39521,,,,,,,,,,...,,,,2,,,,,,
1,39522,,,,,,,,,,...,,,,2,,,,,,
2,39525,,,,,,,,,,...,,,,2,,,,,,
3,39527,,,,,,,,,,...,,,,2,,,,,,
4,39529,,,,,,,,,,...,,,,2,,,,,,


In [5]:
# Reshaping the dataframe
df_reshaped = preprocess.reshape_df(df_open)

In [6]:
# filter to only cycle 1 for now 
df_reshaped[df_reshaped.cycle == 1].head()

Unnamed: 0,id,cycle,q22ances,aq22ances,q22ances_c01,q22ances_c02,q22ances_c03,q22ances_c04,q22ances_c05,origin
1127,4,1,97,74% North Western European,42,,,,,4
7177,7,1,97,Germany,44,,,,,4
7238,11,1,24µ97,brazil,24,,,,,2
1445,13,1,97,"Scottland, England & Germany",42,44.0,,,,4
7425,15,1,97,Kenya,14,,,,,1


## Preprocess Data

* Use only cycle 1 for training.
* Use the cleaned column.
* Correct spelling where possible (this step takes longer to run).
* Create input and output tables for the model (the input table takes a while to produce).

In [7]:
# for training purposes, open responses should only be those from cycle 1
# reset_index(drop=True) discards the index carried over from df_reshaped so the
# training frame is numbered 0..n-1.
df = df_reshaped[df_reshaped.cycle == 1].reset_index(drop=True)
df.head()

Unnamed: 0,id,cycle,q22ances,aq22ances,q22ances_c01,q22ances_c02,q22ances_c03,q22ances_c04,q22ances_c05,origin
0,4,1,97,74% North Western European,42,,,,,4
1,7,1,97,Germany,44,,,,,4
2,11,1,24µ97,brazil,24,,,,,2
3,13,1,97,"Scottland, England & Germany",42,44.0,,,,4
4,15,1,97,Kenya,14,,,,,1


In [8]:
# Build RESPONSE_COLUMN: the raw open-text answer (`aq22ances`), stringified
# and lower-cased. Missing answers stay missing (NaN).
# pd.notna is used rather than `is not None` so that NaN / pd.NA answers are
# also treated as missing; otherwise they would become the literal text 'nan'
# and be handed to the spelling corrector as if they were real responses.
# NOTE(review): df_open carries aq22ances*_cleaned columns, but the reshaped
# frame displayed above does not, so only the raw column is used here --
# confirm that is intended.
df[RESPONSE_COLUMN] = [
    str(answer).lower() if pd.notna(answer) else np.nan
    for answer in df['aq22ances']
]

# Now only apply correct_spelling on non-null values
mask = df[RESPONSE_COLUMN].notna()
df.loc[mask, RESPONSE_COLUMN] = df.loc[mask, RESPONSE_COLUMN].apply(preprocess.correct_spelling)

df.head()

Unnamed: 0,id,cycle,q22ances,aq22ances,q22ances_c01,q22ances_c02,q22ances_c03,q22ances_c04,q22ances_c05,origin,aq22_cleaned
0,4,1,97,74% North Western European,42,,,,,4,74% north western european
1,7,1,97,Germany,44,,,,,4,germany
2,11,1,24µ97,brazil,24,,,,,2,brazil
3,13,1,97,"Scottland, England & Germany",42,44.0,,,,4,"scotland, england & germany"
4,15,1,97,Kenya,14,,,,,1,kenya


In [9]:
reload(preprocess)

<module 'src.preprocess' from 'C:\\Users\\lfredric\\Work\\dev\\github_repos\\bcgov\\demo-nlp\\notebooks\\..\\src\\preprocess.py'>

In [10]:
# get long form table of codes: step 1: turn countries into rows
code_df_long_tmp = preprocess.get_long_form_codes(code_df)
code_df_long_tmp.head(20)

Unnamed: 0,code,code_desc,description
0,88,I'm not able to be more specific,i'm not able to be more specific
1,99,Prefer not to answer,prefer not to answer
2,80000,Comment,comment
3,11,North Africa,north africa
4,11,North Africa,libyan arab jamahiriya
5,11,North Africa,sudan
6,11,North Africa,algeria
7,11,North Africa,western sahara
8,11,North Africa,tunisia
9,11,North Africa,morocco


In [12]:
# step 2: add additional rows for each country (nationalities, other spellings, etc.)
code_df_long = preprocess.get_long_form_codes_q22(code_df_long_tmp)

In [104]:
code_df_long.head()

Unnamed: 0,code,code_desc,description
0,80000,Comment,comment
1,11,North Africa,north africa
2,11,North Africa,sudan
3,11,North Africa,sudanese
4,11,North Africa,algeria


In [14]:
# build a training dataset for the model
# first part of dataset: actual data
# Cast codes and responses to str before scoring.
# NOTE: astype(str) turns any missing response into the literal text 'nan'.
code_df_long['code'] = code_df_long['code'].astype(str)
df[RESPONSE_COLUMN] = df[RESPONSE_COLUMN].astype(str)

# INPUTS TO MODEL
# Column ids are taken from scoring a throwaway string ('test'); only the
# col_id values of that result are kept, the scores themselves are discarded.
headers = list(preprocess.get_scores('test', code_df_long, as_df = True).col_id.values)

# Score every response against the code list. input_columns is captured
# BEFORE convert_input, so it holds the pre-conversion column names (it is
# later passed to the synthetic-data helpers).
input_df = preprocess.get_scores_from_df(df, RESPONSE_COLUMN, code_df_long, headers=headers)
input_columns = list(input_df.columns)
# presumably collapses the per-phrase score columns into the per-code columns
# shown in the output below -- TODO confirm against preprocess.convert_input
input_df = preprocess.convert_input(input_df)
display(input_df.head())

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


Unnamed: 0,response,10,11,12,13,14,15,20,21,22,...,35,41,42,43,44,51,52,53,54,80
0,74% north western european,36.5,62.0,58.25,46.5,56.25,48.5,31.25,54.5,50.0,...,57.0,68.0,74.0,69.25,78.5,43.25,40.0,49.25,49.25,27.75
1,germany,44.25,69.5,69.5,51.75,53.5,44.5,23.75,62.0,53.75,...,71.0,62.0,69.0,57.0,100.0,43.5,46.5,56.25,53.5,29.0
2,brazil,24.0,48.25,51.25,50.0,44.5,59.25,34.75,44.5,52.25,...,64.5,49.0,40.0,57.0,34.5,50.0,33.0,52.5,38.75,0.0
3,"scotland, england & germany",39.25,52.25,59.75,62.0,62.75,49.75,30.0,58.75,58.5,...,68.0,60.0,100.0,58.25,89.25,58.75,53.5,47.5,58.0,26.25
4,kenya,34.75,45.5,61.75,45.0,100.0,47.0,28.75,51.5,61.75,...,62.5,42.25,55.0,49.5,48.5,47.5,42.25,40.0,55.0,35.75


In [107]:
preprocess.get_scores('test', code_df_long, as_df=False)

0       36
1       12
2       22
3       33
4       18
        ..
1955    22
1956    20
1957    40
1958     0
1959    35
Name: value, Length: 1960, dtype: int64

In [15]:
# OUTPUTS OF MODEL
# converts the coded columns into wide form 1/0 binary responses for every option
output_df = preprocess.get_outputs_wide(df, RESPONSE_COLUMN, code_df_long, OUTPUT_COLUMNS, N_COLUMNS)

# column order of the output table; passed to the synthetic-data helpers below
output_columns = list(output_df.columns)

# Preview goes last: a notebook cell only renders its final expression, so a
# .head() placed before the assignment above would be silently discarded.
output_df.head()

## Create Synthetic Data

* To augment our training data, produce synthetic data from the available phrases in the code list. 
* Both singular phrases and multi-response phrases are produced.
* The multi-response phrases are randomly generated according to weights associated with the non-written responses. 

In [16]:
reload(synthetic)

<module 'src.synthetic' from 'C:\\Users\\lfredric\\Work\\dev\\github_repos\\bcgov\\demo-nlp\\notebooks\\..\\src\\synthetic.py'>

In [17]:
# create synthetic data
# this section will create synthetic data that matches a single category based on available phrases
# Returns a pair of frames (inputs, outputs) that are later chunk-converted and
# concatenated into the training set.
# NOTE(review): with use_given = False the progress log reads
# "Observations: 0 + 10000" for every code, i.e. presumably none of the real
# observations are reused and all N_PER_CATEGORY rows are synthetic -- confirm.
extra_input_df, extra_output_df = synthetic.create_single_phrase_synthetic(
    output_df, 
    input_columns,
    output_columns,
    code_df_long,
    n_per_category = N_PER_CATEGORY,
    use_given = False
)


01/26 -- Code: 42 -- Observations: 3336

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


01/26 -- Code: 42 -- Observations: 0 + 10000. Done.
02/26 -- Code: 44 -- Observations: 2539

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


02/26 -- Code: 44 -- Observations: 0 + 10000. Done.
03/26 -- Code: 41 -- Observations: 1064

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


03/26 -- Code: 41 -- Observations: 0 + 10000. Done.
04/26 -- Code: 43 -- Observations: 540

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


04/26 -- Code: 43 -- Observations: 0 + 10000. Done.
05/26 -- Code: 21 -- Observations: 456

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


05/26 -- Code: 21 -- Observations: 0 + 10000. Done.
06/26 -- Code: 32 -- Observations: 206

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


06/26 -- Code: 32 -- Observations: 0 + 10000. Done.
07/26 -- Code: 33 -- Observations: 131

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


07/26 -- Code: 33 -- Observations: 0 + 10000. Done.
08/26 -- Code: 34 -- Observations: 129

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


08/26 -- Code: 34 -- Observations: 0 + 10000. Done.
09/26 -- Code: 24 -- Observations: 124

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


09/26 -- Code: 24 -- Observations: 0 + 10000. Done.
10/26 -- Code: 23 -- Observations: 96

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


10/26 -- Code: 23 -- Observations: 0 + 10000. Done.
11/26 -- Code: 35 -- Observations: 85

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


11/26 -- Code: 35 -- Observations: 0 + 10000. Done.
12/26 -- Code: 12 -- Observations: 58

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


12/26 -- Code: 12 -- Observations: 0 + 10000. Done.
13/26 -- Code: 15 -- Observations: 46

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


13/26 -- Code: 15 -- Observations: 0 + 10000. Done.
14/26 -- Code: 14 -- Observations: 40

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


14/26 -- Code: 14 -- Observations: 0 + 10000. Done.
15/26 -- Code: 11 -- Observations: 37

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


15/26 -- Code: 11 -- Observations: 0 + 10000. Done.
16/26 -- Code: 100 -- Observations: 27

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


16/26 -- Code: 100 -- Observations: 0 + 10000. Done.
17/26 -- Code: 201 -- Observations: 24

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


17/26 -- Code: 201 -- Observations: 0 + 10000. Done.
18/26 -- Code: 31 -- Observations: 21

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


18/26 -- Code: 31 -- Observations: 0 + 10000. Done.
19/26 -- Code: 22 -- Observations: 14

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


19/26 -- Code: 22 -- Observations: 0 + 10000. Done.
20/26 -- Code: 80000 -- Observations: 14

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


20/26 -- Code: 80000 -- Observations: 0 + 10000. Done.
21/26 -- Code: 13 -- Observations: 10

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


21/26 -- Code: 13 -- Observations: 0 + 10000. Done.
22/26 -- Code: 52 -- Observations: 9

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


22/26 -- Code: 52 -- Observations: 0 + 10000. Done.
23/26 -- Code: 51 -- Observations: 5

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


23/26 -- Code: 51 -- Observations: 0 + 10000. Done.
24/26 -- Code: 54 -- Observations: 3

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


24/26 -- Code: 54 -- Observations: 0 + 10000. Done.
25/26 -- Code: 202 -- Observations: 2

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


25/26 -- Code: 202 -- Observations: 0 + 10000. Done.
26/26 -- Code: 53 -- Observations: 0

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


26/26 -- Code: 53 -- Observations: 0 + 10000. Done.

In [18]:
# create synthetic data
# this section will create synthetic data that matches multiple categories
# df_closed (the closed multi-response table) is supplied here; per the section
# notes it presumably provides the weights used when combining categories.
# N_SYNTHETIC_MIXED is the number of mixed rows requested (the progress bar
# below counts up to it).
mixed_input_df, mixed_output_df = synthetic.create_multi_phrase_synthetic(
    output_df,
    df_closed,
    input_columns,
    output_columns,
    code_df_long,
    N_SYNTHETIC_MIXED
)

200000/200000  |---------------------------------------------------------------------------------------------------->|
Creating synthetic data outputs... Done.
Creating synthetic data inputs... 

  df = response_df[response_column].apply(lambda x: get_scores(x, code_df_long, as_df=False))


Done.


In [19]:
mixed_input_df.head()

Unnamed: 0,response,80000_comment_ratio,11_north africa_ratio,11_sudan_ratio,11_sudanese_ratio,11_algeria_ratio,11_algerian_ratio,11_western sahara_ratio,11_sahrawi_ratio,11_tunisia_ratio,...,54_samoan_set,54_french polynesia_set,54_french polynesian_set,54_cook islander_set,54_cook islands_set,54_tonga_set,54_tongan_set,201_middle east_set,202_hawaii_set,100_indeterminate_set
0,jan mayen islands luxembourg,17,25,18,28,23,28,29,17,23,...,35,23,27,39,74,18,24,26,12,39
1,saint pierre bulgaria swedish western europe,8,25,16,23,24,27,31,24,20,...,16,23,26,32,29,8,12,22,16,28
2,jersey monegasque,17,21,18,40,25,24,32,17,17,...,35,42,41,20,21,36,35,36,9,40
3,russian federation belgian,18,32,26,29,30,35,30,30,30,...,25,38,42,31,26,26,31,27,25,41
4,bulgarian northern ireland netherlands,18,36,19,22,27,30,35,18,18,...,14,26,29,31,28,14,14,24,14,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,hungarian dutch,18,37,30,26,36,43,28,27,36,...,29,39,44,36,37,40,48,23,29,36
199996,miquelon sark liechtensteiner,22,29,18,27,22,27,33,22,28,...,23,36,35,29,29,24,23,35,17,43
199997,norwegian luxembourger,21,29,15,27,34,33,28,21,21,...,29,37,41,29,29,30,36,18,14,29
199998,tajikistan china indonesian sri lankan azerbaijan,14,26,15,21,21,25,32,21,21,...,15,34,36,26,26,11,15,20,18,26


In [20]:
# Cut both synthetic input frames into consecutive row slices of at most
# `chunk_size` rows each; the following cells convert them slice by slice.
chunk_size = 1000
chunks_extra_input = [
    extra_input_df.iloc[start:start + chunk_size]
    for start in range(0, len(extra_input_df), chunk_size)
]
chunks_mixed_input = [
    mixed_input_df.iloc[start:start + chunk_size]
    for start in range(0, len(mixed_input_df), chunk_size)
]

   response 80000_comment_ratio 11_north africa_ratio 11_sudan_ratio  \
0  svalbard                   0                    20             31   
1  scotland                  40                    30             46   
2   finland                  14                    32             33   
3   swedish                  14                    11             33   
4   denmark                  29                    32             33   

  11_sudanese_ratio 11_algeria_ratio 11_algerian_ratio  \
0                25               40                38   
1                38               27                38   
2                27               29                40   
3                40               29                27   
4                27               29                27   

  11_western sahara_ratio 11_sahrawi_ratio 11_tunisia_ratio  ...  \
0                      36               40               27  ...   
1                      27               27               27  ...   
2             

In [22]:
# Convert the single-phrase synthetic inputs one chunk at a time.
# A failing chunk is reported and remembered instead of being silently dropped:
# if rows went missing here, extra_input_df would no longer line up row-for-row
# with extra_output_df when both are concatenated into the training set.
processed_chunks_extra_input = []
failed_chunks_extra_input = []
n_chunks = len(chunks_extra_input)
for idx, chunk in enumerate(chunks_extra_input):
    try:
        # keep the try body down to the one call that is expected to fail
        processed_chunk = preprocess.convert_input(chunk)
    except Exception as e:
        # same 1-based numbering as the success message
        print(f"Error in chunk {idx+1:02}/{n_chunks} for extra_input: {e}")
        failed_chunks_extra_input.append(idx + 1)
    else:
        processed_chunks_extra_input.append(processed_chunk)
        print(f"Chunk {idx+1:02}/{n_chunks} processed successfully for extra_input!", end = '\r')

# Stop before building a frame that is shorter than its matching output frame.
if failed_chunks_extra_input:
    raise RuntimeError(
        f"convert_input failed for extra_input chunk(s) {failed_chunks_extra_input}; "
        "extra_input_df would be misaligned with extra_output_df"
    )

extra_input_df = pd.concat(processed_chunks_extra_input, ignore_index=True)

Chunk 0 processed successfully for extra_input!
Chunk 1 processed successfully for extra_input!
Chunk 2 processed successfully for extra_input!
Chunk 3 processed successfully for extra_input!
Chunk 4 processed successfully for extra_input!
Chunk 5 processed successfully for extra_input!
Chunk 6 processed successfully for extra_input!
Chunk 7 processed successfully for extra_input!
Chunk 8 processed successfully for extra_input!
Chunk 9 processed successfully for extra_input!
Chunk 10 processed successfully for extra_input!
Chunk 11 processed successfully for extra_input!
Chunk 12 processed successfully for extra_input!
Chunk 13 processed successfully for extra_input!
Chunk 14 processed successfully for extra_input!
Chunk 15 processed successfully for extra_input!
Chunk 16 processed successfully for extra_input!
Chunk 17 processed successfully for extra_input!
Chunk 18 processed successfully for extra_input!
Chunk 19 processed successfully for extra_input!
Chunk 20 processed successfull

In [26]:
# Convert the multi-phrase synthetic inputs one chunk at a time.
# A failing chunk is reported and remembered instead of being silently dropped:
# if rows went missing here, mixed_input_df would no longer line up row-for-row
# with mixed_output_df when both are concatenated into the training set.
processed_chunks_mixed_input = []
failed_chunks_mixed_input = []
n_chunks = len(chunks_mixed_input)
for idx, chunk in enumerate(chunks_mixed_input):
    try:
        # keep the try body down to the one call that is expected to fail
        processed_chunk = preprocess.convert_input(chunk)
    except Exception as e:
        # same 1-based numbering as the success message
        print(f"Error in chunk {idx+1:02}/{n_chunks} for mixed_input: {e}")
        failed_chunks_mixed_input.append(idx + 1)
    else:
        processed_chunks_mixed_input.append(processed_chunk)
        print(f"Chunk {idx+1:02}/{n_chunks} processed successfully for mixed_input!", end='\r')

# Stop before building a frame that is shorter than its matching output frame.
if failed_chunks_mixed_input:
    raise RuntimeError(
        f"convert_input failed for mixed_input chunk(s) {failed_chunks_mixed_input}; "
        "mixed_input_df would be misaligned with mixed_output_df"
    )

mixed_input_df = pd.concat(processed_chunks_mixed_input, ignore_index=True)

Chunk 0 processed successfully for mixed_input!
Chunk 1 processed successfully for mixed_input!
Chunk 2 processed successfully for mixed_input!
Chunk 3 processed successfully for mixed_input!
Chunk 4 processed successfully for mixed_input!
Chunk 5 processed successfully for mixed_input!
Chunk 6 processed successfully for mixed_input!
Chunk 7 processed successfully for mixed_input!
Chunk 8 processed successfully for mixed_input!
Chunk 9 processed successfully for mixed_input!
Chunk 10 processed successfully for mixed_input!
Chunk 11 processed successfully for mixed_input!
Chunk 12 processed successfully for mixed_input!
Chunk 13 processed successfully for mixed_input!
Chunk 14 processed successfully for mixed_input!
Chunk 15 processed successfully for mixed_input!
Chunk 16 processed successfully for mixed_input!
Chunk 17 processed successfully for mixed_input!
Chunk 18 processed successfully for mixed_input!
Chunk 19 processed successfully for mixed_input!
Chunk 20 processed successfull

In [27]:
# Concatenate data
# Stack the single-phrase and multi-phrase synthetic frames (in that order, for
# inputs and outputs alike), drop the free-text 'response' column and cast the
# remaining columns to int.
# NOTE: the hand-coded frames (input_df / output_df) are currently not part of
# either stack.
final_input_df = (
    pd.concat([extra_input_df, mixed_input_df], ignore_index=True)
    .drop(columns='response')
    .astype(int)
)
final_output_df = (
    pd.concat([extra_output_df, mixed_output_df], ignore_index=True)
    .drop(columns='response')
    .astype(int)
)

In [28]:
final_input_df.shape

(460000, 25)

In [29]:
final_output_df.shape

(460000, 26)

## Train Model

To deal with the fact that we have many possible categorical outputs, we use a simple random forest model, which handles multiple outputs better than other models.

* After training, save model. 

In [91]:
reload(model)

<module 'src.model' from 'C:\\Users\\lfredric\\Work\\dev\\github_repos\\bcgov\\demo-nlp\\notebooks\\..\\src\\model.py'>

In [31]:
n_estimators = 100
clf = model.create_model(final_input_df, final_output_df, n_estimators = n_estimators)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  3.8min


In [56]:
model.save_model(CRED_PATH, MODEL_BASE_PATH, clf, code_df_long)

Model already exists in directory //decimal.idir.bcgov/BCDemoSurveyCoding/Model/Q22. Overwrite? (y)/n:  y


In [None]:
clf, code_df_long = model.load_model(CRED_PATH, MODEL_BASE_PATH)

## Produce Model Results

* Look at some results by hand to see if they make sense
* Create results for the entire hand-coded dataset
* Send results back to the database

In [103]:
sentence = 'chinese, english'
model.list_classes(sentence, code_df_long, clf, truncate_inputs=True, spellcheck=True)


TOP MATCHES FOR: chinese, english
CORRECTED TO:    chinese, english

97.00% (32)
East Asia

21.00% (42)
Northern Europe

20.00% (34)
South Asia

17.00% (44)
Western Europe

14.00% (41)
Eastern Europe

14.00% (35)
West Asia

12.00% (31)
Central Asia

12.00% (33)
Southeast Asia

7.00% (43)
Southern Europe

5.00% (54)
Polynesia



[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [65]:
df_open.head()

Unnamed: 0,id,q22ances1,aq22ances1,q22ances_c01,q22ances_c02,q22ances_c03,q22ances_c04,q22ances_c05,q22ances2,aq22ances2,...,q22ances_c26,q22ances_c27,q22ances_c28,cycle,coding_comment,aq22ances1_cleaned,aq22ances2_cleaned,aq22ances3_cleaned,aq22ances4_cleaned,aq22ances5_cleaned
0,39521,,,,,,,,,,...,,,,2,,,,,,
1,39522,,,,,,,,,,...,,,,2,,,,,,
2,39525,,,,,,,,,,...,,,,2,,,,,,
3,39527,,,,,,,,,,...,,,,2,,,,,,
4,39529,,,,,,,,,,...,,,,2,,,,,,


In [64]:
df.head()

Unnamed: 0,id,cycle,q22ances,aq22ances,q22ances_c01,q22ances_c02,q22ances_c03,q22ances_c04,q22ances_c05,origin,aq22_cleaned
0,4,1,97,74% North Western European,42,,,,,4,74% north western european
1,7,1,97,Germany,44,,,,,4,germany
2,11,1,24µ97,brazil,24,,,,,2,brazil
3,13,1,97,"Scottland, England & Germany",42,44.0,,,,4,"scotland, england & germany"
4,15,1,97,Kenya,14,,,,,1,kenya


In [66]:
input_df.head()

Unnamed: 0,response,10,11,12,13,14,15,20,21,22,...,35,41,42,43,44,51,52,53,54,80
0,74% north western european,36.5,62.0,58.25,46.5,56.25,48.5,31.25,54.5,50.0,...,57.0,68.0,74.0,69.25,78.5,43.25,40.0,49.25,49.25,27.75
1,germany,44.25,69.5,69.5,51.75,53.5,44.5,23.75,62.0,53.75,...,71.0,62.0,69.0,57.0,100.0,43.5,46.5,56.25,53.5,29.0
2,brazil,24.0,48.25,51.25,50.0,44.5,59.25,34.75,44.5,52.25,...,64.5,49.0,40.0,57.0,34.5,50.0,33.0,52.5,38.75,0.0
3,"scotland, england & germany",39.25,52.25,59.75,62.0,62.75,49.75,30.0,58.75,58.5,...,68.0,60.0,100.0,58.25,89.25,58.75,53.5,47.5,58.0,26.25
4,kenya,34.75,45.5,61.75,45.0,100.0,47.0,28.75,51.5,61.75,...,62.5,42.25,55.0,49.5,48.5,47.5,42.25,40.0,55.0,35.75


In [95]:
reload(model)

<module 'src.model' from 'C:\\Users\\lfredric\\Work\\dev\\github_repos\\bcgov\\demo-nlp\\notebooks\\..\\src\\model.py'>

In [96]:
# Run the trained classifier over the cycle-1 frames (df / input_df / output_df)
# and assemble the results table.
# THRESHOLD is the cut-off for accepting a category; TENTATIVE_LOWER/UPPER bound
# the range flagged as a possible category (see the PARAMETERS cell).
# DELIMITER is the separator for concatenated results.
results_df = model.produce_results(
    df, input_df, output_df, 
    clf,
    OUTPUT_COLUMNS,
    N_COLUMNS,
    question = 'Q22',
    threshold=THRESHOLD,
    tentative_lower = TENTATIVE_LOWER,
    tentative_upper = TENTATIVE_UPPER,
    delimiter = DELIMITER
)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.6s


10858/10858  |---------------------------------------------------------------------------------------------------->|

In [97]:
results_df.head()

Unnamed: 0,id,cycle,q22ances,aq22ances,origin,aq22_cleaned,q22ances_c01,q22ances_c02,q22ances_c03,q22ances_c04,q22ances_c05,match,original_matched,extra_categories,n_original_categories,n_model_categories,tentative_categories,outside_continent,inside_continent
0,4,1,97,74% North Western European,4,74% north western european,42,44.0,,,,0,1.0,1,1,2,42.0,0,1
1,7,1,97,Germany,4,germany,44,,,,,1,1.0,0,1,1,,0,1
2,11,1,24µ97,brazil,2,brazil,24,,,,,1,1.0,0,1,1,,0,1
3,13,1,97,"Scottland, England & Germany",4,"scotland, england & germany",42,44.0,,,,1,1.0,0,2,2,,0,1
4,15,1,97,Kenya,1,kenya,14,,,,,1,1.0,0,1,1,,0,1


In [None]:
# save back to database
# a separate connection is created here with sqlalchemy=True (the read cell
# used the default connection type)
engine = connect.create_connection(CRED_PATH, sqlalchemy=True)

# for initial save of cycle 1, always replace. any subsequent inputs should be appended
# NOTE(review): how='replace' presumably overwrites whatever is already in
# RESULTS_TABLE -- confirm before re-running against a populated table.
connect.save_table(results_df, RESULTS_TABLE, engine, how='replace')