Copyright 2025 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.

### Data Cleaning 

Create a column that will work with the Azure ML workspace.

This was later extended to also create a JSON file that works with the Language Studio service, and to produce the individual text files that the service requires.


In [None]:
# system stuff
import sys
import os
import json

import pandas as pd

# my stuff (abstracted non-important functions)
# Get the project root (one level up from notebooks)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from src.config import data_path_rvm
from src.prepare_data import create_train_test_dataframes

In [None]:
# create a single column to use as target for Azure
def create_target(row):
    """Return the target string for one row of the response table.

    Every entry after the first one in ``row`` is treated as an option.
    The labels of the options whose value is 'X' are returned as a
    bracketed, comma-separated list of double-quoted names, for example
    '["A","C"]'. When no option is marked, '["Other"]' is returned.
    """
    choices = row[1:]
    # labels of the options that are marked
    picked = choices[choices == 'X'].index
    if picked.empty:
        return '["Other"]'
    return '["' + '","'.join(picked) + '"]'

In [None]:
# read the 'Q07a' sheet and keep the columns from position 4 up to (but not
# including) the last one; the first kept column is renamed to 'Response'
df = pd.read_excel(data_path_rvm, sheet_name='Q07a').iloc[:, 4:-1]
df.columns = ['Response', *df.columns[1:]]
df.head()

In [None]:
# run create_target over every row (axis=1) and store the result in a new
# 'target' column, then preview the frame
df['target'] = df.apply(create_target, axis=1)
df.head()

In [None]:
# remove multi-line characters and NA responses
# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered slice of the original (which pandas reports as a
# chained-assignment / SettingWithCopy warning)
df = df[df['Response'].notna()].copy()
# flatten each response onto a single line
df['Response'] = df['Response'].apply(lambda x: x.replace('\n', ' '))
df.head()

In [None]:
# split so that it only trains on a subset
# NOTE(review): create_train_test_dataframes comes from src.prepare_data;
# presumably it returns (n_train-row subset, remaining rows) - confirm there
n_train = 1_000
df_train, df_test = create_train_test_dataframes(df, n_train=n_train)

In [None]:
# pull another 1_000 for validation
# the same helper is applied to the previous df_test; df_test is rebound to
# the second value it returns
df_validate, df_test = create_train_test_dataframes(df_test, n_train=n_train)

In [None]:
# save these for upload to the workspace,
# keeping only the response text and the target column
# Build the output names from the path without its extension. A plain
# str.replace('.xlsx', ...) leaves the path unchanged when the extension is
# different (e.g. '.xls' or '.XLSX'), and to_csv would then overwrite the
# source workbook.
csv_base = os.path.splitext(data_path_rvm)[0]
for split_name, split_df in (
    ('train', df_train),
    ('validate', df_validate),
    ('test', df_test),
):
    split_df[['Response', 'target']].to_csv(f'{csv_base}_{split_name}.csv', index=False)

In [None]:
def clean_class(x):
    """Return a class name cleaned up for the language service.

    Every ':' and '/' is removed from ``x`` and the result is cut to its
    first 50 characters.
    """
    cleaned = x
    for banned in (':', '/'):
        cleaned = cleaned.replace(banned, '')
    return cleaned[:50]

In [None]:
# save files for ai language service, as well as a json file that has all of the correct categories
def get_classes_dict(row, outpath):
    """Write one response to a text file and return its label entry.

    The file is named 'text-NNNN.txt' from the row's index label (``row.name``)
    and is written into ``outpath`` as UTF-8. The entries between the first
    and the last one in ``row`` are treated as options; the labels of those
    marked 'X' become the document's classes after passing through
    clean_class. A row with no marked option gets the single class 'None'.

    Returns a dict with the keys 'location', 'language' and 'classes', ready
    to be appended to the documents list of the labels JSON.
    """
    doc_name = f'text-{row.name:04d}.txt'
    text = row['Response']

    # options sit between the response (first) and the last entry
    candidates = row[1:-1]
    selected = list(candidates[candidates == 'X'].index) or ['None']

    entry = {
        'location': doc_name,
        'language': 'en-us',
        'classes': [{'category': clean_class(label)} for label in selected],
    }

    # one text file per response
    with open(os.path.join(outpath, doc_name), 'w', encoding="utf-8") as handle:
        handle.write(text)

    return entry
    
    

In [None]:
# output folders for the language service files: 'language-ai' next to the
# source data file, with a 'test' sub-folder inside it
outpath = os.path.join(os.path.dirname(data_path_rvm), 'language-ai')
outpath_test = os.path.join(outpath, 'test')
# create both folders up front (outpath_test is nested inside outpath) so the
# text and JSON files written into them do not fail on a missing directory
os.makedirs(outpath_test, exist_ok=True)

In [None]:
# template for the language service labels file: project metadata plus the
# full list of classes; 'documents' starts empty and is filled in per split
labels_dict = {
    "projectFileVersion": "2022-05-01", # Don't change this date
    "stringIndexType": "Utf16CodeUnit",
    "metadata": {
        "projectKind": "CustomMultiLabelClassification",
        "storageInputContainerName": "language-demo-lf", # match to your container!
        "projectName": "language-demo-lf", # match to your project!
        # choose which matches your data; must be a real boolean so that
        # json.dump writes false/true rather than the string "false"
        "multilingual": False,
        "description": "Testing the RBCM data using the AI language service",
        "language": "en",
        "settings": {}
    },
    'assets':{
        "projectKind": "CustomMultiLabelClassification",
        # every option column (between 'Response' and the last column) plus the
        # 'None' class that get_classes_dict assigns when nothing is marked
        'classes':[{'category': clean_class(x)} for x in list(df_train.columns[1:-1]) + ['None']],
        'documents': []
    }
}

In [None]:
# display the class list for a visual check
labels_dict['assets']['classes']

In [None]:
# save all train files
# Give this labels file its own 'assets' dict with a fresh 'documents' list.
# labels_dict.copy() is shallow: the nested list would be shared with
# labels_dict, so documents appended here would leak into every other labels
# file built from the same template.
labels_train = {**labels_dict, 'assets': {**labels_dict['assets'], 'documents': []}}
for _idx, row in df_train.iterrows():
    # writes the row's text file into outpath and returns its label entry
    labels_train['assets']['documents'].append(get_classes_dict(row, outpath))

with open(os.path.join(outpath, 'rbcmLabels_v5.json'), 'w', encoding='utf-8') as json_file:
    json.dump(labels_train, json_file, indent=4)

In [None]:
# save all test files (the rows of df_validate are written to the 'test' folder)
# Start from a fresh 'documents' list instead of labels_dict.copy(): the
# shallow copy shares the nested list with labels_dict, so this file would
# also contain every document already appended through another copy.
labels_test = {**labels_dict, 'assets': {**labels_dict['assets'], 'documents': []}}
for _idx, row in df_validate.iterrows():
    # writes the row's text file into outpath_test and returns its label entry
    labels_test['assets']['documents'].append(get_classes_dict(row, outpath_test))

with open(os.path.join(outpath_test, 'rbcmLabels_test.json'), 'w', encoding='utf-8') as json_file:
    json.dump(labels_test, json_file, indent=4)