Copyright 2023 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at 

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

## 1. Install packages and load dataset

In [5]:
#!pip install nltk
#!pip install numpy scipy scikit-learn

In [7]:
import pyodbc
import pandas as pd

AttributeError: module 'numpy' has no attribute 'ndarray'

In [5]:
# Location of the connection-string file.
# Create credentials.txt at the top of the repo directory.
cred_path = '../../credentials.txt'

# Join all lines of the file into one string, dropping the newline characters,
# so a connection string split over several lines is read back as a single value.
with open(cred_path) as infile:
    connection_str = ''.join(line.rstrip('\n') for line in infile)

In [6]:
# Open the ODBC connection using the connection string read from the credentials file.
connection = pyodbc.connect(connection_str)

In [9]:
# Read data tables: every Cycle 1 row of dbo.AQ32RACE, ordered by ID.
df = pd.read_sql(sql='SELECT * FROM dbo.AQ32RACE WHERE Cycle=1 ORDER BY ID', con=connection)
df

  df = pd.read_sql(


Unnamed: 0,ID,Q32RACE,AQ32RACE,AQ32RACE_Cleaned,Coding_Comment,Q32RACE_C01,Q32RACE_C02,Q32RACE_C03,Q32RACE_C04,Q32RACE_C05,...,Q32RACE_C08,Q32RACE_C09,Q32RACE_C10,Q32RACE_C11,Q32RACE_C12,Q32RACE_C13,Q32RACE_C14,Q32RACE_C15,Q32RACE_C16,Cycle
0,0000004,97,Canadian,,,20001,,,,,...,,,,,,,,,,1
1,0000007,97,caucassion,,,105,,,,,...,,,,,,,,,,1
2,0000013,97,White,,,105,,,,,...,,,,,,,,,,1
3,0000014,97,other canadian born Chinese,,,1041,,,,,...,,,,,,,,,,1
4,0000047,97,Caucasian,,,105,,,,,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6626,0048496,97,North American,,,97,,,,,...,,,,,,,,,,1
6627,0048514,97,Canadian,,,97,,,,,...,,,,,,,,,,1
6628,0048515,97,Canadian,,,97,,,,,...,,,,,,,,,,1
6629,0051193,97,Canadian,,,97,,,,,...,,,,,,,,,,1


In [10]:
# Read the code lookup table (one row per Q_CODE with its description and notes).
df_code = pd.read_sql(sql='SELECT * FROM dbo.AQ32RACE_Codes', con=connection)
df_code

  df_code = pd.read_sql(


Unnamed: 0,ID,Q,Q_LONG,Q_CODE,QC_DESC,ORDERCLASS,QC_DESC_Notes,Additional Notes
0,21,Q32RACE,AQ32RACE,10000,Indeterminate,,,
1,1,Q32RACE,AQ32RACE,101,African or Caribbean,,"Black, Caribbean, African, Afro-American, Afro...",Trinidad
2,2,Q32RACE,AQ32RACE,102,Arab,,"Emirati, Lebanese, North African, Palestinian,...","Amazigh, Berber, Copic Egyptian, Kabyle"
3,3,Q32RACE,AQ32RACE,103,Central Asian,,"Kazakhstani, Kyrgyzstani, Tajikistani, Uzbekis...",
4,5,Q32RACE,AQ32RACE,1041,Chinese,,,
5,6,Q32RACE,AQ32RACE,1042,Japanese,,,
6,7,Q32RACE,AQ32RACE,1043,Korean,,,
7,8,Q32RACE,AQ32RACE,1044,Another East Asian identity,,"Hong Konger, Mongolian, Taiwanese, Tibetan",
8,9,Q32RACE,AQ32RACE,105,European,,"White, Anglo-Saxon, Balkan, French, German, It...","Caucasian, white-passing, Anglo-Saxon, UK, New..."
9,10,Q32RACE,AQ32RACE,106,First Nations,,,


## 2. Preprocess the text

In [12]:
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used by clean_text below: the 'stopwords' corpus
# (read via stopwords.words) and the 'punkt' tokenizer models
# (presumably what nltk.word_tokenize relies on -- confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

AttributeError: module 'numpy.linalg._umath_linalg' has no attribute '_ilp64'

In [None]:
def _english_stop_words():
    """Return the NLTK English stop-word set, loading it once and reusing it afterwards."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def clean_text(text):
    """Normalize a free-text response for keyword matching and vectorization.

    Steps: replace punctuation with spaces, lower-case, remove digits,
    tokenize, drop English stop words, and re-join the tokens with single spaces.

    Parameters
    ----------
    text : str or None
        Raw response. Anything that is not a string (None, NaN from a NULL
        database value, ...) is treated as an empty response.

    Returns
    -------
    str
        The cleaned text; an empty string when there is nothing left.
    """
    # NULL responses arrive as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Replace punctuation with a space instead of deleting it, so that
    # "Chinese/Canadian" stays two words rather than becoming "chinesecanadian".
    # Extra spaces are collapsed by the tokenize/join step below.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove stop words (the set is cached instead of rebuilt for every row)
    stop_words = _english_stop_words()
    words = nltk.word_tokenize(text)  # split string into words (tokens)
    words = [word for word in words if word not in stop_words]

    # Join the remaining words back into a single string
    return ' '.join(words)

In [None]:
# Run clean_text on every raw 'AQ32RACE' response and store the result in
# 'AQ32RACE_Cleaned' (this overwrites the column of that name loaded from the database).
df['AQ32RACE_Cleaned'] = df['AQ32RACE'].apply(clean_text)

## 3. Modeling

Methodology
* Prepare a list of keywords from 'QC_DESC' and 'QC_DESC_Notes'. Preprocess them to a common case (lower or upper) and remove punctuation and extra spaces.
* Preprocess the 'AQ32RACE' column in the same way.
* For each row in 'AQ32RACE', search for each keyword in the text. If a match is found, look up the corresponding code and put it into the next available column (Q32RACE_CXX).

In [None]:
# Store the codes as strings; the matching step below copies these values
# into the Q32RACE_Cxx columns of df.
df_code['Q_CODE'] = df_code['Q_CODE'].astype(str)

In [None]:
# Build the keyword list: every non-null 'QC_DESC' and 'QC_DESC_Notes' value,
# lower-cased and de-duplicated.
keywords = (
    pd.concat([df_code['QC_DESC'], df_code['QC_DESC_Notes']])
    .dropna()
    .str.lower()
    .unique()
)

In [None]:
keywords

In [None]:
# Assign codes by keyword: for each cleaned response, every keyword that occurs
# as a whole word contributes its code to the first empty Q32RACE_Cxx column.

# Columns that can hold a code, in fill order (Q32RACE_C01 .. Q32RACE_C16).
code_columns = [f'Q32RACE_C{n:02}' for n in range(1, 17)]

# Resolve each keyword to its code and compile its pattern once, up front,
# instead of repeating the df_code lookup for every row x keyword pair.
# re.escape keeps regex metacharacters in a keyword (e.g. parentheses) literal.
# NOTE(review): keywords are only lower-cased, while the response text also had
# punctuation and stop words removed, so multi-word or comma-separated keywords
# (such as QC_DESC_Notes entries) may never match -- confirm intended behaviour.
desc_lower = df_code['QC_DESC'].str.lower()
notes_lower = df_code['QC_DESC_Notes'].str.lower()
keyword_matchers = []
for keyword in keywords:
    codes = df_code.loc[(desc_lower == keyword) | (notes_lower == keyword), 'Q_CODE'].values
    if len(codes) > 0:  # keyword maps to at least one code; use the first
        keyword_matchers.append((re.compile(r'\b' + re.escape(keyword) + r'\b'), codes[0]))

# Iterate by index label (not by position) so df.loc addresses the right row
# even if df does not carry a default 0..n-1 index.
for row_label, text in df['AQ32RACE_Cleaned'].items():
    if not isinstance(text, str) or not text:
        continue  # nothing to search in a missing or empty response
    for pattern, code in keyword_matchers:
        if not pattern.search(text):  # keyword must appear as a whole word
            continue
        existing = df.loc[row_label, code_columns]
        # Skip codes the row already holds (pre-coded, or added by an earlier
        # keyword mapping to the same code) so no code is stored twice.
        if any(not pd.isnull(value) and str(value) == str(code) for value in existing):
            continue
        # Store the code in the first empty column, starting from Q32RACE_C01.
        # If all 16 columns are taken the code is dropped, as before.
        for column, value in existing.items():
            if pd.isnull(value):
                df.loc[row_label, column] = code
                break

In [None]:
# Lift pandas' row-display limit so all 1000 requested rows are rendered.
# This is a global option and stays in effect for the cells that follow.
pd.set_option('display.max_rows', None)
df.head(1000)

### K-means clustering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Clustering parameters. The seed is fixed because K-means starts from random
# centroids: without it the cluster ids change on every run, and inspecting a
# specific cluster number afterwards would not be reproducible.
N_CLUSTERS = 50
RANDOM_SEED = 42

# TF-IDF feature matrix built from the cleaned responses
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(df['AQ32RACE_Cleaned'])

# K-means clustering
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED)
kmeans.fit(tfidf)

# Get cluster assignments for each data point (one label per row of df)
df['cluster'] = kmeans.labels_


In [None]:
# Inspect the responses that were assigned to cluster 21.
df[df['cluster'] == 21] # k-means assigns each answer to exactly one cluster, so it does not account for answers that belong to multiple categories.