## 1. Install packages and load dataset

In [None]:
#!pip install nltk
#!pip install numpy scipy scikit-learn

In [None]:
import pyodbc
import pandas as pd

In [None]:
# Location of the credentials file.
# Create credentials.txt at the top level of the repo directory.
cred_path = '../credentials.txt'

# The connection string is the concatenation of every line in the file,
# with newline characters stripped from both ends of each line.
with open(cred_path) as infile:
    connection_str = ''.join(raw_line.strip('\n') for raw_line in infile)

In [None]:
# Open the ODBC connection using the string assembled from the credentials file
connection = pyodbc.connect(connection_str)

In [None]:
# Read data tables: responses restricted to Cycle=1, ordered by ID
responses_query = 'SELECT * FROM dbo.AQ32RACE_TEST WHERE Cycle=1 ORDER BY ID'
df = pd.read_sql(responses_query, connection)
df

In [None]:
# Read the code table (every column of dbo.AQ32RACE_Codes)
codes_query = 'SELECT * FROM dbo.AQ32RACE_Codes'
df_code = pd.read_sql(codes_query, connection)
df_code

## 2. Preprocess the text

In [None]:
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
# Download the NLTK data used later in this notebook: the stop-word corpus
# (read via stopwords.words) and the 'punkt' tokenizer data.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt'
# when nltk.word_tokenize is called - confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Cache for the English stop-word set; filled on first use so the corpus file
# is read once instead of once per cleaned row.
_ENGLISH_STOP_WORDS = None


def clean_text(text):
    """Normalise a free-text answer for keyword matching.

    Steps, in order: strip punctuation, lower-case, strip digits, tokenize
    with NLTK and drop English stop words, then re-join tokens with single
    spaces.

    Parameters
    ----------
    text : str or scalar
        The raw answer. Missing values (None, NaN, pd.NA) yield an empty
        string; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    global _ENGLISH_STOP_WORDS

    # Null cells would otherwise make re.sub raise TypeError inside .apply()
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Load the stop-word list only once
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Tokenize, drop stop words, and join the remaining tokens back into a string
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in _ENGLISH_STOP_WORDS]
    return ' '.join(words)

In [None]:
# Run every raw answer through clean_text and keep the result in a new column
raw_answers = df['AQ32RACE']
df['AQ32RACE_Cleaned'] = raw_answers.apply(clean_text)

## 3. Modeling

Methodology
* Prepare a list of keywords from 'QC_DESC' and 'QC_DESC_Notes'. Preprocess them to a common case (lower or upper) and remove punctuation and extra spaces.
* Preprocess the 'AQ32RACE' column in the same way.
* For each row in 'AQ32RACE', search for each keyword in the text. If a match is found, look up the corresponding code and put it into the next available column (Q32RACE_CXX).

In [None]:
# Convert the code column to strings
df_code['Q_CODE'] = df_code['Q_CODE'].astype(str)

In [None]:
# Keyword list: the distinct, lower-cased, non-null values found in either
# description column of the code table
description_values = pd.concat([df_code['QC_DESC'], df_code['QC_DESC_Notes']])
keywords = description_values.dropna().str.lower().unique()

In [None]:
# Display the keyword array for inspection
keywords

In [None]:
# Assign codes to each answer by whole-word keyword matching.
#
# For every row of 'AQ32RACE_Cleaned', each keyword is searched as a whole word;
# on a match, the keyword's code is written to the first empty column among
# Q32RACE_C01 .. Q32RACE_C16.
#
# NOTE(review): keywords are only lower-cased, while the answers have also had
# punctuation, digits and stop words removed - keywords containing any of those
# cannot match the cleaned text. Confirm whether keywords need the same cleaning.

N_CODE_COLUMNS = 16
code_columns = [f'Q32RACE_C{j:02}' for j in range(1, N_CODE_COLUMNS + 1)]

# Resolve every keyword to (compiled pattern, code) once, instead of repeating
# the lookup for each row.
desc_lower = df_code['QC_DESC'].str.lower()
notes_lower = df_code['QC_DESC_Notes'].str.lower()
keyword_matchers = []
for keyword in keywords:
    # Skip non-string or blank keywords: a blank pattern would match every non-empty answer
    if not isinstance(keyword, str) or not keyword.strip():
        continue
    # Look up the code for the keyword (first matching row of the code table)
    codes = df_code.loc[(desc_lower == keyword) | (notes_lower == keyword), 'Q_CODE'].values
    if len(codes) == 0:
        continue
    # re.escape keeps regex metacharacters in the keyword (e.g. parentheses) literal
    pattern = re.compile(r'\b' + re.escape(keyword) + r'\b')
    keyword_matchers.append((pattern, codes[0]))

# Iterate by index label so .loc is correct even if df does not have a default RangeIndex
for row_label, text in df['AQ32RACE_Cleaned'].items():
    assigned = set()  # codes already written to this row during this pass
    for pattern, code in keyword_matchers:
        # Several keywords can map to the same code; store each code only once per row
        if code in assigned or not pattern.search(text):
            continue
        # Store the code in the first available column, starting from Q32RACE_C01
        for column in code_columns:
            if pd.isnull(df.loc[row_label, column]):
                df.loc[row_label, column] = code
                assigned.add(code)
                break

In [None]:
# Remove the row-display limit, then show up to the first 1000 rows
pd.options.display.max_rows = None
df.iloc[:1000]

### K-means clustering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Clustering configuration.
# A fixed seed keeps cluster ids stable across runs, so a cell that inspects a
# specific cluster id refers to the same cluster after a kernel restart.
N_CLUSTERS = 50
RANDOM_SEED = 42

# TF-IDF feature matrix built from the cleaned answers
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(df['AQ32RACE_Cleaned'])

# K-means clustering; n_init is set explicitly because its default differs
# between scikit-learn versions
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_SEED)
kmeans.fit(tfidf)

# Get cluster assignments for each data point
df['cluster'] = kmeans.labels_


In [None]:
# Inspect the rows assigned to cluster 21.
# k-means gives each answer a single cluster, so it does not account for
# answers that belong to multiple categories.
df.loc[df['cluster'] == 21]