# Feedback tagging - modelling

In [None]:
from IPython.core.interactiveshell import InteractiveShell
from functools import partial
from gensim.sklearn_api import D2VTransformer, W2VTransformer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from src import calculate_metrics, tagging_preprocessing
from src.make_feedback_tagging.calculate_performance_metrics import FUNCS_METRICS_PRED
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import re
import xgboost as xgb

# Enable multiple cell outputs: with "all", every top-level expression statement in a cell is displayed, rather than
# only the last one. Cells below rely on this to show several results at once (e.g. a shape followed by `.head()`)
InteractiveShell.ast_node_interactivity = "all"

## Define variables

Define the environment variables and constants used throughout this notebook.

In [None]:
# Extract environment variables; fail fast with a clear message if the raw data folder has not been configured,
# rather than letting `os.path.join` raise an opaque `TypeError` on a `None` path below
DIR_DATA_RAW = os.getenv("DIR_DATA_RAW")
if DIR_DATA_RAW is None:
    raise EnvironmentError("The 'DIR_DATA_RAW' environment variable is not set; point it at the raw data folder.")

# Define the raw data file path
FILE_RAW = os.path.join(DIR_DATA_RAW, "20200521 Coronavirus feedback analysis - Tagging sheet.csv")

# Define the columns containing the free text for analysis
COLS_FREE_TEXT = ["q3", "q8"]

# Define the number of processors in this machine
COUNT_CPU = mp.cpu_count()

# Define metric functions that use prediction probabilities; ROC AUC is computed one-vs-rest as the targets are
# multi-class
FUNCS_METRICS_PROB = [partial(roc_auc_score, multi_class="ovr")]

## Import and process data

Import the CSV data, and pre-process it in preparation for model build.

In [None]:
# Read the raw tagging sheet at `FILE_RAW` into a pandas DataFrame, then show its dimensions
df_raw = pd.read_csv(filepath_or_buffer=FILE_RAW)
df_raw.shape

In [None]:
# Show how often each value occurs in the two tag columns, counting missing (NaN) tags as well
df_raw.loc[:, "This response relates to..."].value_counts(dropna=False)
df_raw.loc[:, "Coronavirus Theme"].value_counts(dropna=False)

In [None]:
# Run the project's `tagging_preprocessing` over the raw data and the free text columns, then show the shape and
# the first five rows of the result
df_process = tagging_preprocessing(df_raw, COLS_FREE_TEXT)
df_process.shape
df_process.head(n=5)

## Baseline modelling - `this_response_relates_to_`

In [None]:
def remove_null_and_single_values(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Keep only the rows of a pandas DataFrame whose value in a column is non-NULL and occurs more than once.

    :param df: A pandas DataFrame.
    :param col: A column in `df`.
    :return: A filtered version of `df`, in its original row order and with its original index labels, where rows
        with NULL entries in `col` are removed, as well as rows whose `col` value appears only once in `col`.

    """
    # Count the occurrences of each value; `value_counts` leaves NULL entries out of the tally
    occurrences = df[col].value_counts()

    # Values seen at least twice are the ones to keep
    repeated_values = occurrences[occurrences > 1].index

    # NULL entries never match `repeated_values`, so they are dropped together with the single-occurrence values
    is_repeated = df[col].isin(repeated_values)
    return df[is_repeated]

In [None]:
# Name the first tag column to be modelled
col_target = "this_response_relates_to_"

# Drop rows where `col_target` is NULL or holds a value seen only once, then cast `col_target` to a category
df_model = (
    remove_null_and_single_values(df_process, col_target)
    .assign(**{col_target: lambda df: df[col_target].astype("category")})
)

# Print each tag alongside its integer code
for ix, tag in enumerate(df_model[col_target].cat.categories):
    print(f"{tag}: {ix}")

# Hold out 30% of the data as a test set, stratifying on the tag so both sets share the same tag proportions; the
# features are the lemmatised text, and the targets are the integer category codes
lemma_X_train, lemma_X_test, lemma_y_train, lemma_y_test = train_test_split(
    df_model["lemma"].values, df_model[col_target].cat.codes.values,
    test_size=0.3, random_state=42, stratify=df_model[col_target]
)

### Multinomial Naive Bayes

In [None]:
# Chain TF-IDF vectorisation and a multinomial naive Bayes classifier into a single pipeline
mnb_steps = [("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())]
mnb_pipeline = Pipeline(steps=mnb_steps)

# Train the pipeline on the lemmatised training text; the fitted pipeline is assigned to `_` to suppress its display
_ = mnb_pipeline.fit(lemma_X_train, lemma_y_train)

# Score the held-out test text, as both hard labels and per-class probabilities
mnb_y_pred_prob = mnb_pipeline.predict_proba(lemma_X_test)
mnb_y_pred = mnb_pipeline.predict(lemma_X_test)

# Compute the label-based and probability-based metrics
mnb_metrics = calculate_metrics(
    lemma_y_test, mnb_y_pred, mnb_y_pred_prob, FUNCS_METRICS_PRED, FUNCS_METRICS_PROB
)

### XGBoost

In [None]:
# Instantiate a TfidfVectorizer object
xgb_tfidf = TfidfVectorizer()

# Transform both the training and test sets; the vocabulary and IDF weights are learnt from the training set only.
# `transform` takes only the documents: a second positional argument is not a target, and is either interpreted as
# the deprecated `copy` flag or rejected with a `TypeError`, depending on the scikit-learn version
xgb_X_train = xgb_tfidf.fit_transform(lemma_X_train, lemma_y_train)
xgb_X_test = xgb_tfidf.transform(lemma_X_test)

# Instantiate an XGBClassifier object; `multi:softprob` is used rather than `multi:softmax` because class
# probabilities are requested via `predict_proba` below, which some XGBoost releases refuse for `multi:softmax`
xgb_clf = xgb.XGBClassifier(objective="multi:softprob", seed=42)

# Fit `xgb_clf` using early stopping
# NOTE(review): the test set is the last entry of `eval_set`, so it decides when training stops; metrics computed on
# that same test set below are therefore likely to be optimistic - consider a separate validation split
_ = xgb_clf.fit(xgb_X_train, lemma_y_train,
                eval_set=[(xgb_X_train, lemma_y_train), (xgb_X_test, lemma_y_test)],
                eval_metric="mlogloss", early_stopping_rounds=10)

# Predict the model
xgb_y_pred = xgb_clf.predict(xgb_X_test)
xgb_y_pred_prob = xgb_clf.predict_proba(xgb_X_test)

# Calculate the metrics
xgb_metrics = calculate_metrics(lemma_y_test, xgb_y_pred, xgb_y_pred_prob, FUNCS_METRICS_PRED, FUNCS_METRICS_PROB)

### XGBoost with Grid Search

Warning - can take a long time!

In [None]:
# Instantiate an XGBClassifier object; `multi:softprob` is used rather than `multi:softmax` because class
# probabilities are requested via `predict_proba` below, which some XGBoost releases refuse for `multi:softmax`
xgb_gs_clf = xgb.XGBClassifier(objective="multi:softprob", seed=42)

# Define a parameter set for the grid search
xgb_grid_params = {"learning_rate": [0.01, 0.1, 0.5, 0.9],
                   "n_estimators": [200],
                   "subsample": [0.3, 0.5, 0.9]}

# Build a grid search; candidates are ranked by the Matthews correlation coefficient over 5-fold cross-validation
xgb_grid = GridSearchCV(estimator=xgb_gs_clf, param_grid=xgb_grid_params, cv=5, n_jobs=-1,
                        scoring=make_scorer(matthews_corrcoef), verbose=2)

# Fit the model; the extra keyword arguments are forwarded to every `XGBClassifier.fit` call made by the search
# NOTE(review): the test set is the last entry of `eval_set`, so it decides when each fit stops; metrics computed on
# that same test set below are therefore likely to be optimistic - consider a separate validation split
_ = xgb_grid.fit(xgb_X_train, lemma_y_train,
                 eval_set=[(xgb_X_train, lemma_y_train), (xgb_X_test, lemma_y_test)],
                 eval_metric="mlogloss", early_stopping_rounds=10)

# Predict the model
xgb_gs_y_pred = xgb_grid.predict(xgb_X_test)
xgb_gs_y_pred_prob = xgb_grid.predict_proba(xgb_X_test)

# Calculate the metrics
xgb_gs_metrics = calculate_metrics(lemma_y_test, xgb_gs_y_pred, xgb_gs_y_pred_prob, FUNCS_METRICS_PRED,
                                   FUNCS_METRICS_PROB)

### word2vec

In [None]:
# Build the feature set by tokenising every piece of feedback. The feedback used here is the `clean_text` column:
# text stripped of personally identifiable information (PII), in lowercase, with all free text columns
# `COLS_FREE_TEXT` compiled into a single string, and all stopwords and certain symbols removed. Symbols removed
# are (, ), [, ], +, and *
clean_text_tokens = df_model["clean_text"].map(word_tokenize)
clean_X = clean_text_tokens.values

# Hold out 30% of the data as a test set, stratifying on the tag; the targets are the integer category codes
clean_X_train, clean_X_test, clean_y_train, clean_y_test = train_test_split(
    clean_X, df_model[col_target].cat.codes.values,
    test_size=0.3, random_state=42, stratify=df_model[col_target]
)

#### doc2vec

In [None]:
# Build a model pipeline: doc2vec document embeddings fed into a logistic regression classifier
d2v_pipeline = Pipeline([("d2v", D2VTransformer(dm=0, size=300, seed=42, workers=COUNT_CPU)),
                         ("clf", LogisticRegression(max_iter=200, n_jobs=-1, random_state=42))])

# Define the parameter sets for the grid search. The classifier above does not set a solver, and scikit-learn's
# default `lbfgs` solver (0.22+) only accepts an l2 penalty or none, so every l1 candidate would fail to fit and be
# scored as NaN. The l1 candidates are therefore paired with the `saga` solver, which supports l1; the l2 candidates
# keep the classifier's default solver
d2v_grid_common = {"d2v__size": [100, 200, 300],
                   "clf__C": np.logspace(-5, 5, 20)}
d2v_grid_params = [{**d2v_grid_common, "clf__penalty": ["l2"]},
                   {**d2v_grid_common, "clf__penalty": ["l1"], "clf__solver": ["saga"]}]

# Build a grid search; candidates are ranked by the Matthews correlation coefficient over 5-fold cross-validation
d2v_grid = GridSearchCV(estimator=d2v_pipeline, param_grid=d2v_grid_params, cv=5, n_jobs=-1,
                        scoring=make_scorer(matthews_corrcoef), verbose=2)

# Fit the grid search
_ = d2v_grid.fit(clean_X_train, clean_y_train)

# Predict the model
d2v_gs_y_pred = d2v_grid.predict(clean_X_test)
d2v_gs_y_pred_prob = d2v_grid.predict_proba(clean_X_test)

# Calculate the metrics
d2v_gs_metrics = calculate_metrics(clean_y_test, d2v_gs_y_pred, d2v_gs_y_pred_prob, FUNCS_METRICS_PRED,
                                   FUNCS_METRICS_PROB)