# APIs

In [63]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from joblib import Memory

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize, FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import HDBSCAN

import spacy
from spacy.tokens import DocBin
import random
from spacy.util import minibatch, compounding
from spacy.training.example import Example

import preprocessing_utils

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [3]:
# Location of joblib's on-disk cache, wrapped in a Memory object with logging
# silenced (verbose=0). NOTE(review): `memory` is not referenced by any cell in
# this section - confirm it is used further down before keeping it.
cache_dir = './cache'
memory = Memory(location=cache_dir, verbose=0)

# Data Setup

Training Data

In [5]:
# Read the labelled training CSV; the bare `.shape` on the last line is what
# the cell displays as (rows, columns).
training_data = pd.read_csv('data/train.csv')
training_data.shape

(10189, 2)

Test Data

In [6]:
# Read the test CSV; the bare `.shape` on the last line is what the cell
# displays as (rows, columns).
test_data = pd.read_csv('data/test.csv')
test_data.shape

(3044, 2)

Class Labels

In [31]:
# Human-readable category names. Order matters: the name at position i is the
# one paired with integer label i when label_map is built further down.
category_name = [
    'Algebra',
    'Geometry',
    'Calculus',
    'Statistics',
    'Number_theory',
    'Combinatorics',
    'Linear_Algebra',
    'Abstract_Algebra',
]

In [32]:
# Integer class codes 0..7, one per category.
category_val = list(range(8))

Splitting Training Data into train & test sets

In [10]:
# Hold out part of the labelled data for validation. The split is stratified on
# the label column and uses a fixed random_state so it is repeatable.
text_train, text_test, y_train, y_test = train_test_split(
    np.array(training_data['Question']),
    np.array(training_data['label']),
    stratify=training_data['label'],
    random_state=0,
)

In [11]:
# Sanity check: print the shapes of the training half, then the held-out half.
print(text_train.shape, y_train.shape,'\n',text_test.shape, y_test.shape)

(7641,) (7641,) 
 (2548,) (2548,)


# Approach 1 - spaCy's TextCategorizer

Load blank model

In [58]:
# Fix the random seeds before the pipeline is created so that weight
# initialisation and the `random.shuffle` in the training loop are repeatable
# on a fresh "Restart & Run All".
spacy.util.fix_random_seed(0)

# Start from an empty English pipeline and attach a text-categorizer component.
nlp = spacy.blank("en")
textcat = nlp.add_pipe('textcat')

Add the category labels to the text categorizer

In [60]:
# Register every category name as a label on the text categorizer.
for label_name in category_name:
    textcat.add_label(label_name)

Function to convert the training data into the format spaCy expects

In [61]:
def get_data_spacy_fmt(text_array, y_array, label_names=None, label_values=None):
    """Pair each text with a one-hot ``cats`` annotation for spaCy's textcat.

    Parameters
    ----------
    text_array : iterable of str
        The question texts.
    y_array : iterable
        One class code per text, aligned with ``text_array``.
    label_names : sequence of str, optional
        Category names used as keys of the ``cats`` dict. Defaults to the
        notebook-level ``category_name``.
    label_values : sequence, optional
        Class codes, aligned position-by-position with ``label_names``.
        Defaults to the notebook-level ``category_val``.

    Returns
    -------
    list of tuple
        ``(text, {'cats': {name: 0 or 1, ...}})`` for every input pair. A code
        that matches none of ``label_values`` yields an all-zero dict.

    Raises
    ------
    ValueError
        If ``label_names`` and ``label_values`` differ in length.
    """
    names = category_name if label_names is None else label_names
    values = category_val if label_values is None else label_values
    if len(names) != len(values):
        raise ValueError(
            f'label_names has {len(names)} entries but label_values has {len(values)}'
        )

    # Build the one-hot dict by direct comparison instead of indexing a
    # binarised matrix: this works for any number of classes (a two-class
    # label_binarize result has a single column, so indexing column 1 fails).
    name_value_pairs = list(zip(names, values))
    return [
        (ques, {'cats': {name: int(label == value) for name, value in name_value_pairs}})
        for ques, label in zip(text_array, y_array)
    ]

In [62]:
# Convert the training split into (text, {'cats': {...}}) tuples.
text_train_spacy = get_data_spacy_fmt(text_train, y_train)

Convert to Example objects

In [64]:
# Tokenize each text with the blank pipeline and bundle it with its
# annotation dict into a spaCy Example.
examples = [
    Example.from_dict(nlp.make_doc(question), annotation)
    for question, annotation in text_train_spacy
]

Initialize the pipeline

In [65]:
# Initialize the pipeline from the training examples (passed as a zero-argument
# callable). The call's return value is displayed as the cell output.
nlp.initialize(lambda: examples)

<thinc.optimizers.Optimizer at 0x7fcc2a997600>

## Training Model

In [66]:
# Number of full passes over the training examples.
n_epochs = 10

for epoch in range(n_epochs):
    # Reshuffle in place each epoch so batches differ between passes.
    random.shuffle(examples)
    losses = {}
    # Batch size follows a compounding schedule from 4 up to 32 (factor 1.5).
    for batch in minibatch(examples, size=compounding(4, 32, 1.5)):
        nlp.update(batch, losses=losses)
    print(f'Epoch {epoch} Losses:', losses)

Epoch 0 Losses: {'textcat': 14.926491281017661}
Epoch 1 Losses: {'textcat': 8.25029822718352}
Epoch 2 Losses: {'textcat': 4.864947982830927}
Epoch 3 Losses: {'textcat': 2.6307922412233893}
Epoch 4 Losses: {'textcat': 1.7937833745272656}
Epoch 5 Losses: {'textcat': 1.2696381553037526}
Epoch 6 Losses: {'textcat': 0.9274789209684684}
Epoch 7 Losses: {'textcat': 0.8745670416837399}
Epoch 8 Losses: {'textcat': 0.7566490123098468}
Epoch 9 Losses: {'textcat': 0.6602426764256863}


In [67]:
# Write the trained pipeline to disk so it can be reloaded below.
nlp.to_disk('nlp_model/math_textcat_model')

## Testing the Model

In [68]:
# Reload the saved pipeline from disk under a separate name.
math_nlp = spacy.load('nlp_model/math_textcat_model')

In [87]:
# Predict a category name for every held-out question. `Language.pipe`
# processes the texts in batches, which is faster than calling the pipeline
# once per text. For each doc, the category with the highest score in
# `doc.cats` is taken as the prediction.
pred_val = [
    max(doc.cats, key=doc.cats.get)
    for doc in math_nlp.pipe(text_test)
]

In [82]:
# Lookup from category name to its integer code (its position in category_name).
label_map = {name: code for code, name in enumerate(category_name)}
label_map

{'Algebra': 0,
 'Geometry': 1,
 'Calculus': 2,
 'Statistics': 3,
 'Number_theory': 4,
 'Combinatorics': 5,
 'Linear_Algebra': 6,
 'Abstract_Algebra': 7}

In [89]:
# Pair each held-out question with the category name predicted for it.
val_df = pd.DataFrame({'Question': text_test, 'Pred_label': pred_val})

In [93]:
# Translate each predicted category name into its integer code via label_map.
val_df['pred_value'] = val_df['Pred_label'].map(label_map)

In [94]:
# Micro-averaged F1 of the predicted codes against the held-out labels.
math_nlp_score = f1_score(y_test, val_df['pred_value'], average='micro')
print(f'f1-micro score using Spacy Model on Validation set: {math_nlp_score:.3f}')

f1-micro score using Spacy Model on Validation set: 0.763


In [100]:
# print(classification_report(y_pred=val_df['pred_value'], y_true=y_test))
# ConfusionMatrixDisplay.from_predictions(y_test, val_df['pred_value'])

# Training on pre-trained model

python -m spacy train ./math_nlp_2/config.cfg \
  --paths.train ./data/train.spacy \
  --paths.dev ./data/dev.spacy \
  --output ./math_nlp_2/output