# APIs

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import re
from joblib import Memory

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import HDBSCAN

import preprocessing_utils

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
# On-disk cache for joblib memoization, stored relative to the notebook's
# working directory. verbose=0 silences joblib's cache log messages.
cache_dir = './cache'
memory = Memory(location=cache_dir, verbose=0)

# Data Setup

Training Data

In [4]:
# Read the training CSV (path is relative to the working directory) into a
# DataFrame. The bare last expression makes the notebook display its
# (rows, columns) shape as the cell output.
training_data = pd.read_csv('data/train.csv')
training_data.shape

(10189, 2)

Test Data

In [6]:
# Read the separate test CSV into its own DataFrame; the trailing expression
# displays its (rows, columns) shape as the cell output.
test_data = pd.read_csv('data/test.csv')
test_data.shape

(3044, 2)

Class Labels

In [7]:
# Human-readable names of the eight question categories.
# NOTE(review): presumably the list position corresponds to the value in the
# `label` column - confirm against the dataset description.
category = [
    'Algebra',
    'Geometry',
    'Calculus',
    'Statistics',
    'Number_theory',
    'Combinatorics',
    'Linear_Algebra',
    'Abstract_Algebra',
]

Splitting the training data into train and held-out test sets (distinct from `data/test.csv`)

In [8]:
# Hold out part of the training data for evaluation.
# - stratify on the label column so both parts keep the same class proportions
# - random_state=0 makes the shuffle reproducible across runs
# - no test_size is passed, so scikit-learn's default split size applies
text_train, text_test, y_train, y_test = train_test_split(
    training_data['Question'].to_numpy(),
    training_data['label'].to_numpy(),
    stratify=training_data['label'],
    random_state=0,
)

In [11]:
# Sanity check: the text and label arrays must have matching lengths in each split.
print(f"{text_train.shape} {y_train.shape} \n {text_test.shape} {y_test.shape}")

(7641,) (7641,) 
 (2548,) (2548,)


# Approach 1 - spaCy's TextCategorizer