## Running this notebook

There are 2 types of data collection "scripts" here, which are further annotated in markdown cells in the notebook:
- "One-off": Runs one combination of parameters. Good for quick testing.
- "Combination": Runs multiple combinations of parameters. Good for overnight testing.

Other than that, simply make sure that the `fulltrain.csv` and `balancedtest.csv` files from the dataset are in the same folder as this notebook. (Alternatively, edit the variables `FULLTRAIN_PATH` and `BALANCED_TEST_PATH` below.)

# Preprocessor class


## Install packages

## Import libs

In [2]:
import re, time
import pandas as pd
import numpy as np
import nltk
import spacy
import pickle
import contractions

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from zeugma.embeddings import EmbeddingTransformer
from bs4 import BeautifulSoup
from spacy.vectors import Vectors

In [None]:
# Download the NLTK resources this notebook depends on
# (e.g. the 'stopwords' corpus is read by CustomVectorizer below).
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

## Preprocessor class

In [4]:
from sklearn.base import TransformerMixin, BaseEstimator

class GloveVectorTransformer(BaseEstimator, TransformerMixin):
    """Turn weighted bag-of-words rows into averaged word-vector embeddings.

    For every row of the input matrix, each feature with a positive weight is
    looked up in ``nlp.vocab``.  The vector of every feature that has one is
    multiplied by that feature's weight, and the scaled vectors are averaged.
    Rows for which no feature has a vector stay all-zero.

    Parameters
    ----------
    CustomVectorizer : fitted vectorizer
        Must provide ``get_feature_names_out()``; column ``j`` of the matrix
        given to ``transform`` is interpreted as feature
        ``get_feature_names_out()[j]``.
    nlp : spaCy language object
        Must provide ``vocab.has_vector`` and ``vocab.get_vector``.

    NOTE(review): ``dim`` is hard-coded to 300 -- this assumes the vectors of
    the loaded spaCy model are 300-dimensional; confirm if the model changes.
    NOTE(review): columns are mapped to feature names by position.  If a
    column-selecting step (e.g. ``SelectKBest``) runs between the vectorizer
    and this transformer, the positions presumably no longer line up with
    ``get_feature_names_out()`` -- verify before combining the two.
    """

    def __init__(self, CustomVectorizer,nlp):
        self.dim = 300
        self.CustomVectorizer = CustomVectorizer
        self.nlp = nlp

    def fit(self, X, y=None):
        """Do nothing (the transformer is stateless) and return ``self``.

        ``y`` is optional, following the scikit-learn transformer convention,
        so that ``fit(X)`` works outside a supervised pipeline.
        """
        return self

    def transform(self, X):
        """Return an array of shape ``(X.shape[0], self.dim)`` of embeddings.

        Assumes ``X`` is a 2-D SciPy sparse matrix, so that ``X[i]`` is a
        1-row matrix and ``np.nonzero`` yields (row, column) index arrays --
        TODO confirm for other input types.
        """
        embeddings = np.zeros((X.shape[0], self.dim))

        features = self.CustomVectorizer.get_feature_names_out()
        vocab = self.nlp.vocab  # loop-invariant lookup

        for i in range(X.shape[0]):
            feature_weights = X[i]
            # (row, column) positions of the features present in document i.
            more_than_zero_idx = np.nonzero(feature_weights > 0.0)

            feature_weights = feature_weights[more_than_zero_idx]
            present_features = features[more_than_zero_idx[1]]
            # Weight each available word vector by its feature weight.
            present_features_vecs = [
                vocab.get_vector(token) * (feature_weights[0, idx])
                for idx, token in enumerate(present_features)
                if vocab.has_vector(token)
            ]

            if len(present_features_vecs) > 0:
                embeddings[i] = np.mean(present_features_vecs, axis=0)

        return embeddings

class CustomVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer cleans and tokenizes text with spaCy.

    Parameters
    ----------
    lemmatize : bool
        Emit spaCy lemmas.  Takes precedence over ``stemming``.
    stemming : bool
        Emit Porter stems (only used when ``lemmatize`` is false).
    keep_punctuation : bool
        Keep tokens spaCy flags as punctuation.
    keep_whitespace : bool
        Keep tokens spaCy flags as whitespace.
    merge_entities : bool
        Stored only.  NOTE(review): the code that would add spaCy's
        ``merge_entities`` / ``merge_noun_chunks`` pipes was commented out,
        so this flag does not change the tokens produced here.
    sw_removal : bool
        Drop tokens whose text is in NLTK's English stop-word list.
    nlp : spaCy language object
        Called on the cleaned text to produce tokens.
    **kwargs
        Forwarded to ``CountVectorizer`` (e.g. ``ngram_range``, ``binary``).
    """

    def __init__(self,lemmatize, stemming, keep_punctuation, keep_whitespace, merge_entities,sw_removal, nlp, **kwargs):
        super().__init__(**kwargs)
        self.lemmatize = lemmatize
        self.stemming = stemming
        self.keep_punctuation = keep_punctuation
        self.keep_whitespace = keep_whitespace
        self.merge_entities = merge_entities
        self.sw_removal = sw_removal
        self.sw = stopwords.words("english")
        # Set copy of `sw` so the per-token membership test is O(1);
        # `sw` itself stays a list for anything that reads it.
        self._sw_lookup = frozenset(self.sw)
        # One stemmer for the vectorizer's lifetime instead of one per document.
        self._stemmer = PorterStemmer()
        self.nlp = nlp

    def tokenize(self,doc):
        """Clean ``doc`` and return its tokens as a list of strings.

        Steps: lower-case, strip HTML tags, expand contractions, tokenize
        with spaCy, optionally drop punctuation / whitespace / stop words,
        then map each token to its lemma, stem, or raw text.
        """
        # Clean text
        text = doc.lower()                              # case folding
        text = BeautifulSoup(text, 'lxml').get_text()   # remove HTML tags
        text = contractions.fix(text)                   # expand contractions

        # Tokenize
        tokens = self.nlp(text)

        # Remove punctuation/whitespace
        drop_punct = not self.keep_punctuation
        drop_space = not self.keep_whitespace
        if drop_punct or drop_space:
            tokens = [
                token for token in tokens
                if not ((token.is_punct and drop_punct) or (token.is_space and drop_space))
            ]

        # The text was lower-cased above, so it is compared as-is.
        if self.sw_removal:
            tokens = [token for token in tokens if token.text not in self._sw_lookup]

        # Lemmatization/Stemming
        if self.lemmatize:
            return [token.lemma_ for token in tokens]
        if self.stemming:
            return [self._stemmer.stem(token.text) for token in tokens]
        return [token.text for token in tokens]

    def build_analyzer(self):
        """Return a callable mapping a raw document to its n-gram features."""
        def analyser(doc):
            # _word_ngrams expands the tokens according to self.ngram_range.
            return self._word_ngrams(self.tokenize(doc))
        return analyser

class Preprocessor:
    """Build and run the text feature-extraction pipeline.

    The pipeline is ``CustomVectorizer`` followed by up to three optional
    steps, in this order: ``SelectKBest(chi2)``, ``TfidfTransformer``,
    ``GloveVectorTransformer``.

    Args:
        embedding (bool): Append a GloveVectorTransformer step (averaged
            spaCy word vectors; that transformer uses 300 dimensions).
        tf_scaling (bool): Append a TfidfTransformer step.
        idf_scaling (bool): Passed as ``use_idf`` to the TfidfTransformer;
            only has an effect when ``tf_scaling`` is true.
        sw_removal (bool): Remove stop words.
        lemmatize (bool): Lemmatize tokens.
        stemming (bool): Stem tokens.
        ngram_range (tuple): ``(min_n, max_n)`` values of n used when forming
            features.
        topk (int | None): Number of features kept by chi-squared top-k
            selection; ``None`` disables feature selection.
        keep_punctuation (bool): Keep punctuation tokens.
        keep_whitespace (bool): Keep whitespace tokens.
        merge_entities (bool): Intended to merge named entities, e.g.
            "Empire State Building", "New York Times".  In this class it
            only keeps spaCy's parser/NER (and tagger) components enabled.
        topic_modelling (str | None): Path to the
            data_with_topicmodellingfeatures.csv file (in CS4248_G03 drive).
            Its last six columns are appended to the pipeline output.
        binary (bool): Forwarded to the count vectorizer.
    """
    def __init__(self, embedding=False, tf_scaling=False, idf_scaling=False, sw_removal=False,
                 lemmatize=False, stemming=False, ngram_range=(1,2), topk=None, keep_punctuation=False
                 ,keep_whitespace=False, merge_entities=False, topic_modelling=None, binary=False):
        self.tf_scaling = tf_scaling
        self.idf_scaling = idf_scaling
        self.sw_removal = sw_removal
        self.lemmatize = lemmatize
        self.stemming = stemming
        self.topk = topk
        self.keep_punctuation = keep_punctuation
        self.embedding = embedding
        self.keep_whitespace = keep_whitespace
        self.merge_entities = merge_entities
        self.topic_modelling = topic_modelling
        self.tm_feats = None
        if self.topic_modelling is not None:
            # Only the last six columns of the CSV are used as features.
            self.tm_feats = pd.read_csv(topic_modelling).iloc[:, -6:]
        self.nlp = spacy.load('en_core_web_lg')

        # Switch off spaCy components the selected options do not need.
        if not lemmatize:
            self.nlp.disable_pipes('lemmatizer')

        if not merge_entities:
            if not lemmatize:
                self.nlp.disable_pipes('tagger', 'attribute_ruler')
            self.nlp.disable_pipes('parser', 'ner')

        # Initialize preprocessing pipeline objects as (name, estimator) pairs
        self.count_vectorizer = ('count vectorizer', CustomVectorizer(
            sw_removal=self.sw_removal, ngram_range=ngram_range, lemmatize=self.lemmatize,
            stemming=self.stemming, keep_punctuation=self.keep_punctuation,
            keep_whitespace=self.keep_whitespace, merge_entities=self.merge_entities,
            binary=binary, nlp=self.nlp))  # Count
        self.k_best = ('chi2score', SelectKBest(chi2, k=topk))  # topk
        self.tf_idf_transformer = ('tf_transformer', TfidfTransformer(use_idf=idf_scaling))  # TF-IDF

        # Pipeline
        steps = [self.count_vectorizer]
        if self.topk is not None:
            steps.append(self.k_best)
        if self.tf_scaling:
            steps.append(self.tf_idf_transformer)
        if self.embedding:
            self.glove = ('glove', GloveVectorTransformer(self.count_vectorizer[1], nlp=self.nlp))
            steps.append(self.glove)
        print(steps)
        self.model = Pipeline(steps)

    def _append_topic_features(self, features):
        """Return ``features`` with the topic-modelling columns appended.

        Returns ``features`` unchanged when no topic-modelling file was given.
        Sparse pipeline output is converted to a dense array first, because
        ``np.concatenate`` cannot join a sparse matrix with a dense table.

        Raises:
            ValueError: if the topic-feature table and ``features`` have a
                different number of rows.
        """
        if self.topic_modelling is None:
            return features
        if hasattr(features, 'toarray'):
            features = features.toarray()
        topic_feats = self.tm_feats.to_numpy()
        if topic_feats.shape[0] != features.shape[0]:
            raise ValueError(
                f'topic-modelling features have {topic_feats.shape[0]} rows '
                f'but the transformed data has {features.shape[0]}'
            )
        return np.concatenate((features, topic_feats), axis=1)

    def fit(self, X, y):
        """Fit the pipeline on ``X``/``y`` and return the fitted Pipeline."""
        return self.model.fit(X, y)

    def transform(self, X):
        """Transform ``X`` with the fitted pipeline (plus topic features)."""
        return self._append_topic_features(self.model.transform(X))

    def fit_transform(self, X, y):
        """Fit the pipeline and return the transformed ``X`` (plus topic features)."""
        res = self.model.fit_transform(X, y)
        if self.topic_modelling is not None:
            print(self.tm_feats)
        return self._append_topic_features(res)

## Edit the preprocessor / feature engineering scheme settings for the "One-off" scripts here

In [5]:
# Dataset locations -- change these if the CSV files live somewhere else.
FULLTRAIN_PATH = 'fulltrain.csv'
BALANCED_TEST_PATH = 'balancedtest.csv'

# Training split: the files have no header row; column 0 is used as the
# target and column 1 as the input text.
data = pd.read_csv(FULLTRAIN_PATH, header=None)
X_train, Y_train = data[1], data[0]

# Test split, read the same way so it goes through identical preprocessing.
test_data = pd.read_csv(BALANCED_TEST_PATH, header=None)
X_test, Y_test = test_data[1], test_data[0]

# Feature-engineering scheme for the "one-off" cells.
p = Preprocessor(
    # token handling
    sw_removal=True,
    lemmatize=True,
    stemming=False,
    keep_punctuation=False,
    keep_whitespace=False,
    merge_entities=False,
    # feature construction / selection
    ngram_range=(1,1),
    binary=False,
    topk=20000,         # 20000
    # weighting / embedding
    tf_scaling=True,
    idf_scaling=True,
    embedding=False,
)
print(p.nlp.pipeline)

[('count vectorizer', CustomVectorizer(keep_punctuation=False, keep_whitespace=False, lemmatize=True,
                 merge_entities=False,
                 nlp=<spacy.lang.en.English object at 0x000001E142448850>,
                 stemming=False, sw_removal=True)), ('chi2score', SelectKBest(k=20000, score_func=<function chi2 at 0x000001E140BAD0D0>)), ('tf_transformer', TfidfTransformer())]
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x000001E142182DC0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x000001E1421822E0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x000001E15AB4E880>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x000001E15AB40100>)]


In [6]:
Y_train_pred = Y_test_pred = None

# Fit the preprocessing pipeline on the training split, then reuse it on the test split.
# (To fit on the test split instead, use: p.fit_transform(X_test, Y_test))
train_X_vec = p.fit_transform(X_train, Y_train)
test_X_vec = p.transform(X_test)

# Convert anything that is not already an ndarray (e.g. a sparse matrix) to dense.
train_X_vec = train_X_vec if isinstance(train_X_vec, np.ndarray) else train_X_vec.toarray()
test_X_vec = test_X_vec if isinstance(test_X_vec, np.ndarray) else test_X_vec.toarray()

In [7]:
print('Starting models...')
# Last expression of the cell, so the notebook displays the matrix shape.
train_X_vec.shape  # verify shape

Starting models...


(48854, 20000)

In [5]:
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
def evaluate(X, Y):
    """Print accuracy and per-class precision, recall and F1.

    X holds the predicted labels and Y the true labels; they are handed to
    the sklearn metrics as (Y, X), i.e. (y_true, y_pred).
    """
    # (label, scorer, extra keyword arguments), printed in this order.
    report = (
        ('Accuracy', accuracy_score, {}),
        ('Precision', precision_score, {'average': None}),
        ('Recall', recall_score, {'average': None}),
        ('F1', f1_score, {'average': None}),
    )
    for label, scorer, options in report:
        print(f'{label}: {scorer(Y, X, **options)}')

## "One-off" scripts

The three cells below are the mentioned "one-off" scripts.

In [None]:
# -------------------------
# Statistical models
#
# Model 1: linear support vector classifier, swept over the regularisation
# strength C. The sweep is a plain loop because GridSearchCV causes OOM.
# Values currently left out of the sweep: 2.0, 2.5, 5.0, 10.0

for C in (0.01, 0.05, 0.1, 0.3, 0.5, 0.75, 1, 1.5):
    Y_train_pred = Y_test_pred = None

    # Parameters are set by hand for each run.
    lsvc = LinearSVC(C=C, penalty='l2', dual=False, max_iter=100000)

    # Train, then predict on both splits.
    lsvc.fit(train_X_vec, Y_train)
    Y_train_pred = lsvc.predict(train_X_vec)
    Y_test_pred = lsvc.predict(test_X_vec)

    # Report: the C value, then train metrics, then test metrics.
    print(C)
    evaluate(Y_train_pred, Y_train)
    evaluate(Y_test_pred, Y_test)



In [None]:
# Model 2: Gaussian Naive Bayes
Y_train_pred = Y_test_pred = None

gnb = GaussianNB(var_smoothing=1e-9)
gnb.fit(train_X_vec, Y_train)

# Predict on both splits, then report train metrics followed by test metrics.
Y_train_pred = gnb.predict(train_X_vec)
Y_test_pred = gnb.predict(test_X_vec)
evaluate(Y_train_pred, Y_train)
evaluate(Y_test_pred, Y_test)

In [None]:
# Multinomial Naive Bayes, swept over the smoothing parameter alpha.
alphas = [1e-10, 0.05, 0.1, 0.25, 0.5, 1.0, 1.25, 1.5]

Y_train_pred = Y_test_pred = None

for alpha in alphas:
    print(alpha)

    mnb = MultinomialNB(alpha=alpha)
    mnb.fit(train_X_vec, Y_train)

    Y_train_pred = mnb.predict(train_X_vec)
    Y_test_pred = mnb.predict(test_X_vec)

    # Train metrics first, then test metrics.
    evaluate(Y_train_pred, Y_train)
    evaluate(Y_test_pred, Y_test)

## Combination "script"

This cell simply provides a convenient "script" to run multiple combinations of parameters at once. (good for overnight testing)


In [None]:
from sklearn.naive_bayes import MultinomialNB

FULLTRAIN_PATH = 'fulltrain.csv'
BALANCED_TEST_PATH = 'balancedtest.csv'

# Set to False to skip the Naive Bayes models and only run the LinearSVC sweep.
RUN_NAIVE_BAYES = True

# Settings shared by every run. Each entry of `params` below lists only what
# that run changes relative to this baseline.
base_params = dict(
    embedding=False,
    tf_scaling=False,
    idf_scaling=False,
    sw_removal=False,
    lemmatize=False,
    stemming=False,
    ngram_range=(1,1),
    topk=10000,         # 20000
    keep_punctuation=False,
    keep_whitespace=False,
    merge_entities=False
)

params = [
    {**base_params, **overrides}
    for overrides in (
        dict(keep_punctuation=True),                    # raw tokens + punctuation
        dict(sw_removal=True, keep_punctuation=True),   # + stop-word removal
        dict(sw_removal=True),                          # + punctuation removal
        dict(sw_removal=True, lemmatize=True),          # + lemmatization
        dict(sw_removal=True, stemming=True),           # + stemming
        dict(sw_removal=True, merge_entities=True),     # + merge_entities flag
    )
]

# The datasets do not depend on the parameter combination, so read them once
# instead of once per run.
# Loading data from csv file
data = pd.read_csv(FULLTRAIN_PATH, header=None)
X_train = data[1]
Y_train = data[0]

# Same tokenization transforms for test.
test_data = pd.read_csv(BALANCED_TEST_PATH, header=None)
X_test = test_data[1]
Y_test = test_data[0]

for param_dict in params:

    print(param_dict)
    p = Preprocessor(**param_dict)
    print(p.nlp.pipeline)

    Y_train_pred = None
    Y_test_pred = None

    train_X_vec = p.fit_transform(X_train, Y_train)
    test_X_vec = p.transform(X_test)

    # Convert anything that is not already an ndarray (e.g. sparse) to dense.
    if not isinstance(train_X_vec, np.ndarray):
        train_X_vec = train_X_vec.toarray()
    if not isinstance(test_X_vec, np.ndarray):
        test_X_vec = test_X_vec.toarray()

    print('SVC')

    # model 1:-
    # Using linear support vector classifier, swept manually over C
    # (GridSearchCV causes OOM). Values left out: 2.0, 2.5, 5.0, 10.0
    for C in [0.1, 0.3, 0.5, 0.75, 1, 1.5]:
        lsvc = LinearSVC(max_iter=100000, dual=False, C=C, penalty='l2')

        # training the model
        lsvc.fit(train_X_vec, Y_train)
        Y_train_pred = lsvc.predict(train_X_vec)
        Y_test_pred = lsvc.predict(test_X_vec)

        print(C)
        evaluate(Y_train_pred, Y_train)
        evaluate(Y_test_pred, Y_test)

    if not RUN_NAIVE_BAYES:
        continue

    # model 2:-
    # Using Naive Bayes
    print('GaussianNB')

    gnb = GaussianNB(var_smoothing=1e-9)
    gnb.fit(train_X_vec, Y_train)
    Y_train_pred = gnb.predict(train_X_vec)
    Y_test_pred = gnb.predict(test_X_vec)
    evaluate(Y_train_pred, Y_train)
    evaluate(Y_test_pred, Y_test)

    # Multinomial Naive Bayes, swept over the smoothing parameter alpha.
    alphas = [1e-10, 0.05, 0.1, 0.25, 0.5, 1.0]

    for alpha in alphas:
        print(alpha)
        mnb = MultinomialNB(alpha=alpha)
        mnb.fit(train_X_vec, Y_train)
        Y_train_pred = mnb.predict(train_X_vec)
        Y_test_pred = mnb.predict(test_X_vec)
        evaluate(Y_train_pred, Y_train)
        evaluate(Y_test_pred, Y_test)

# Below this cell are things we "tried"

But decided ultimately not to analyse further in the report
- Non linear SVMs - severe performance issues
- Random forest / decision trees - while interesting, we think what we want to investigate here is largely done by Naive Bayes alone (conditional independence of features). Naive Bayes uses multiplication of likelihoods, thus, it also at least **somewhat** mimics how a decision tree favours "distinctive" (most reduction in entropy) features first.
- Vector Space Model (Cosine Classifier) - this is by and large a conditionally independent model, analysable by Naive Bayes as well.

In [None]:
# Non-linear kernel SVM
# Very, very slow: per the scikit-learn SVC documentation, fit time scales at
# least quadratically with the number of samples.
# Still running at 6h....

USE_GRID_SEARCH_CV=False
if USE_GRID_SEARCH_CV:
    # The original grid listed both 10e-2 and 0.1, which are the same value;
    # the duplicate is dropped so the candidate is not fitted twice.
    params = {'C':[0.01, 0.1, 1, 10], 'kernel':('poly', 'rbf', 'sigmoid'), 'decision_function_shape':('ovr', 'ovo')}
    svc = GridSearchCV(SVC(max_iter=100000), params)
else:
    # Manually set parameters
    svc = SVC(max_iter=100000)

# Train
svc.fit(train_X_vec, Y_train)
# Predict with the model trained above. (The original called `lsvc.predict`,
# which evaluated the LinearSVC left over from an earlier cell.)
Y_train_pred = svc.predict(train_X_vec)
Y_test_pred = svc.predict(test_X_vec)

if USE_GRID_SEARCH_CV:
    print(svc.cv_results_)
    print(svc.best_params_)

evaluate(Y_train_pred, Y_train)
evaluate(Y_test_pred, Y_test)

In [None]:
# Model 3: random forest classifier
USE_GRID_SEARCH_CV=False

rf = RandomForestClassifier(random_state=7)
if USE_GRID_SEARCH_CV:
    # Wrap the same base estimator in an exhaustive parameter search.
    params = {
        'n_estimators': [10,50,100,150,200,500],
        'criterion': ('gini', 'entropy'),
        'max_depth': list(range(1,21)),
    }
    rf = GridSearchCV(rf, params)

# Train, then predict on both splits.
rf.fit(train_X_vec, Y_train)
Y_train_pred = rf.predict(train_X_vec)
Y_test_pred = rf.predict(test_X_vec)

# Search diagnostics exist only when the grid search ran.
if USE_GRID_SEARCH_CV:
    print(rf.cv_results_)
    print(rf.best_params_)

evaluate(Y_train_pred, Y_train)
evaluate(Y_test_pred, Y_test)

In [None]:
# `cv_results_` / `best_params_` exist only on a fitted GridSearchCV; a plain
# RandomForestClassifier would raise AttributeError here.
if isinstance(rf, GridSearchCV):
    print(rf.cv_results_)
    print(rf.best_params_)
else:
    print('rf is not a GridSearchCV; set USE_GRID_SEARCH_CV=True in the random forest cell to get search results.')

In [None]:
# model 4:-
# Vector Space Model (Cosine) Classifier
from sklearn.metrics.pairwise import cosine_similarity
class VSMClf:
    """Vector-space-model classifier based on cosine similarity.

    ``fit`` stores one mean feature vector per class; ``predict`` assigns
    each sample to the class whose mean vector has the highest cosine
    similarity with it.

    Parameters
    ----------
    top_k : optional
        Stored on the instance; not used by ``fit`` or ``predict``.
    """

    def __init__(self, top_k=None):
        self.top_k = top_k
        self.labels = None            # unique class labels, set by fit()
        self.mean_feature_vecs = []   # one mean vector per entry of `labels`

    def fit(self, X_train, Y_train):
        """Compute the per-class mean feature vectors and return ``self``.

        ``Y_train`` may be any array-like (pandas Series, list, ndarray).
        Assumes ``X_train`` is a 2-D ndarray and ``Y_train`` is 1-D; each
        stored mean then has shape ``(1, n_features)`` because ``argwhere``
        yields column-shaped indices -- TODO confirm for other inputs.
        """
        Y_train = np.asarray(Y_train)

        self.labels = np.unique(Y_train)
        self.mean_feature_vecs = []
        for label in self.labels:
            label_indices = np.argwhere(Y_train == label)
            label_features = X_train[label_indices]
            self.mean_feature_vecs.append(np.mean(label_features, axis=0))
        return self

    def predict(self, X_test):
        """Return, for each row of ``X_test``, the label of the most similar class mean.

        Under the shape assumptions documented in ``fit`` the result has
        shape ``(n_samples, 1)``.

        Raises:
            ValueError: if called before ``fit``.
        """
        if not self.mean_feature_vecs:
            raise ValueError('VSMClf.predict called before fit')

        # similarities[c] holds the similarity of every sample to class c.
        similarities = np.array([
            cosine_similarity(X_test, mean_feature_vec)
            for mean_feature_vec in self.mean_feature_vecs
        ])

        predicted_class_indices = np.argmax(similarities, axis=0)
        return self.labels[predicted_class_indices]

# Fit the cosine classifier on the training vectors, then predict both splits.
vsm = VSMClf(top_k=10)
vsm.fit(train_X_vec, Y_train)
Y_train_pred = vsm.predict(train_X_vec)
Y_test_pred = vsm.predict(test_X_vec)

# Report train metrics first, then test metrics.
evaluate(Y_train_pred, Y_train)
evaluate(Y_test_pred, Y_test)
