In [9]:
import importlib

trainFileName = './data/News Classification Dataset/train.csv'
testFileName = './data/News Classification Dataset/test.csv'

# Training Embedding Models
## Word2Vec

In [6]:
import word_vectorization.models.word2vec.Word2Vec as Word2Vec
importlib.reload(Word2Vec)

word2VecModel = Word2Vec.Word2Vec(2, trainFileName, embeddingSize=300, k=3)
word2VecEmbeddings = word2VecModel.train(epochs=10, lr=0.005, batchSize=2**12, verbose=True)

Loaded training data. Vocabulary size: 43605
Found and loaded trained embeddings.


## SVD

In [10]:
import word_vectorization.models.svd.SVD as SVD
importlib.reload(SVD)

svdModel = SVD.SvdWordVectorizationModel(4, trainFileName, embeddingSize=300)
svdEmbeddings = svdModel.train(retrain=True)

Loaded training data. Vocabulary size: 43605
Embeddings not found. Starting training from scratch.
Computed co-occurence matrix.
Computed Partial Singular Value Decomposition of the co-occurence matrix.


In [13]:
svdModel = SVD.SvdWordVectorizationModel(2, trainFileName, embeddingSize=300)
svdEmbeddings = svdModel.train(retrain=True)

Loaded training data. Vocabulary size: 43605
Embeddings not found. Starting training from scratch.
Computed co-occurence matrix.
Computed Partial Singular Value Decomposition of the co-occurence matrix.


In [14]:
svdModel = SVD.SvdWordVectorizationModel(2, trainFileName, embeddingSize=300)
svdEmbeddings = svdModel.train()

Loaded training data. Vocabulary size: 43605
Model already trained. Embeddings loaded.


In [15]:
svdModel = SVD.SvdWordVectorizationModel(3, trainFileName, embeddingSize=300)
svdEmbeddings = svdModel.train(retrain=True)

Loaded training data. Vocabulary size: 43605
Embeddings not found. Starting training from scratch.
Computed co-occurence matrix.
Computed Partial Singular Value Decomposition of the co-occurence matrix.


In [16]:
svdModel = SVD.SvdWordVectorizationModel(3, trainFileName, embeddingSize=300)
svdEmbeddings = svdModel.train()

Loaded training data. Vocabulary size: 43605
Model already trained. Embeddings loaded.


In [17]:
svdModel = SVD.SvdWordVectorizationModel(4, trainFileName, embeddingSize=300)
svdEmbeddings = svdModel.train()

Loaded training data. Vocabulary size: 43605
Model already trained. Embeddings loaded.


# Sentence Classification


In [11]:
import word_vectorization.classification.LstmClassifier as LstmClassifier
import word_vectorization.datasets.ClassificationDataset as ClassificationDataset
importlib.reload(LstmClassifier)
importlib.reload(ClassificationDataset)

classifierHyperParams = {'hiddenSize': 256, 'numLayers': 3, 'bidirectional': True, 'hiddenLayers': [128, 64], 'activation': 'tanh'}

## Word2Vec
We will use the embeddings obtained from Word2Vec for classification.

In [None]:
word2VecClassifier = LstmClassifier.SentenceClassifier(trainFileName,
                                                       word2VecModel,
                                                       **classifierHyperParams)
word2VecClassifier.train(epochs=4,
                         lr=0.001,
                         batchSize=32,
                         verbose=True)

Saved model not found or retrain flag is set. Starting training from scratch.


Training:  34%|███▍      | 1273/3750 [09:32<15:41,  2.63it/s]

## SVD
We will use the embeddings obtained from SVD for classification.

In [18]:
svdClassifier = LstmClassifier.SentenceClassifier(trainFileName,
                                                  svdModel,
                                                  **classifierHyperParams)
svdClassifier.train(epochs=4,
                    lr=0.001,
                    batchSize=32,
                    verbose=True)

Saved model not found or retrain flag is set. Starting training from scratch.


Training: 100%|██████████| 3750/3750 [22:25<00:00,  2.79it/s]


Epoch 1/4 | Loss: 1.389


Training: 100%|██████████| 3750/3750 [24:09<00:00,  2.59it/s]


Epoch 2/4 | Loss: 1.389


Training: 100%|██████████| 3750/3750 [23:48<00:00,  2.63it/s]


Epoch 3/4 | Loss: 1.389


Training: 100%|██████████| 3750/3750 [23:58<00:00,  2.61it/s]

Epoch 4/4 | Loss: 1.389





# Testing

In [None]:
trainDataset = ClassificationDataset.ClassificationDataset(trainFileName, word2VecModel.wordIndices)
testDataset = ClassificationDataset.ClassificationDataset(testFileName, word2VecModel.wordIndices)

NameError: name 'ClassificationDataset' is not defined

## Word2Vec Classifier
### Test set evaluation

In [None]:
word2VecScores = word2VecClassifier.evaluate(testDataset)
print(word2VecScores['Report'])

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      1900
           1       0.96      0.97      0.96      1900
           2       0.87      0.85      0.86      1900
           3       0.84      0.89      0.87      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



### Train set evaluation

## SVD Classifier

In [None]:
svdScores = svdClassifier.evaluate(testDataset)
print(svdScores['Report'])

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1900
           1       0.00      0.00      0.00      1900
           2       0.00      0.00      0.00      1900
           3       0.25      1.00      0.40      1900

    accuracy                           0.25      7600
   macro avg       0.06      0.25      0.10      7600
weighted avg       0.06      0.25      0.10      7600



# Hyperparameter Tuning
We will try different context sizes for each model.

## Training word embeddings

In [None]:
contextSizes = [ 2, 3, 4, 5 ]

word2VecModels = []
svdModels = []

for contextSize in contextSizes:
    w2vModel = Word2Vec.Word2Vec(contextSize, trainFileName, embeddingSize=300, k=3)
    w2vEmbeddings = w2vModel.train(epochs=10, lr=0.005, batchSize=2**12, verbose=True)
    word2VecModels.append(w2vModel)

    svdModel = SVD.SvdWordVectorizationModel(3, trainFileName, embeddingSize=300)
    svdEmbeddings = svdModel.train()
    svdModels.append(svdModel)

## Training Sentence Classifiers

In [None]:
trainFileName = './data/News Classification Dataset/train.csv'
w2vScores = {} # context size : scores
svdScores = {} # context size : scores

# compare the models
for w2vModel in word2VecModels:
    classifier = LstmClassifier.SentenceClassifier(trainFileName,
                                                   w2vModel,
                                                   **classifierHyperParams)
    classifier.train(epochs=5, lr=0.001, batchSize=32, verbose=True)

    w2vScores[w2vModel.contextSize] = classifier.evaluate(testDataset)

for svdModel in svdModels:
    classifier = LstmClassifier.SentenceClassifier(trainFileName,
                                                   svdModel,
                                                   **classifierHyperParams)
    classifier.train(epochs=5, lr=0.001, batchSize=32, verbose=True)

    svdScores[svdModel.contextSize] = classifier.evaluate(testDataset)

## Ranking the models

In [None]:
# rank the models
from tabulate import tabulate

# sort according to f1 scores
sortedW2vScores = sorted(w2vScores.items(), key=lambda x: x[1]['F1'], reverse=True)
sortedSvdScores = sorted(svdScores.items(), key=lambda x: x[1]['F1'], reverse=True)

print(tabulate(sortedW2vScores, headers=['Context Size', 'Scores']))
print(tabulate(sortedSvdScores, headers=['Context Size', 'Scores']))