In [2]:
import importlib
import pickle

# CSV splits of the news classification dataset. The paths are relative, so the
# notebook must be run from the directory that contains `data/`.
trainFileName = './data/News Classification Dataset/train.csv'
testFileName = './data/News Classification Dataset/test.csv'

# Training Embedding Models
## Word2Vec

In [6]:
import word_vectorization.models.word2vec.Word2Vec as Word2Vec
# Reload the project module so edits made to it on disk take effect without
# restarting the kernel.
importlib.reload(Word2Vec)

# The first positional argument is the context size (2 here). `k` is presumably
# the number of negative samples per training pair — TODO confirm in Word2Vec.
word2VecModel = Word2Vec.Word2Vec(2, trainFileName, embeddingSize=300, k=3)
# NOTE(review): in the recorded run train() reported loading previously trained
# embeddings, so the epochs/lr/batchSize arguments were presumably not exercised.
word2VecEmbeddings = word2VecModel.train(epochs=10, lr=0.005, batchSize=2**12, verbose=True)

Loaded training data. Vocabulary size: 43605
Found and loaded trained embeddings.


## SVD

In [9]:
import word_vectorization.models.svd.SVD as SVD
# Reload the project module so edits made to it on disk take effect without
# restarting the kernel.
importlib.reload(SVD)

# The first positional argument is the context size (5 here); embeddings are
# 300-dimensional, matching the Word2Vec model above.
svdModel = SVD.SvdWordVectorizationModel(5, trainFileName, embeddingSize=300)
# In the recorded run no cached embeddings were found, so train() built the
# co-occurrence matrix and computed its partial SVD from scratch.
svdEmbeddings = svdModel.train()

Loaded training data. Vocabulary size: 43605
Embeddings not found. Starting training from scratch.
Computed co-occurence matrix.
Computed Partial Singular Value Decomposition of the co-occurence matrix.


# Sentence Classification


In [4]:
import word_vectorization.classification.LstmClassifier as LstmClassifier
import word_vectorization.datasets.ClassificationDataset as ClassificationDataset
# Reload both project modules so on-disk edits take effect without restarting
# the kernel.
importlib.reload(LstmClassifier)
importlib.reload(ClassificationDataset)

# Architecture settings shared by every SentenceClassifier in this notebook;
# they are unpacked as keyword arguments at each construction site.
classifierHyperParams = dict(
    hiddenSize=256,
    numLayers=3,
    bidirectional=True,
    hiddenLayers=[128, 64],
    activation='tanh',
)

## Word2Vec
We will use the embeddings obtained from Word2Vec for classification.

In [12]:
# Build a sentence classifier on top of the Word2Vec model and train it for
# 4 epochs with batches of 32.
word2VecClassifier = LstmClassifier.SentenceClassifier(
    trainFileName,
    word2VecModel,
    **classifierHyperParams
)
word2VecClassifier.train(epochs=4, lr=0.001, batchSize=32, verbose=True)

Saved model not found or retrain flag is set. Starting training from scratch.


Training: 100%|██████████| 3750/3750 [34:48<00:00,  1.80it/s]


Epoch 1/4 | Loss: 0.867


Training: 100%|██████████| 3750/3750 [29:08<00:00,  2.14it/s]


Epoch 2/4 | Loss: 0.359


Training: 100%|██████████| 3750/3750 [24:48<00:00,  2.52it/s]  


Epoch 3/4 | Loss: 0.248


Training: 100%|██████████| 3750/3750 [23:04<00:00,  2.71it/s]  

Epoch 4/4 | Loss: 0.188





## SVD
We will use the embeddings obtained from SVD for classification.

In [None]:
# Build a sentence classifier on top of the SVD model. In the recorded run
# train() reported finding and loading an already trained model.
svdClassifier = LstmClassifier.SentenceClassifier(
    trainFileName,
    svdModel,
    **classifierHyperParams
)
svdClassifier.train(epochs=10, lr=0.001, batchSize=32, verbose=True)

Found and loaded trained model.


# Testing

In [21]:
# Both splits are built with the Word2Vec model's `wordIndices` (presumably its
# word -> index vocabulary map — TODO confirm).
# NOTE(review): `testDataset` is also passed to the SVD classifiers' evaluate();
# that is only valid if the SVD models index the vocabulary identically — verify.
trainDataset = ClassificationDataset.ClassificationDataset(trainFileName, word2VecModel.wordIndices)
testDataset = ClassificationDataset.ClassificationDataset(testFileName, word2VecModel.wordIndices)

## Word2Vec Classifier
### Test set evaluation

In [15]:
# evaluate() returns a mapping of scores; its 'Report' entry is the per-class
# precision/recall/F1 text table printed below.
word2VecScores = word2VecClassifier.evaluate(testDataset)
print(word2VecScores['Report'])

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      1900
           1       0.94      0.97      0.96      1900
           2       0.90      0.84      0.87      1900
           3       0.89      0.89      0.89      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600



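### Train set evaluation

_TODO: no train-set evaluation has been added yet; `trainDataset` is built above but never scored._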

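## SVD Classifier

**Note:** in the recorded run below the classifier predicts class 3 for every test sample (recall 1.00 for class 3, 0.00 for the others), so the 0.25 accuracy is chance level on four balanced classes. Treat this as a failed training run, not as a meaningful comparison with Word2Vec.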

In [None]:
# evaluate() returns a mapping of scores; its 'Report' entry is the per-class
# text table printed below.
# NOTE(review): `svdScores` is rebound later in the notebook to a
# {context size: scores} dict, so this name holds two different shapes over time.
svdScores = svdClassifier.evaluate(testDataset)
print(svdScores['Report'])

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1900
           1       0.00      0.00      0.00      1900
           2       0.00      0.00      0.00      1900
           3       0.25      1.00      0.40      1900

    accuracy                           0.25      7600
   macro avg       0.06      0.25      0.10      7600
weighted avg       0.06      0.25      0.10      7600



# Hyperparameter Tuning
We will try different context sizes for each model.

## Training word embeddings

In [45]:
# Context sizes to compare. One Word2Vec and one SVD embedding model is trained
# per size and collected, in this order, in word2VecModels / svdModels.
contextSizes = [ 2, 3, 4 ]

word2VecModels = []
svdModels = []

# Restated here so this cell pins the dataset its embeddings are trained on.
trainFileName = './data/News Classification Dataset/train.csv'

for contextSize in contextSizes:
    w2vModel = Word2Vec.Word2Vec(contextSize, trainFileName, embeddingSize=300, k=3)
    # train() is called for its effect on the model; the returned embeddings
    # are not needed here.
    w2vModel.train(epochs=10, lr=0.005, batchSize=2**12, verbose=True)
    word2VecModels.append(w2vModel)

    # Use a loop-specific name: rebinding the notebook-level `svdModel` /
    # `svdEmbeddings` here would silently change which model the earlier
    # `svdClassifier` cell uses when it is re-run.
    # NOTE(review): SVD uses embeddingSize=100 here while Word2Vec uses 300 —
    # confirm the asymmetry is intended.
    tunedSvdModel = SVD.SvdWordVectorizationModel(contextSize, trainFileName, embeddingSize=100)
    tunedSvdModel.train()
    svdModels.append(tunedSvdModel)

Loaded training data. Vocabulary size: 43605
Found and loaded trained embeddings.
Loaded training data. Vocabulary size: 43605
Embeddings not found. Starting training from scratch.
Computed co-occurence matrix.
Computed Partial Singular Value Decomposition of the co-occurence matrix.
Loaded training data. Vocabulary size: 43605
Found and loaded trained embeddings.
Loaded training data. Vocabulary size: 43605
Embeddings not found. Starting training from scratch.
Computed co-occurence matrix.
Computed Partial Singular Value Decomposition of the co-occurence matrix.
Loaded training data. Vocabulary size: 43605
Found and loaded trained embeddings.
Loaded training data. Vocabulary size: 43605
Embeddings not found. Starting training from scratch.
Computed co-occurence matrix.
Computed Partial Singular Value Decomposition of the co-occurence matrix.


## Training Sentence Classifiers
### Word2Vec

In [22]:
# Train and evaluate one classifier per Word2Vec context size.
w2vScores = {} # context size : scores
for w2vModel in word2VecModels:
    classifier = LstmClassifier.SentenceClassifier(trainFileName,
                                                   w2vModel,
                                                   **classifierHyperParams)
    classifier.train(epochs=4, lr=0.001, batchSize=32, verbose=True)

    w2vScores[w2vModel.contextSize] = classifier.evaluate(testDataset)

    # Checkpoint after every model so finished results survive an interrupted
    # run; the context manager closes (and flushes) the file each time.
    with open('w2v_scores.pkl', 'wb') as scoresFile:
        pickle.dump(w2vScores, scoresFile)

Found and loaded trained model.
Found and loaded trained model.
Found and loaded trained model.


### SVD

In [46]:
svdScores = {} # context size : scores
# NOTE(review): this rebinds the notebook-level `trainFileName` to a different
# dataset; every cell run after this one (not only trainSvdModel) sees the new
# path, while `testDataset` was still built from the news test split.
trainFileName = './data/W2vData/train.csv'
def trainSvdModel(svdModel):
    """Train a sentence classifier on one SVD model and record its test scores.

    Builds a SentenceClassifier from the notebook-level `trainFileName` and
    `classifierHyperParams`, trains it for 15 epochs at lr=0.005 with batches
    of 32, stores `classifier.evaluate(testDataset)` in the notebook-level
    `svdScores` under `svdModel.contextSize`, and rewrites 'svd_scores.pkl'
    with the whole dict.

    Args:
        svdModel: a trained SVD embedding model exposing `contextSize`.

    Returns:
        None. Results are written to `svdScores` and to 'svd_scores.pkl'.
    """
    classifier = LstmClassifier.SentenceClassifier(trainFileName,
                                                   svdModel,
                                                   **classifierHyperParams)
    classifier.train(epochs=15, lr=0.005, batchSize=32, verbose=True)

    svdScores[svdModel.contextSize] = classifier.evaluate(testDataset)

    # Persist after every model; the context manager closes (and flushes) the
    # file instead of leaving the handle to the garbage collector.
    with open('svd_scores.pkl', 'wb') as scoresFile:
        pickle.dump(svdScores, scoresFile)

In [47]:
# svdModels follows contextSizes order, so index 0 is the context-size-2 model.
# NOTE(review): in the recorded run the loss stays at ~1.387 for all 15 epochs
# (about ln 4, i.e. a uniform guess over 4 classes if this is cross-entropy),
# so this classifier did not learn — revisit the learning rate / embeddings
# before trusting its scores.
trainSvdModel(svdModels[0])

Saved model not found or retrain flag is set. Starting training from scratch.


Training: 100%|██████████| 469/469 [02:37<00:00,  2.98it/s]


Epoch 1/15 | Loss: 1.389


Training: 100%|██████████| 469/469 [02:31<00:00,  3.10it/s]


Epoch 2/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:34<00:00,  3.04it/s]


Epoch 3/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:35<00:00,  3.02it/s]


Epoch 4/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:34<00:00,  3.04it/s]


Epoch 5/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:31<00:00,  3.10it/s]


Epoch 6/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:32<00:00,  3.07it/s]


Epoch 7/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:32<00:00,  3.07it/s]


Epoch 8/15 | Loss: 1.388


Training: 100%|██████████| 469/469 [02:37<00:00,  2.98it/s]


Epoch 9/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:30<00:00,  3.11it/s]


Epoch 10/15 | Loss: 1.386


Training: 100%|██████████| 469/469 [02:35<00:00,  3.02it/s]


Epoch 11/15 | Loss: 1.388


Training: 100%|██████████| 469/469 [02:31<00:00,  3.10it/s]


Epoch 12/15 | Loss: 1.387


Training: 100%|██████████| 469/469 [02:30<00:00,  3.12it/s]


Epoch 13/15 | Loss: 1.389


Training: 100%|██████████| 469/469 [02:25<00:00,  3.21it/s]


Epoch 14/15 | Loss: 1.386


Training: 100%|██████████| 469/469 [02:25<00:00,  3.22it/s]


Epoch 15/15 | Loss: 1.387


In [None]:
# Context-size-3 model. NOTE(review): this cell has no execution count or
# output in the saved notebook, so its result is not recorded.
trainSvdModel(svdModels[1])

In [49]:
# Context-size-4 model. NOTE(review): no output is saved for this cell, so
# whether training completed cannot be told from the notebook — confirm.
trainSvdModel(svdModels[2])

In [None]:
# One-off retraining of the first SVD model with a gentler schedule
# (10 epochs, lr=0.001) than trainSvdModel uses.
classifier = LstmClassifier.SentenceClassifier(trainFileName,
                                               svdModels[0],
                                               **classifierHyperParams)
classifier.train(epochs=10, lr=0.001, batchSize=32, verbose=True)

# Key the result by the model's own context size (as trainSvdModel does) rather
# than a hard-coded 2, so the entry stays correct if contextSizes changes.
# NOTE(review): unlike trainSvdModel this does not rewrite 'svd_scores.pkl', and
# the ranking cell reloads svdScores from that file, so this result is dropped there.
svdScores[svdModels[0].contextSize] = classifier.evaluate(testDataset)

## Ranking the models

In [4]:
# rank the models
from tabulate import tabulate

# Reload the score dicts written by the training cells. pickle.load can execute
# arbitrary code, so only ever point these at files this notebook produced.
with open('w2v_scores.pkl', 'rb') as scoresFile:
    word2VecScores = pickle.load(scoresFile)
with open('svd_scores.pkl', 'rb') as scoresFile:
    svdScores = pickle.load(scoresFile)

# sort according to f1 scores
sortedW2vScores = sorted(word2VecScores.items(), key=lambda x: x[1]['F1'], reverse=True)
sortedSvdScores = sorted(svdScores.items(), key=lambda x: x[1]['F1'], reverse=True)

displayMetrics = 'Accuracy', 'F1', 'Precision', 'Recall'
tableHeaders = ('Context Size',) + displayMetrics

# One table per embedding model: Word2Vec first, then SVD.
for rankedScores in (sortedW2vScores, sortedSvdScores):
    # Walk displayMetrics (not the scores dict) so every value lands under its
    # own header regardless of the dict's key order; a metric missing from a
    # scores dict becomes None, which tabulate renders as an empty cell.
    tableRows = [
        (contextSize,) + tuple(scores.get(metric) for metric in displayMetrics)
        for contextSize, scores in rankedScores
    ]
    print(tabulate(tableRows, headers=tableHeaders, tablefmt="fancy_grid"))

╒════════════════╤════════════╤══════════╤═════════════╤══════════╕
│   Context Size │   Accuracy │       F1 │   Precision │   Recall │
╞════════════════╪════════════╪══════════╪═════════════╪══════════╡
│              3 │   0.911184 │ 0.911222 │    0.911921 │ 0.911184 │
├────────────────┼────────────┼──────────┼─────────────┼──────────┤
│              2 │   0.905921 │ 0.905408 │    0.905792 │ 0.905921 │
├────────────────┼────────────┼──────────┼─────────────┼──────────┤
│              4 │   0.861579 │ 0.861858 │    0.875226 │ 0.861579 │
╘════════════════╧════════════╧══════════╧═════════════╧══════════╛
╒════════════════╤════════════╤══════╤═════════════╤══════════╕
│   Context Size │   Accuracy │   F1 │   Precision │   Recall │
╞════════════════╪════════════╪══════╪═════════════╪══════════╡
│              2 │       0.25 │  0.1 │      0.0625 │     0.25 │
╘════════════════╧════════════╧══════╧═════════════╧══════════╛
