<a href="https://colab.research.google.com/github/amalabderrahmani/Colab_Project/blob/main/Text_Classification_using_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
#!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2

In [2]:
!python --version

Python 3.7.13


In [3]:
#pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

In [4]:
#!pip install -U torchtext==0.9.0

# Preprocessing data

In [5]:
import torch
from torchtext.legacy import data, datasets

import random

In [6]:
seed = 966
torch.manual_seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


**Fields**

[Check documentation](https://pytorch.org/text/_modules/torchtext/data/field.html)

In [7]:
# define fields
TEXT= data.Field(tokenize='spacy', lower= True)
LABEL= data.LabelField()


**Text REtrieval Conference (TREC) Question Classification Dataset**

*Data Examples and Six Categories:*

| Text | Label | Category |
| --- | --- | --- |
|CNN is the abbreviation for what ?|ABBR| ABBREVIATION |
| What is the date of Boxing Day ? | NUM |NUMERIC|
|Who discovered electricity ?| HUM |HUMAN|
|What 's the colored part of the eye called ?|ENTY|ENTITY|
|Why do horseshoes bring luck ?|DESC|DESCRIPTION|
|What is California 's capital ?|LOC|LOCATION|

In [8]:
from torchtext.legacy.datasets import TREC

In [9]:
TREC

torchtext.legacy.datasets.trec.TREC

In [10]:
train, test = datasets.TREC.splits(TEXT, LABEL, root='http://cogcomp.org/Data/QA/QC', train='train_5500.label', test='TREC_10.label')
train, val = train.split(random_state=random.seed(seed))

In [11]:
vars(train[-1])

{'label': 'ENTY', 'text': ['how', 'do', 'you', 'say', '2', 'in', 'latin', '?']}

In [12]:
# build vocab
TEXT.build_vocab(train, min_freq=2)
LABEL.build_vocab(train)

In [13]:
print(LABEL.vocab.stoi)

defaultdict(None, {'ENTY': 0, 'HUM': 1, 'DESC': 2, 'NUM': 3, 'LOC': 4, 'ABBR': 5})


In [14]:
print("Vocabulary size of TEXT:",len(TEXT.vocab.stoi))
print("Vocabulary size of LABEL:",len(LABEL.vocab.stoi))

Vocabulary size of TEXT: 2641
Vocabulary size of LABEL: 6


In [15]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, val, test),
    batch_size = 64,
    sort_key=lambda x: len(x.text), 
    device=device
)

# Building the CNN model

In [16]:
import torch.nn as nn
import torch.nn.functional as F

In [17]:
class CNN(nn.Module):
  def __init__(self, vocabulary_size, embedding_size, 
               kernels_number, kernel_sizes, output_size, dropout_rate):
    super().__init__()
    self.embedding = nn.Embedding(vocabulary_size, embedding_size)
    self.convolution_layers = nn.ModuleList([nn.Conv2d(in_channels=1,
                                                       out_channels=kernels_number,
                                                       kernel_size=(k, embedding_size)) for k in kernel_sizes])
    self.dropout = nn.Dropout(dropout_rate)
    self.fully_connected = nn.Linear(len(kernel_sizes) * kernels_number, output_size)


  def forward(self, text):
    text = text.permute(1,0)
    input_embeddings = self.embedding(text)
    input_embeddings = input_embeddings.unsqueeze(1)
    conved = [F.relu(convolution_layer(input_embeddings)).squeeze(3) for convolution_layer in self.convolution_layers]
    pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved ]
    concat = self.dropout(torch.cat(pooled, dim = 1 ))
    final_output = self.fully_connected(concat)



    

    return final_output

In [18]:
vocabulary_size = 2679
embedding_size = 100
kernels_number = 100
kernel_sizes = [2, 3, 4]
output_size = 6
dropout_rate = 0.3

In [19]:
model = CNN(vocabulary_size, embedding_size, kernels_number, kernel_sizes, output_size, dropout_rate)

In [20]:
print(model)

CNN(
  (embedding): Embedding(2679, 100)
  (convolution_layers): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (fully_connected): Linear(in_features=300, out_features=6, bias=True)
)


In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

CNN(
  (embedding): Embedding(2679, 100)
  (convolution_layers): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (fully_connected): Linear(in_features=300, out_features=6, bias=True)
)

# Training

In [22]:
import torch.optim as optim
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters())

In [23]:
def accuracy(predictions, actual_label):
    max_predictions = predictions.argmax(dim = 1, keepdim = True, )
    correct_predictions = max_predictions.squeeze(1).eq(actual_label)
    accuracy = correct_predictions.sum() / torch.cuda.FloatTensor([actual_label.shape[0]])
    return accuracy

In [24]:
def train(model, iterator, optimizer, criterion):

    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for batch in iterator:
        # optimizer
        optimizer.zero_grad()
        
        # predictions
        predictions = model(batch.text)

        # loss
        loss = criterion(predictions, batch.label)
        
        # accuracy
        acc = accuracy(predictions, batch.label)

        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [25]:
def evaluate(model, iterator, criterion):

    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text)
            
            loss = criterion(predictions, batch.label)
            
            acc = accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [26]:
number_of_epochs = 20

best_acc = float('-inf')

for epoch in range(number_of_epochs):
    
    # Write the code here
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # Write the code here
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    if valid_acc > best_acc:
        # Write the code here
        best_acc = valid_acc
        torch.save(model.state_dict(), 'trec.pt')
    
    print(f'Epoch {epoch+1} ')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Validation Loss: {valid_loss:.3f} |  Validation Acc: {valid_acc*100:.2f}%')

Epoch 1 
	Train Loss: 1.318 | Train Acc: 47.93%
	 Validation Loss: 0.977 |  Validation Acc: 62.11%
Epoch 2 
	Train Loss: 0.797 | Train Acc: 71.49%
	 Validation Loss: 0.759 |  Validation Acc: 72.44%
Epoch 3 
	Train Loss: 0.589 | Train Acc: 80.09%
	 Validation Loss: 0.664 |  Validation Acc: 74.09%
Epoch 4 
	Train Loss: 0.451 | Train Acc: 85.73%
	 Validation Loss: 0.638 |  Validation Acc: 75.67%
Epoch 5 
	Train Loss: 0.359 | Train Acc: 89.02%
	 Validation Loss: 0.604 |  Validation Acc: 77.17%
Epoch 6 
	Train Loss: 0.276 | Train Acc: 92.09%
	 Validation Loss: 0.551 |  Validation Acc: 79.82%
Epoch 7 
	Train Loss: 0.209 | Train Acc: 94.48%
	 Validation Loss: 0.541 |  Validation Acc: 80.01%
Epoch 8 
	Train Loss: 0.167 | Train Acc: 95.53%
	 Validation Loss: 0.565 |  Validation Acc: 79.25%
Epoch 9 
	Train Loss: 0.127 | Train Acc: 96.94%
	 Validation Loss: 0.525 |  Validation Acc: 81.62%
Epoch 10 
	Train Loss: 0.104 | Train Acc: 97.76%
	 Validation Loss: 0.533 |  Validation Acc: 81.92%
Epoch 11 

In [27]:
model.load_state_dict(torch.load('trec.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.392 | Test Acc: 87.38%
