<a href="https://colab.research.google.com/github/andys0tc/ML2018/blob/master/Classification_HNW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import spacy
import nltk.data
import glob

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator,LabelField
from torchtext import data
from collections import Counter
import spacy
import numpy as np
import re
import random
import math
import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.autograd import Variable

In [0]:
from google.colab import files
uploaded = files.upload()

Saving testset_classification.csv to testset_classification (5).csv
Saving trainset_classification.csv to trainset_classification (5).csv


In [0]:
nlp = spacy.load('en')

TRAIN_SIZE = 0.8
TEST_SIZE = 0.2

In [0]:
def tokenize_input(text):
    return [tok.text for tok in nlp.tokenizer(text)]

In [0]:
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.int64)

In [0]:
class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)


In [0]:
#PATH = '/home/julia/PycharmProjects/seq2seq/data/'
train = pd.read_csv('trainset_classification (5).csv')
validation = pd.read_csv('testset_classification (5).csv')

In [0]:
validation.head()

Unnamed: 0,text,target
0,Do tell me what it is.,2
1,I know why I love him.,2
2,"Yes, Frazer said.",0
3,The bottles came.,0
4,Mrs. Erlynne is coming here to-night.,2


In [0]:
def index_to_one_hot(label):
  sample_nums=label.size()[0]
  one_hot=torch.tensor([0,1,2])
  one_hot=one_hot.view([1,3]).expand([sample_nums,3])
  label=label.view([sample_nums,1]).expand([sample_nums,3])
  one_hot=(label.float()==one_hot).float()
  return one_hot

In [0]:
fields = [('text',TEXT), ('label',LABEL)]

train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train, val_df=validation)

In [0]:
print(vars(train_ds[0]))

# Check the type 
print(type(train_ds[0]))

{'text': ['56', 'Victory', 'of', 'Knowledge', 'over', 'Radical', 'Evil', '.', '—'], 'label': 1}
<class 'torchtext.data.example.Example'>


In [0]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.200d',
                 unk_init = torch.Tensor.zero_)

In [0]:
LABEL.build_vocab(train_ds)

In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [0]:
device

device(type='cpu')

In [0]:
# Hyperparameters
num_epochs = 10
learning_rate = 0.001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
N_CLASSES = 3
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # padding

In [0]:
class LSTM_net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, num_classes,
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        # text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        # embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output = [sent len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]
        
        # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        # and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        output = self.fc1(hidden)
        output = self.dropout(self.fc2(output))
                
        #hidden = [batch size, hid dim * num directions]
            
        return output

In [0]:

model = LSTM_net(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, N_CLASSES,
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [0]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([25002, 200])


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1765,  0.2921, -0.0021,  ..., -0.2077, -0.2319, -0.1081],
        ...,
        [-0.5171,  0.4141,  0.3585,  ...,  0.4335,  0.4331, -0.2565],
        [-0.2019,  0.1898,  0.6603,  ..., -0.1500, -0.3105,  0.3951],
        [-0.1017, -0.6736, -0.1305,  ..., -0.5237,  0.0934, -0.6144]])

In [0]:
#  to initiaise padded to zeros
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1765,  0.2921, -0.0021,  ..., -0.2077, -0.2319, -0.1081],
        ...,
        [-0.5171,  0.4141,  0.3585,  ...,  0.4335,  0.4331, -0.2565],
        [-0.2019,  0.1898,  0.6603,  ..., -0.1500, -0.3105,  0.3951],
        [-0.1017, -0.6736, -0.1305,  ..., -0.5237,  0.0934, -0.6144]])


In [0]:
model.to(device) #CNN to GPU


# Loss and optimizer
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [0]:
"""def binary_accuracy(preds, y):
    
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc"""

'def binary_accuracy(preds, y):\n    \n    #round predictions to the closest integer\n    rounded_preds = torch.round(torch.sigmoid(preds))\n    correct = (rounded_preds == y).float() #convert into float for division \n    acc = correct.sum() / len(correct)\n    return acc'

In [0]:
def compute_accuracy(preds, y):
  p_top1=preds.topk(1,dim=1)[1]
  p_top1
  y_top1=y.topk(1,dim=1)[1]
  y_top1
  correct=(p_top1==y_top1).float().sum()
  correct
  label_nums=preds.size()[0]
  return correct,label_nums



In [0]:
def train(model, iterator):
    print("Iniciando entrenamiento")
    print(type(iterator))
    epoch_loss = 0
    epoch_acc = 0
    epoch_cor = 0
    print(len(iterator))
    
    model.train()
    
    for batch in iterator:
     

        text, text_lengths = batch.text
        optimizer.zero_grad()
        
        predictions = model(text, text_lengths).squeeze(1)
        label = torch.tensor(batch.label, dtype=torch.long, device=device)
        #print(label.shape)

        loss = criterion(predictions, batch.label)
        acc_torch,acc = compute_accuracy(predictions, index_to_one_hot(label))
        #acc = binary_accuracy(predictions, label)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_cor += acc_torch
        epoch_acc+= acc
        #print("Process...")
       

    return epoch_loss / len(iterator), epoch_cor / epoch_acc


In [0]:
"""def evaluate(model, iterator):
   
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            #acc = binary_accuracy(predictions, batch.label)
            
            epoch_acc += acc.item()
        
    return epoch_acc / len(iterator)"""

'def evaluate(model, iterator):\n   \n    epoch_acc = 0\n    model.eval()\n    \n    with torch.no_grad():\n        for batch in iterator:\n            text, text_lengths = batch.text\n            predictions = model(text, text_lengths).squeeze(1)\n            #acc = binary_accuracy(predictions, batch.label)\n            \n            epoch_acc += acc.item()\n        \n    return epoch_acc / len(iterator)'

In [0]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_cor = 0
  epoch_label=0
  model.eval()
  with torch.no_grad():
    for batch in iterator:
      predictions = model(batch.ReviewText).squeeze(1)
      loss = criterion(predictions, batch.Overall)
      correct,label_nums = compute_accuracy(predictions, index_to_one_hot(batch.Overall))
      epoch_loss += loss.item()
      epoch_cor +=correct
      epoch_label+=label_nums
  return epoch_loss / len(iterator), epoch_cor/epoch_label

SyntaxError: ignored

In [0]:
t = time.time()
loss=[]
acc=[]
val_acc=[]

for epoch in range(num_epochs):
    
    train_loss, train_acc = train(model, train_iterator)
    #valid_acc = evaluate(model, valid_iterator)
    #print(f'\tTrain Loss: {train_loss:.3f} ')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    #print(f'\t Val. Acc: {valid_acc*100:.2f}%')
    
    loss.append(train_loss)
    #acc.append(train_acc)
    #val_acc.append(valid_acc)
    
print(f'time:{time.time()-t:.3f}')

Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812




	Train Loss: 0.451 | Train Acc: 81.32%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.271 | Train Acc: 88.75%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.208 | Train Acc: 91.09%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.176 | Train Acc: 92.32%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.158 | Train Acc: 92.89%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.141 | Train Acc: 93.35%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.130 | Train Acc: 93.82%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.122 | Train Acc: 94.02%
Iniciando entrenamiento
<class 'torchtext.data.iterator.BucketIterator'>
812
	Train Loss: 0.110 | Train Acc: 94.66%
Iniciando entrenamiento
<class 't

In [0]:
x = torch.tensor(1000)
x/10

tensor(100)

In [0]:
torch.save(model.state_dict(), 'model_classification.pt')