In [13]:
import pandas as pd
import re

def loaddata(dataFile): #load data line by line
    data = pd.read_csv(dataFile, sep='\n', header=None)
    data.columns = ['data']
    data[['lan', 'tweet']] = data['data'].str.split('\t', expand=True)
    data['tweet'] = data['tweet'].str.lower().map(
        lambda x: re.sub(r'\W+', '', x))  # remove regular char
    data['tweet'] = data['tweet'].map(
        lambda x: re.sub(r'\d+', '', x))  # remove digits
    return data['tweet'],data['lan']

X_train,y_train=loaddata('train.tsv')
X_valid,y_valid=loaddata('val.tsv')
X_test,y_test=loaddata('test.tsv')
print('train:',len(y_train))
print('valid:',len(y_valid))
print('test:',len(y_test))

train: 80175
valid: 11759
test: 14960


Preprocessing for Neural Network

In [0]:
import keras
from keras.preprocessing.text import Tokenizer
from torchtext import data
import numpy as np

# add start token and end token
def preprocess(tweets,labels):
  field = data.Field(init_token='<S>', eos_token='</S>', pad_token='</S>')
  tweets = field.pad(tweets)
  print('one tweet after padding:\n',tweets[0])
  t = keras.preprocessing.text.Tokenizer(num_words=64,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ', char_level = True, oov_token='<#>', document_count=0)
  t.fit_on_texts(tweets)
  x = t.texts_to_sequences(tweets)
  count = 10
  vocabulary, out_of_vocabulary =[],[] 
  i=1
  for w,c in t.word_counts.items():
    if c < count:
      out_of_vocabulary.append(w)
    else:
      vocabulary.append((w,i))
      i+=1
  print('\n number of vocabulary ids:\n',len(vocabulary))

  # after get the number, we can plug it into Tokenizer using num_words=64
  voc_table=[(k,v) for k,v in t.word_index.items()]
  print('\ntable of vocabulary ids:\n',voc_table[:64])
  # index for different languages
  l = keras.preprocessing.text.Tokenizer(split='\n')
  # fit the tokenizer on the documents
  l.fit_on_texts(labels)
  y = l.texts_to_sequences(labels)
  y = np.asarray(y)
  lan_table=[(k,v-1) for k,v in l.word_index.items()]
  for i in range(len(y)):
    y[i]=y[i]-1
  print('\n\ntable of language ids:\n',lan_table)
  return x,y

In [15]:
x,y = preprocess(X_train,y_train)

one tweet after padding:
 ['<S>', 'a', 'l', 'e', 'm', 'a', 'n', 'i', 'a', 'v', 's', 'a', 'r', 'g', 'e', 'n', 't', 'i', 'n', 'a', 'l', 'a', 't', 'e', 'r', 'c', 'e', 'r', 'a', 'e', 's', 'l', 'a', 'v', 'e', 'n', 'c', 'i', 'd', 'a', 'e', 'l', 'm', 'u', 'n', 'd', 'i', 'a', 'l', 'd', 'e', 'b', 'r', 'a', 's', 'i', 'l', 'c', 'i', 'e', 'r', 'r', 'a', 'e', 's', 't', 'e', 'd', 'o', 'm', 'i', 'n', 'g', 'o', 'c', 'o', 'n', 'u', 'n', 'a', 'r', 'e', 'e', 'd', 'i', 'c', 'i', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>', '</S>']

 number of vocabulary ids:
 64

table of vocabulary ids:
 [('<#>', 1), ('</s>', 2), ('e', 3), ('a', 4), ('o', 5), ('s', 6), ('i', 7), ('t', 8), ('n', 9), ('r', 10), ('l', 11), ('d', 12), ('u', 13), ('m', 14)

In [0]:
x_valid,y_valid = preprocess(X_valid,y_valid)
x_test,y_test = preprocess(X_test,y_test)

textCNN model including embedding layer, convolution layer, pooling layer and linear layer

In [0]:
#multiple filters
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class textCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embedding_dim)) for fs in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(len(filter_sizes)*n_filters, output_dim)
       
    def forward(self, x):
        
        embedded = self.embedding(x)
                
        #embedded = [batch size, length, emb dim]
        
        embedded = embedded.unsqueeze(1) # add channel dimension
        
        #embedded = [batch size, 1, length, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] #parallel computing
            
        #conv_n = [batch size, n_filters, length - filter_sizes[n]+1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim=1))

        #cat = [batch size, n_filters * len(filter_sizes)]
       
            
        return self.linear(cat)

training model

In [0]:
def train(model,x,y,batch_size):
    
    batch_num = int(len(x)/batch_size)
    
    train_x = Variable(torch.LongTensor(x))
    train_y = Variable(torch.LongTensor(y)).squeeze(1)
    
    for t in range(100):
      
      epoch_loss = 0
      epoch_acc = 0
      
      print(t)
      
      for i in range(batch_num):

          batch_x = train_x[i*batch_size:(i+1)*batch_size]
          batch_y = train_y[i*batch_size:(i+1)*batch_size]

          optimizer.zero_grad()
        
          predictions = model(batch_x)
          
          loss = loss_function(predictions, batch_y)
           
          acc = accuracy(predictions, batch_y)
        
          loss.backward()
        
          optimizer.step()

          epoch_loss += loss.item()
          epoch_acc += acc.item()
     
      print(epoch_acc/(batch_num*100))
          
    return epoch_loss/batch_num, epoch_acc / len(x) # return the average loss and the total accuracy

In [0]:
# def train(model,x,y,batch_size):
    
    
#     train_x = Variable(torch.LongTensor(x))
#     train_y = Variable(torch.LongTensor(y)).squeeze(1)
    
#     epoch = 10000
    
#     for t in range(epoch):
      
#       # randomly choose
      
#       idx = torch.randperm(len(x))[:batch_size]

#       batch_x = train_x[idx]
#       batch_y = train_y[idx]

#       optimizer.zero_grad()
        
#       predictions = model(batch_x)
          
#       loss = loss_function(predictions, batch_y)
           
#       acc = accuracy(predictions, batch_y)
        
#       loss.backward()
        
#       optimizer.step()
      
#       #print(acc.item()/batch_size)

      
#     return loss.item(), acc.item()/batch_size# return the average loss and the total accuracy

In [0]:
def accuracy(preds, y):
   
    max_preds = preds.argmax(dim=1, keepdim=True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    num = correct.sum()
    return num # return number of correct ones

In [0]:
def evaluate(model,x,y):
    
   
    valid_x = Variable(torch.LongTensor(x))
    valid_y = Variable(torch.LongTensor(y)).squeeze(1)
          
    predictions = model(valid_x)
    loss = loss_function(predictions, valid_y)
    acc = accuracy(predictions, valid_y)
        
    return loss.item(), acc.item() / len(x),predictions

try the model, it makes sense

In [0]:
N_EPOCHS = 20

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(cnn, optimizer, loss_function)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 1.311 | Train Acc: 55.44% |
| Epoch: 02 | Train Loss: 0.949 | Train Acc: 70.30% |
| Epoch: 03 | Train Loss: 0.855 | Train Acc: 73.36% |
| Epoch: 04 | Train Loss: 0.805 | Train Acc: 74.86% |
| Epoch: 05 | Train Loss: 0.781 | Train Acc: 75.50% |
| Epoch: 06 | Train Loss: 0.760 | Train Acc: 76.00% |
| Epoch: 07 | Train Loss: 0.744 | Train Acc: 76.66% |
| Epoch: 08 | Train Loss: 0.732 | Train Acc: 76.89% |
| Epoch: 09 | Train Loss: 0.723 | Train Acc: 77.06% |
| Epoch: 10 | Train Loss: 0.715 | Train Acc: 77.46% |
| Epoch: 11 | Train Loss: 0.710 | Train Acc: 77.54% |
| Epoch: 12 | Train Loss: 0.703 | Train Acc: 77.70% |
| Epoch: 13 | Train Loss: 0.698 | Train Acc: 78.02% |
| Epoch: 14 | Train Loss: 0.700 | Train Acc: 77.85% |
| Epoch: 15 | Train Loss: 0.698 | Train Acc: 77.86% |
| Epoch: 16 | Train Loss: 0.690 | Train Acc: 78.19% |
| Epoch: 17 | Train Loss: 0.688 | Train Acc: 78.25% |
| Epoch: 18 | Train Loss: 0.691 | Train Acc: 78.13% |
| Epoch: 19 | Train Loss: 0.

Use validation set to choose num of filters :

Actually, there are three type of filter sizes,
so if Filter_num=10, it means there are 10*3=30 filters

In [0]:
vocab_size = 64
char_dim = 10
output_dim = 9
filter_sizes = [2,3,4]
n_filters=[2,5,10]

for i in range(3):

    cnn = textCNN(vocab_size, char_dim, n_filters[i],filter_sizes, output_dim,dropout=0)
    optimizer = torch.optim.Adam(cnn.parameters())
    loss_function = nn.CrossEntropyLoss()
    
    print(f'Filter_num: {n_filters[i]:1}')
    
    train_loss, train_acc = train(cnn,x,y,batch_size=100)
    
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')
    
    valid_loss, valid_acc = evaluate(cnn,x_valid,y_valid)
    
    print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% ')

Filter_num: 2
Train Loss: 0.701 | Train Acc: 77.54% 
Valid Loss: 1.125 | Valid Acc: 72.75% 
Filter_num: 5
Train Loss: 0.568 | Train Acc: 81.71% 
Valid Loss: 0.844 | Valid Acc: 78.03% 
Filter_num: 10
Train Loss: 0.440 | Train Acc: 85.93% 
Valid Loss: 0.840 | Valid Acc: 81.31% 


Use more iterations to train the model

In [0]:
vocab_size = 64
char_dim = 10
output_dim = 9
filter_sizes = [2,3,4]
n_filters = 10

cnn = textCNN(vocab_size, char_dim, n_filters,filter_sizes, output_dim,dropout=0)
optimizer = torch.optim.Adam(cnn.parameters())
loss_function = nn.CrossEntropyLoss()
    
train_loss, train_acc = train(cnn,x,y,batch_size=100)
    
print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')
    
valid_loss, valid_acc = evaluate(cnn,x_valid,y_valid)
    
print(f'Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% ')

Perplexity to evaluate the model

In [67]:
valid_loss, valid_acc, preds = evaluate(cnn,x_valid,y_valid)
print('Cross Entropy:',valid_loss)

pxs = torch.nn.functional.softmax(Variable(preds), dim=1).data
p_x, indices = torch.max(pxs, 1)


H_x = torch.sum(p_x)/len(preds)

print('Entropy:',H_x.data)

Cross Entropy: 0.8650198578834534
Entropy: tensor(0.8658)


Use the test data set

In [0]:
test_loss, test_acc = evaluate(cnn,x_test,y_test)
    
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% ')

Test Loss: 0.944 | Test Acc: 73.15% 


In [0]:
import collections

import numpy as np


def Metrics(preds, labs, show=True):
#   """Print precision, recall and F1 for each language.
#   Assumes a single language per example, i.e. no code switching.
#   Args:
#     preds: list of predictions
#     labs: list of labels
#     show: flag to toggle printing
#   """
    all_langs = set(preds + labs)
    preds = np.array(preds)
    labs = np.array(labs)
    label_totals = collections.Counter(labs)
    pred_totals = collections.Counter(preds)
    confusion_matrix = collections.Counter(zip(preds, labs))
    num_correct = 0
    num_lan = 9
    for lang in range(num_lan):
        num_correct += confusion_matrix[(lang, lang)]
    acc = num_correct / float(len(preds))
    print ('accuracy = {0:.3f}'.format(acc))
    if show:
        print (' Lang     Prec.   Rec.   F1')
        print ('------------------------------')
    scores = []
    fmt_str = '  {0:6}  {1:6.2f} {2:6.2f} {3:6.2f}'
    for lang in range(num_lan):
        
        total = max(1.0, pred_totals[lang])
        precision = 100.0 * confusion_matrix[(lang, lang)] / total
        
        total = max(1.0, label_totals[lang])
        recall = 100.0 * confusion_matrix[(lang, lang)] / total
        if precision + recall == 0.0:
            f1 = 0.0
        else:
            f1 = 2.0 * precision * recall / (precision + recall)
        scores.append([precision, recall, f1])
        if show:
            print (fmt_str.format(lang, precision, recall, f1))
        totals = np.array(scores).mean(axis=0)
        if show:
            print('------------------------------')
    print(fmt_str.format('Total:', totals[0], totals[1], totals[2]))
    return totals[2]

class MovingAvg(object):
  
    def __init__(self, p):
        self.val = None
        self.p = p

    def Update(self, v):
        if self.val is None:
            self.val = v
            return v
        self.val = self.p * self.val + (1.0 - self.p) * v
        return self.val

In [0]:
test_x = Variable(torch.LongTensor(x_test))
test_y = Variable(torch.LongTensor(y_test)).squeeze(1)
          
output = cnn(test_x)
preds = output.argmax(dim=1, keepdim=True).squeeze(1)
    
total = Metrics(preds,test_y)

accuracy = 0.731
 Lang     Prec.   Rec.   F1
------------------------------
       0   79.81  83.65  81.68
------------------------------
       1   72.80  78.91  75.73
------------------------------
       2   77.63  74.96  76.27
------------------------------
       3   44.50  40.27  42.28
------------------------------
       4   46.94  38.62  42.37
------------------------------
       5   46.51  26.91  34.09
------------------------------
       6   77.02  57.14  65.61
------------------------------
       7    0.62   0.51   0.56
------------------------------
       8    0.00   0.00   0.00
------------------------------
  Total:   49.54  44.55  46.51


In [0]:
from collections import Counter
from collections import OrderedDict
import operator
y_cnt=Counter()
for word in y_train:
    y_cnt[word]+=1
y_cnt=dict(y_cnt)
y_freq=OrderedDict(sorted(y_cnt.items(), key=lambda t: t[1],reverse=True))
print(y_freq)

OrderedDict([('en', 25979), ('es', 24712), ('pt', 18466), ('fr', 3704), ('ca', 2839), ('it', 1235), ('de', 1157), ('eu', 1051), ('gl', 1032)])


Recall that table of language ids:

 [('en', 0), ('es', 1), ('pt', 2), ('fr', 3), ('ca', 4), ('it', 5), ('de', 6), ('eu', 7), ('gl', 8)]
 
 It seems that the language with larger frequency will have larger F1