In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
import pandas as pd
import re
import string
from langid.langid import LanguageIdentifier, model
import time
import numpy as np
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
def load_data_from_path(folder_path):
    examples = []
    for label in os.listdir(folder_path):
        full_path = os.path.join(folder_path, label)
        for file_name in os.listdir(full_path):
            file_path = os.path.join(full_path, file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            sentence = " ".join(lines)
            if label == "neg":
                label = 0
            if label == "pos":
                label = 1
            data = {
                'sentence': sentence,
                'label': label
            }
            examples.append(data)
    return pd.DataFrame(examples)

In [4]:
folder_paths = {
    'train': './data_train/train',
    'valid': './data_train/test',
    'test': './data_test/test'
}

train_df = load_data_from_path(folder_paths['train'])
valid_df = load_data_from_path(folder_paths['valid'])
test_df = load_data_from_path(folder_paths['test'])

In [4]:
train_df.head()

Unnamed: 0,sentence,label
0,Mua có mỗi Bingsu thập_cẩm 45k mà mình f đợi h...,0
1,Thứ 6 nào ta cùng quẩy 💣 💣 💣\n Vuvuzela beer c...,0
2,"Mình đi với nhóm , tổng_cộng 4 người ăn chỉ có...",0
3,"nhân_viên phục_vụ không mấy tận_tình , đồ_ăn r...",0
4,"Vào đây thì hết bàn , nhưng mình vẫn ngồi đợi ...",0


In [5]:
def identify_vn(df):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    not_vi_idx = set()
    THRESHOLD = 0.9
    for idx, row in df.iterrows():
        score = identifier.classify(row["sentence"])
        if score[0] != "vi" or (score[0] == "vi" and score[1] <= THRESHOLD):
            not_vi_idx.add(idx)
    vi_df = df[~df.index.isin(not_vi_idx)]
    not_vi_df = df[df.index.isin(not_vi_idx)]
    return vi_df, not_vi_df

In [6]:
def preprocess_text(text):

    url_pattern = re.compile(r'https?://\s+\wwww\.\s+')
    text = url_pattern.sub(r" ", text)

    html_pattern = re.compile(r'<[^<>]+>')
    text = html_pattern.sub(" ", text)

    replace_chars = list(string.punctuation + string.digits)
    for char in replace_chars:
        text = text.replace(char, " ")
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U0001F1F2-\U0001F1F4"  # Macau flag
                               u"\U0001F1E6-\U0001F1FF"  # flags
                               u"\U0001F600-\U0001F64F"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U0001F1F2"
                               u"\U0001F1F4"
                               u"\U0001F620"
                               u"\u200d"
                               u"\u2640-\u2642"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r" ", text)

    text = " ".join(text.split())

    return text.lower()

In [7]:
train_df_vi, train_df_other = identify_vn(train_df)

train_df_vi.loc[:, 'preprocess_sentence'] = train_df_vi['sentence'].apply(
    preprocess_text)
valid_df_vi, valid_df_other = identify_vn(valid_df)
valid_df_vi.loc[:, 'preprocess_sentence'] = valid_df_vi['sentence'].apply(
    preprocess_text)
test_df_vi, test_df_other = identify_vn(test_df)
test_df_vi.loc[:, 'preprocess_sentence'] = test_df_vi['sentence'].apply(
    preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_vi.loc[:, 'preprocess_sentence'] = train_df_vi['sentence'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df_vi.loc[:, 'preprocess_sentence'] = valid_df_vi['sentence'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_vi.loc[:, 'preprocess_sentence'] = test_df_vi['

In [8]:
train_df_vi[:5]

Unnamed: 0,sentence,label,preprocess_sentence
0,Mua có mỗi Bingsu thập_cẩm 45k mà mình f đợi h...,0,mua có mỗi bingsu thập cẩm k mà mình f đợi hơn...
1,Thứ 6 nào ta cùng quẩy 💣 💣 💣\n Vuvuzela beer c...,0,thứ nào ta cùng quẩy vuvuzela beer club chung ...
2,"Mình đi với nhóm , tổng_cộng 4 người ăn chỉ có...",0,mình đi với nhóm tổng cộng người ăn chỉ có khô...
3,"nhân_viên phục_vụ không mấy tận_tình , đồ_ăn r...",0,nhân viên phục vụ không mấy tận tình đồ ăn ra ...
4,"Vào đây thì hết bàn , nhưng mình vẫn ngồi đợi ...",0,vào đây thì hết bàn nhưng mình vẫn ngồi đợi bì...


In [9]:
sum([len(x.split()) for x in train_df_vi['preprocess_sentence']]) / \
    len(train_df_vi['preprocess_sentence'])

96.07200026903416

In [8]:
train_df_has = [torch.tensor([hash(word) % 15000 for word in review.split()])
                for review in train_df_vi['preprocess_sentence']]
valid_df_has = [torch.tensor([hash(word) % 15000 for word in review.split()])
                for review in valid_df_vi['preprocess_sentence']]
test_df_has = [torch.tensor([hash(word) % 15000 for word in review.split()])
               for review in test_df_vi['preprocess_sentence']]

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 128
train_dataloader = DataLoader(
    train_df,
    batch_size=batch_size,
    shuffle=True,

)
valid_dataloader = DataLoader(
    valid_df,
    batch_size=batch_size,
    shuffle=False,

)

In [10]:
from torch.nn.utils.rnn import pad_sequence
MAX_LENGTH = 90
train_df_pad = pad_sequence(
    train_df_has, batch_first=True, padding_value=0).narrow(1, 0, MAX_LENGTH)
valid_df_pad = pad_sequence(
    valid_df_has, batch_first=True, padding_value=0).narrow(1, 0, MAX_LENGTH)
test_df_pad = pad_sequence(
    test_df_has, batch_first=True, padding_value=0).narrow(1, 0, MAX_LENGTH)

print(train_df_pad[:5])

tensor([[14028,  5557, 11241,  5681,  2405,  5207,  4925,  9507,  2430,   640,
         10430, 12785,  2102, 10244, 14010,  3303,  1200,  5557, 12121,  2463,
          8304,  6629,  1943,  9921,  9224, 11232, 10082,  6664,  2430,  4925,
         14984,  5557,  7108,  4925,  2463,  7307, 13356,  7094,   891,  2943,
          7492,  3303,  2817,  4183,  3032, 10244, 11106,  7965,  8028,   662,
         13018,  9241,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [10049, 14004,   641,  1348, 14378,  5973, 14061, 14799, 12699,  7582,
          4596,  2578,  5448,  1560, 10774, 10774,  4183, 11055,  7307, 12916,
         14534, 10204,    35,  5794,  8421,  6840,  9493,   820,  9983,  7323,
          7094, 14949,  3029,   820,  9983,  7307, 

In [12]:
train_label = torch.tensor(train_df_vi['label'].values)
valid_label = torch.tensor(valid_df_vi['label'].values)
test_label = torch.tensor(test_df_vi['label'].values)
# get the 5 items from train_label

In [13]:
train_data = torch.utils.data.TensorDataset(train_df_pad, train_label)
valid_data = torch.utils.data.TensorDataset(valid_df_pad, valid_label)
test_data = torch.utils.data.TensorDataset(test_df_pad, test_label)
# move train_data to device


train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_dataloader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

In [22]:
class TextClassificationModel(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, seq_len):
        super(TextClassificationModel, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.ft = torch.nn.Linear(MAX_LENGTH * embed_dim, MAX_LENGTH * embed_dim)
        self.elu = torch.nn.ELU()
        self.ft2 = torch.nn.Linear(MAX_LENGTH * embed_dim, num_class)
        # self.out = torch.nn.Sigmoid()
        self.init_weights()
        
        self.dropout = torch.nn.Dropout(0.2)

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.ft.weight.data.uniform_(-initrange, initrange)
        self.ft.bias.data.zero_()
        self.ft2.weight.data.uniform_(-initrange, initrange)
        self.ft2.bias.data.zero_()

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        x = torch.nn.Flatten()(embedded)
        #x = self.ft(x)
        #x = self.elu(x)
        x = self.ft2(x)
        x = self.dropout(x)
        #x = self.simoid(x)
        # x = self.out(x)
        return x

In [27]:
vocab_size = 15000
embed_dim = 100
num_class = 2
loss = torch.nn.CrossEntropyLoss()
model = TextClassificationModel(
    vocab_size, embed_dim, num_class, MAX_LENGTH).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [16]:
def train(model, optimizer, loss, train_dataloader, epoch=0, log_interval=50):
    model.train()
    total_acc, total_count = 0, 0
    losses = []
    start_time = time.time()

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(inputs)
        
        # compute loss
        loss_value = loss(predictions, labels)
        losses.append(loss_value.item())

        # backward
        loss_value.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        
        total_acc += (predictions.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc, epoch_loss

In [17]:
def evaluate(model, criterion, valid_dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    losses = []

    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(valid_dataloader):
            inputs, labels = inputs.to(device), labels.to(device)
            predictions = model(inputs)
            
            loss = criterion(predictions, labels)
            losses.append(loss.item())
           
            total_acc += (predictions.argmax(1) == labels).sum().item()
            total_count += labels.size(0)

    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc, epoch_loss

In [18]:
for epoch in range(1, 100):
    epoch_start_time = time.time()
    train_acc, train_loss = train(
        model, optimizer, loss, train_dataloader, epoch)
    valid_acc, valid_loss = evaluate(model, loss, valid_dataloader)
    print("-" * 80)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | valid accuracy {:8.3f} | valid loss {:8.3f}".format(
            epoch, time.time() - epoch_start_time, valid_acc, valid_loss
        )
    )
    print("-" * 80)

| epoch   1 |    50/  233 batches | accuracy    0.577
| epoch   1 |   100/  233 batches | accuracy    0.644
| epoch   1 |   150/  233 batches | accuracy    0.649
| epoch   1 |   200/  233 batches | accuracy    0.671
--------------------------------------------------------------------------------
| end of epoch   1 | time: 10.16s | valid accuracy    0.758 | valid loss    0.543
--------------------------------------------------------------------------------
| epoch   2 |    50/  233 batches | accuracy    0.830
| epoch   2 |   100/  233 batches | accuracy    0.830
| epoch   2 |   150/  233 batches | accuracy    0.827
| epoch   2 |   200/  233 batches | accuracy    0.840
--------------------------------------------------------------------------------
| end of epoch   2 | time:  9.04s | valid accuracy    0.757 | valid loss    0.654
--------------------------------------------------------------------------------
| epoch   3 |    50/  233 batches | accuracy    0.884
| epoch   3 |   100/  233 

In [19]:
# normalize the data using torch.nn.functional.normalize
train_data_normalized = torch.nn.functional.normalize(train_data.tensors[0].float(), dim=1)
valid_data_normalized = torch.nn.functional.normalize(valid_data.tensors[0].float(), dim=1)

In [20]:
train_data_normalized = torch.utils.data.TensorDataset(train_df_pad, train_label)
valid_data_normalized = torch.utils.data.TensorDataset(valid_df_pad, valid_label)
#test_data_normalized = torch.utils.data.TensorDataset(test_df_pad, test_label)

train_dataloader_nor = DataLoader(train_data_normalized, shuffle=True, batch_size=batch_size)
valid_dataloader_nor = DataLoader(valid_data_normalized, shuffle=False, batch_size=batch_size)


In [29]:
for epoch in range(1, 50):
    epoch_start_time = time.time()
    train_acc, train_loss = train(
        model, optimizer, loss, train_dataloader_nor, epoch)
    valid_acc, valid_loss = evaluate(model, loss, valid_dataloader_nor)
    print("-" * 80)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | valid accuracy {:8.3f} | valid loss {:8.3f}".format(
            epoch, time.time() - epoch_start_time, valid_acc, valid_loss
        )
    )
    print("-" * 80)

| epoch   1 |    50/  233 batches | accuracy    0.982
| epoch   1 |   100/  233 batches | accuracy    0.978
| epoch   1 |   150/  233 batches | accuracy    0.981
| epoch   1 |   200/  233 batches | accuracy    0.981
--------------------------------------------------------------------------------
| end of epoch   1 | time:  1.92s | valid accuracy    0.789 | valid loss    1.772
--------------------------------------------------------------------------------
| epoch   2 |    50/  233 batches | accuracy    0.980
| epoch   2 |   100/  233 batches | accuracy    0.981
| epoch   2 |   150/  233 batches | accuracy    0.981
| epoch   2 |   200/  233 batches | accuracy    0.981
--------------------------------------------------------------------------------
| end of epoch   2 | time:  2.03s | valid accuracy    0.791 | valid loss    1.774
--------------------------------------------------------------------------------
| epoch   3 |    50/  233 batches | accuracy    0.983
| epoch   3 |   100/  233 