In [1]:
# Mount Google Drive at /content/drive so the dataset files stored there can be
# read by the cells below. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import re

def extract_text_and_labels(filepath):
    """
    Extracts (text, labels) pairs from a VLSP2018 ABSA .txt file.

    The file is scanned line by line. Blank lines are ignored. A line that
    starts with '{' and contains '}' is treated as a label line and closes the
    current review; every other non-blank line seen since the previous label
    line (including review-id lines such as ``#1``) is part of the review text
    and is joined with single spaces.

    Args:
        filepath: Path to the UTF-8 encoded data file (with or without a BOM).

    Returns:
        A list of tuples: [(text, [label1, label2, ...]), ...] where each label
        is the raw content found between one pair of braces.
        Text lines that are not followed by a label line are dropped.
    """
    label_pattern = re.compile(r'\{([^}]+)\}')
    data = []
    current_text_lines = []

    # 'utf-8-sig' transparently drops a leading byte-order mark. With plain
    # 'utf-8' the BOM (U+FEFF) is kept, is not removed by str.strip(), and ends
    # up glued to the first review of the file.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('{') and '}' in line:
                # Label line: emit the review accumulated so far.
                labels = label_pattern.findall(line)
                full_text = ' '.join(current_text_lines).strip()
                data.append((full_text, labels))
                current_text_lines = []
            else:
                current_text_lines.append(line)

    return data


def load_text_label_datasets(base_path):
    """
    Reads every VLSP2018 domain/split file found directly under ``base_path``.

    Files are expected to be named ``VLSP2018-SA-<Domain>-<split>.txt``. A file
    that does not exist is reported on stdout and its split key is simply left
    out of the result.

    Returns a dictionary shaped like:
    {
        'restaurant': {'train': [...], 'dev': [...], 'test': [...]},
        'hotel': {'train': [...], 'dev': [...], 'test': [...]}
    }
    where each list holds the (text, labels) tuples produced by
    ``extract_text_and_labels``.
    """
    domains = ('restaurant', 'hotel')
    dataset = {name: {} for name in domains}

    for domain in domains:
        splits_for_domain = dataset[domain]
        for split in ('train', 'dev', 'test'):
            filename = f"VLSP2018-SA-{domain.capitalize()}-{split}.txt"
            filepath = os.path.join(base_path, filename)
            if not os.path.exists(filepath):
                print(f"Missing file: {filepath}")
                continue
            splits_for_domain[split] = extract_text_and_labels(filepath)

    return dataset

base_folder = "/content/drive/MyDrive/NLP Project"
full_data = load_text_label_datasets(base_folder)

# Preview the first (text, labels) pair of each domain/split that was loaded;
# splits that are missing or empty are reported as "No data".
for domain in ['hotel', 'restaurant']:
    for split in ['train', 'dev', 'test']:
        if not full_data[domain].get(split):
            print(f"\n[{domain.capitalize()} {split.capitalize()}] No data")
            continue

        text, labels = full_data[domain][split][0]

        # Long reviews are cut to 300 characters and marked with an ellipsis.
        preview = text[:300]
        if len(text) > 300:
            preview = preview + "..."

        print(f"\n[{domain.capitalize()} {split.capitalize()}]")
        print("Text:", preview)
        print("Labels:", labels)



[Hotel Train]
Text: ﻿#1 Rộng rãi KS mới nhưng rất vắng. Các dịch vụ chất lượng chưa cao và thiếu.
Labels: ['HOTEL#DESIGN&FEATURES, positive', 'HOTEL#GENERAL, negative']

[Hotel Dev]
Text: ﻿#1 Chưa có thang máy. Chưa chấp nhận thanh toán bằng thẻ. Địa điểm dễ tìm, bày trí bằng tre nứa rất mát mẻ, bạn lễ tân nhiệt tình, niềm nở, thân thiện, tốt bụng cực kỳ. Tôi đặt phòng 1 giường đôi nhưng biết tôi đi cùng 2 con nhỏ nên ks đã chủ động chuẩn bị thêm 1 chiếc giường tầng cho 2 bé. Phòng r...
Labels: ['FACILITIES#DESIGN&FEATURES, negative', 'SERVICE#GENERAL, positive', 'LOCATION#GENERAL, positive', 'HOTEL#DESIGN&FEATURES, positive', 'HOTEL#COMFORT, positive', 'ROOMS#DESIGN&FEATURES, positive', 'ROOM_AMENITIES#QUALITY, positive', 'ROOM_AMENITIES#CLEANLINESS, positive', 'HOTEL#GENERAL, positive']

[Hotel Test]
Text: ﻿#1 Ga giường không sạch, nhân viên quên dọn phòng một ngày.
Labels: ['ROOM_AMENITIES#CLEANLINESS, negative', 'SERVICE#GENERAL, negative']

[Restaurant Train]
Text: ﻿#1 _ Ản

In [3]:
import os
import re
import string
!pip install pyvi
from pyvi import ViTokenizer

# -------------------- Normalization Functions --------------------

def normalize_money(sent):
    """
    Replace shorthand money amounts with the placeholder word 'giá'.

    An amount is a run of digits (optionally containing '.' or ',') immediately
    followed by one of the suffix letters k, m or b, e.g. '50k' or '1.5m'.
    The trailing word boundary keeps the suffix from matching the first letter
    of a longer token, so units such as '5km' or '2kg' are left untouched
    instead of being turned into 'giám' / 'giág'.

    Args:
        sent: Input sentence.

    Returns:
        The sentence with every matched amount replaced by 'giá'.
    """
    return re.sub(r'[0-9]+[.,0-9]*[kmb]\b', 'giá', sent)

def normalize_hastag(sent):
    """Strip the leading '#' from every hashtag, keeping the word that follows it."""
    hashtag = re.compile(r'#(\w+)')
    return hashtag.sub(lambda found: found.group(1), sent)

def normalize_website(sent):
    """
    Replace web addresses with the placeholder word 'website'.

    Two passes are applied in order: first full http(s):// URLs, then bare
    domains ending in .com / .vn / .me together with an optional path.
    """
    full_url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F]{2}))+'
    bare_domain = r'\w+(\.(com|vn|me))+((/+([\w\.\-]+)?)+)?'

    for url_pattern in (full_url, bare_domain):
        sent = re.sub(url_pattern, 'website', sent)
    return sent

def nomalize_emoji(sent):
    """
    Delete emoji and pictographic symbols from the sentence.

    Any run of characters falling in the code-point ranges listed below is
    removed outright (replaced by the empty string, not by a space).
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",
        u"\U0001F300-\U0001F5FF",
        u"\U0001F680-\U0001F6FF",
        u"\U0001F1E0-\U0001F1FF",
        u"\u2600-\u26FF\u2700-\u27BF",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, r'', sent, flags=re.UNICODE)

def normalize_elongate(sent):
    """
    Collapse every run of a repeated character down to a single occurrence.

    'ngonnnn' becomes 'ngon'. Note that legitimate double letters are collapsed
    as well ('good' -> 'god'), and that '.' does not match a newline, so
    consecutive newlines are left alone.

    A single ``re.sub`` pass yields the same result as repeatedly searching for
    a run and replacing it, without rescanning the string once per run.

    Args:
        sent: Input sentence.

    Returns:
        The sentence with no two identical adjacent characters (newlines aside).
    """
    return re.sub(r'(.)\1+', r'\1', sent)

def remove_number(sent):
    """Delete every run of ASCII digits (0-9) from the sentence."""
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', sent)

# Slang / abbreviation -> replacement. Replacements carry surrounding spaces so
# that they never fuse with neighbouring text.
_ACRONYM_REPLACEMENTS = {
    'ô kêi': ' ok ', 'okie': ' ok ', ' o kê ': ' ok ', 'okey': ' ok ',
    'authentic': ' chuẩn chính hãng ', 'fake': ' giả mạo ', 'shop': ' cửa hàng ',
    'gud': ' tốt ', 'wel done': ' tốt ', 'good': ' tốt ', 'bad': ' tệ ',
    'huhu': ' tiêu cực ', 'haha': ' tích cực ', 'cute': ' dễ thương ', 'lol': ' tiêu cực ',
    'thanks': ' cám ơn ', 'thks': ' cám ơn ', 'tks': ' cám ơn ',
    'ship': ' giao hàng ', 'delivery': ' giao hàng ', 'rep': ' trả lời ',
    'fb': ' facebook ', 'face': ' facebook ', 'sp': ' sản phẩm ',
    'nt': ' nhắn tin ', 'tl': ' trả lời ', 'dt': ' điện thoại ', 'sd': ' sử dụng ',
    'bt': ' bình thường ', 'perfect': ' rất tốt ', 'nice': ' tốt ', 'fresh': ' tươi ',
    'iu': ' yêu ', 'dep': ' đẹp ', 'xau': ' xấu ', 'delicious': ' ngon '
    , 'fback': ' feedback ', 'fedback': ' feedback '
}

# Keys are matched as whole words, so padding spaces in a key are redundant.
_ACRONYM_LOOKUP = {key.strip(): value for key, value in _ACRONYM_REPLACEMENTS.items()}

# One alternation, longest key first, guarded on both sides so that a key is
# only matched when it is not embedded in a longer word.
_ACRONYM_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(key) for key in sorted(_ACRONYM_LOOKUP, key=len, reverse=True))
    + r')(?!\w)'
)


def normalize_acronyms(sent):
    """
    Expand common slang and abbreviations into their full form.

    Matching is case-sensitive and whole-word only: a key is replaced only when
    it is not preceded or followed by a word character. Plain substring
    replacement would corrupt ordinary words that merely contain a key
    ('thiu' -> 'th yêu ', 'facebook' -> ' facebook book'). All keys are handled
    in a single pass, so text inserted by one replacement is never expanded
    again by another ('fb' -> ' facebook ' stays as is).

    Args:
        sent: Input sentence.

    Returns:
        The sentence with each matched key replaced by its space-padded
        expansion. Extra whitespace is expected to be collapsed by the caller.
    """
    return _ACRONYM_PATTERN.sub(lambda found: _ACRONYM_LOOKUP[found.group(0)], sent)

def normalize(sent):
    """
    Run the full cleaning pipeline on one sentence.

    The steps are applied in a fixed order: money amounts, hashtags, websites,
    emoji, elongated characters, acronyms, digit removal. Afterwards every
    ASCII punctuation character is turned into a space and all whitespace runs
    are squeezed to a single space, with leading/trailing whitespace trimmed.
    """
    pipeline = (
        normalize_money,
        normalize_hastag,
        normalize_website,
        nomalize_emoji,
        normalize_elongate,
        normalize_acronyms,
        remove_number,
    )
    for step in pipeline:
        sent = step(sent)

    # Map each ASCII punctuation character to a space (same-length tables).
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    without_punctuation = sent.translate(punctuation_to_space)

    return re.sub(r'\s+', ' ', without_punctuation).strip()

def tokenize(sent):
    """
    Word-segment a sentence by delegating to pyvi's ``ViTokenizer.tokenize``.

    Returns whatever ``ViTokenizer.tokenize`` returns for ``sent``. In the
    sample output shown in this notebook the result is a single string in which
    multi-syllable words are joined with underscores (e.g. 'thơm_phức').
    """
    return ViTokenizer.tokenize(sent)



Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsuite-0.5.0


In [4]:
# -------------------- Review + Label Reader --------------------

def extract_clean_tokenized_text_and_labels(filepath):
    """
    Extracts cleaned and tokenized (text, labels) pairs from VLSP2018 file.

    The file is scanned line by line. Blank lines are ignored. A line that
    starts with '{' and contains '}' is treated as a label line and closes the
    current review; every other non-blank line seen since the previous label
    line is joined with single spaces, passed through ``normalize`` and then
    ``tokenize``.

    Args:
        filepath: Path to the UTF-8 encoded data file (with or without a BOM).

    Returns:
        A list of (tokenized_text, labels) tuples, where ``labels`` holds the
        raw content found between each pair of braces on the label line.
        Text lines that are not followed by a label line are dropped.
    """
    label_pattern = re.compile(r'\{([^}]+)\}')
    data = []
    current_text_lines = []

    # 'utf-8-sig' drops a leading byte-order mark. With plain 'utf-8' the BOM
    # (U+FEFF) survives str.strip() and the normalization steps and ends up at
    # the start of the first tokenized review.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('{') and '}' in line:
                # Label line: clean, tokenize and emit the accumulated review.
                labels = label_pattern.findall(line)
                raw_text = ' '.join(current_text_lines).strip()
                tokenized = tokenize(normalize(raw_text))
                data.append((tokenized, labels))
                current_text_lines = []
            else:
                current_text_lines.append(line)

    return data

def load_text_label_datasets(base_path):
    """
    Load the cleaned + tokenized version of every VLSP2018 domain/split file.

    Note: this redefines the function of the same name declared earlier in the
    notebook; after this cell runs, the name refers to this version, which
    uses ``extract_clean_tokenized_text_and_labels``.

    Files are looked up as ``VLSP2018-SA-<Domain>-<split>.txt`` under
    ``base_path``. A missing file is reported on stdout and its split key is
    left out of the result.

    Returns:
        {'restaurant': {<split>: [(tokenized_text, labels), ...]}, 'hotel': {...}}
    """
    dataset = {}
    for domain in ('restaurant', 'hotel'):
        dataset[domain] = {}

    for domain, loaded_splits in dataset.items():
        for split in ('train', 'dev', 'test'):
            filename = f"VLSP2018-SA-{domain.capitalize()}-{split}.txt"
            filepath = os.path.join(base_path, filename)
            if not os.path.exists(filepath):
                print(f"Missing file: {filepath}")
                continue
            loaded_splits[split] = extract_clean_tokenized_text_and_labels(filepath)
    return dataset


In [5]:
# Reload the data with the cleaning + tokenizing loader defined in the previous
# cell; this rebinds `full_data`, replacing the raw version loaded earlier.
base_folder = "/content/drive/MyDrive/NLP Project"
full_data = load_text_label_datasets(base_folder)

# Example: print the first cleaned + tokenized restaurant/dev entry.
# NOTE: this raises KeyError if the restaurant dev file was missing, because the
# loader only creates a split key for files that exist.
text, labels = full_data['restaurant']['dev'][0]
print("Tokenized Text:", text)
print("Labels:", labels)


Tokenized Text: ﻿ giá con Tu hài to siêu béo siêu ngon nướng mỡ hành thơm_phức béo ngậy Đĩa tu hài đem ra nóng_hổi gắp miếng vào miệng kích_thích vi giác kinh_khủng con to đến mức họ phải cắt ra làm đôi ăn nửa con đầy ý miệng luôn ý Mà giá đó cho con tu hài ngon như_vậy là quá rẻ
Labels: ['FOOD#PRICES, positive', 'FOOD#QUALITY, positive']


In [12]:
import pandas as pd

def dataset_to_dataframe(dataset):
    """
    Flatten the nested ``{domain: {split: [(text, labels), ...]}}`` dict into a
    DataFrame with one row per review.

    Columns: 'domain', 'split', 'text', 'labels', 'aspects', 'opinions'.
    Each label is split on its last comma into an aspect and an opinion (both
    whitespace-trimmed). Labels without a comma stay in 'labels' but add
    nothing to 'aspects' / 'opinions', so those two lists always line up.
    """
    records = []
    for domain_name, split_map in dataset.items():
        for split_name, samples in split_map.items():
            for sample_text, sample_labels in samples:
                # [aspect, opinion] pairs for every label that has a comma.
                pairs = [
                    raw_label.rsplit(',', 1)
                    for raw_label in sample_labels
                    if ',' in raw_label
                ]
                records.append({
                    'domain': domain_name,
                    'split': split_name,
                    'text': sample_text,
                    'labels': sample_labels,
                    'aspects': [aspect.strip() for aspect, _ in pairs],
                    'opinions': [opinion.strip() for _, opinion in pairs],
                })
    return pd.DataFrame(records)

# Convert the nested dataset dict into a flat DataFrame (one row per review).
df = dataset_to_dataframe(full_data)

# Persist the result. The list-valued columns ('labels', 'aspects', 'opinions')
# are written as their string representation, so they need to be parsed again
# when the CSV is read back.
df.to_csv('cleaned_dataset.csv', index=False)

# Show the first few rows. A notebook only renders the value of the LAST
# expression in a cell, so this has to come after the save to be displayed.
df.head(5)



# New Section