## **1. Collect data**

In [2]:
import requests
import pandas as pd
import json
from lxml import etree
import numpy as np
import io
import os
import gzip
from bs4 import BeautifulSoup


In [None]:
# Create the local `data/` folder where the collected datasets are stored.
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs('data', exist_ok=True)

### **1. Crawl from the Tatoeba website**

Source: https://tatoeba.org/

In [None]:

def crawl_data(leng_min=1, page_max=100):
    """Fetch English sentences and one Vietnamese translation each from Tatoeba.

    Requests pages ``1..page_max`` of the Tatoeba search endpoint (English
    source, Vietnamese translations) and collects one sentence pair per
    result that actually has a Vietnamese translation.

    Args:
        leng_min: Value sent as the ``word_count_min`` query parameter.
        page_max: Last page number to request (pages start at 1).

    Returns:
        A tuple ``(engs, vies)`` of two lists of equal length, where
        ``vies[i]`` is the shortest Vietnamese translation of ``engs[i]``.

    Notes:
        This is a best-effort crawl: a page that fails (network error,
        non-200 status, unexpected payload) is reported with ``print`` and
        skipped instead of aborting the whole run.
    """
    engs, vies = [], []

    for page in range(1, page_max + 1):
        try:
            url = "https://tatoeba.org/vi/api_v0/search?from=eng&to=vie&trans_filter=limit&trans_to=vie&page={}&word_count_max=&word_count_min={}".format(page, leng_min)
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response = requests.get(url, timeout=30)

            if response.status_code != 200:
                # Report the skipped page instead of dropping it silently.
                print(f"Skipping page {page}: HTTP {response.status_code}")
                continue

            data = response.json()
            for item in data.get('results', []):
                english = item['text']

                # `translations` is a list of lists; flatten it and keep only
                # the Vietnamese entries.
                vietnamese_translations = [
                    trans['text']
                    for sublist in item.get('translations', [])
                    for trans in sublist
                    if trans['lang'] == 'vie'
                ]

                # Append both sides together, and only for complete pairs, so
                # the two lists always stay index-aligned.
                if vietnamese_translations:
                    engs.append(english)
                    vies.append(min(vietnamese_translations, key=len))

        except Exception as e:
            print(f"Error on page {page}: {e}")

    return engs, vies

In [None]:
def crawl_all(leng_max=10, page_max=100):
    """Crawl Tatoeba once per minimum word count and gather all pairs.

    Calls ``crawl_data`` with ``leng_min`` set to 1, 2, ..., ``leng_max`` and
    concatenates the sentence pairs from every pass.

    Args:
        leng_max: Largest minimum-word-count value to crawl (inclusive).
        page_max: Number of result pages requested in each pass; forwarded
            to ``crawl_data``. Defaults to 100.

    Returns:
        A ``pandas.DataFrame`` with the columns ``English`` and
        ``Vietnamese``, one row per collected sentence pair.
    """
    # NOTE(review): each pass only sets a lower bound on the word count, so
    # passes may return overlapping sentences — confirm whether de-duplication
    # is wanted downstream.
    engs_all, vies_all = [], []
    for e in range(1, leng_max + 1):
        try:
            engs, vies = crawl_data(leng_min=e, page_max=page_max)
            engs_all.extend(engs)
            vies_all.extend(vies)
        except Exception as ex:
            # Best effort: a failed pass is reported and the next one still runs.
            print(f"Error on at least {e} words: {ex}")
    df = pd.DataFrame({'English': engs_all, 'Vietnamese': vies_all})
    return df

# Crawl sentence pairs for minimum word counts 1 through 20, then preview
# the first 10 rows of the resulting frame.
df1 = crawl_all(20)
df1.head(10)

Unnamed: 0,English,Vietnamese
0,Hmm?,Hử?
1,Translate.,Dịch.
2,Faster.,Nhanh hơn.
3,Kiss!,Hôn đi!
4,Please...,Đi mà...
...,...,...
15856,My friend from university gave me a Kindle bec...,Một người bạn thời đại học đã cho tôi một cái ...
15857,In all the work I have done as President—every...,Trong tất cả những công việc mà tôi đã làm với...
15858,The 26th of September is the European Day of L...,Ngày 26 tháng chín là ngày Ngôn ngữ của Châu Â...
15859,While the Kabyle people were fighting French c...,Trong lúc những người Kabyle đang chiến đấu vớ...


In [None]:
# Save the crawled pairs to CSV; index=False keeps the row index out of the file.
df1.to_csv("data/tatoeba.csv", index= False)

In [None]:
# (rows, columns) of the crawled frame.
df1.shape

(15861, 2)

### **2. Download from Opus**

Please visit [OPUS](https://opus.nlpl.eu/) for details about this corpus.

In [3]:
# Download the OPUS CCMatrix en-vi archive (gzip-compressed XML) into memory.
opus_url = "https://object.pouta.csc.fi/OPUS-CCMatrix/v1/xml/en-vi.xml.gz"
# (connect, read) timeouts: a stalled connection raises instead of hanging
# the kernel indefinitely.
response = requests.get(opus_url, timeout=(10, 120))
if response.status_code != 200:
    raise Exception(f"Occur an errorr when downloading: {response.status_code}")
# Wrap the downloaded bytes in a file-like object and decompress them.
compressed_file = io.BytesIO(initial_bytes=response.content)
with gzip.GzipFile(fileobj=compressed_file) as f:
    xml_content = f.read()

: 

In [4]:
# Decompress the downloaded gzip payload into bytes.
# NOTE(review): this repeats the decompression already performed at the end
# of the download cell above; one of the two is redundant.
compressed_file = io.BytesIO(response.content)
with gzip.GzipFile(fileobj=compressed_file) as f:
    xml_content = f.read()


In [1]:
# Parse the decompressed bytes into an element tree.
root = etree.fromstring(xml_content)

# Serialize the tree back into a pretty-printed string.
xml = etree.tostring(root, pretty_print=True, encoding='unicode')

# Print the first few lines (at most 10).
lines = xml.strip().split('\n')
for line in lines[:10]:
    print(line)


NameError: name 'etree' is not defined

In [None]:
# Collect aligned English/Vietnamese sentence pairs from the parsed tree.
# NOTE(review): assumes every <s> element holds <seg lang="en"> and
# <seg lang="vi"> descendants — confirm against the downloaded file's schema.
engs, vies = [], []
for s in root.xpath("//s"):
    en = s.xpath(".//seg[@lang='en']")
    vi = s.xpath(".//seg[@lang='vi']")
    if en and vi:
        # A segment without text becomes "" so both lists stay the same length.
        en_text = (en[0].text or "").strip()
        engs.append(en_text)

        vi_text = (vi[0].text or "").strip()
        vies.append(vi_text)

# Preview up to 5 bilingual pairs; the bound avoids an IndexError when fewer
# than 5 pairs were extracted.
for i in range(min(5, len(engs))):
    print(f"[{i+1}] EN: {engs[i]}\n     VI: {vies[i]}\n")