In [3]:
import re
import pandas as pd
import fitz
import docx
import csv

In [4]:
def read_docx(file_path):
    """Return the non-empty paragraphs of a .docx file as one newline-joined string.

    Every paragraph's text is stripped of surrounding whitespace; paragraphs
    that are empty after stripping are left out.

    Args:
        file_path: Path of the .docx file, passed straight to ``docx.Document``.

    Returns:
        The remaining paragraph texts joined with ``'\\n'``.
    """
    document = docx.Document(file_path)
    # Strip each paragraph once, then drop the ones that end up empty.
    stripped_lines = (paragraph.text.strip() for paragraph in document.paragraphs)
    return '\n'.join(line for line in stripped_lines if line)

def extract_questions_answers(text):
    """Split raw document text into (question, answer) pairs.

    A question starts at one of the markers "Câu hỏi", "Câu", "Tình huống",
    "Hỏi" or "<number>. Hỏi", optionally followed by a number, then ":" or ".".
    The question text runs to the end of that line. The answer is everything
    up to the next marker that begins a line (or the end of the text); a
    leading "Trả lời" label is removed from it.

    If the line right after the question does not start with "Trả lời" but a
    later line carries an explicit "Trả lời:" / "Trả lời." label, the lines
    before that label are treated as the continuation of a wrapped question
    instead of being left in the answer.

    Args:
        text: The full document text, with lines separated by ``'\\n'``.

    Returns:
        A list of ``(question, answer)`` tuples, both stripped of surrounding
        whitespace, in document order.
    """
    # Supported markers: "Câu", "Câu hỏi", "Tình huống", "Hỏi", followed by "." or ":".
    marker = r'(?:Câu hỏi|Câu|Tình huống|Hỏi|\d+\.\s*Hỏi)\s*\d*\s*[:\.]'
    pattern = (
        marker
        + r'\s*(?P<question>.*?)\n'
        + r'(?P<label>Trả lời[:\.]?\s*)?'
        + r'(?P<answer>.*?)'
        + r'(?=\n' + marker + r'|$)'
    )
    # An explicit answer label at the start of a later line. Punctuation is
    # required here so ordinary answer prose beginning with "Trả lời ..." is not split.
    late_label = re.compile(r'^[ \t]*Trả lời[ \t]*[:\.]\s*', re.MULTILINE)

    pairs = []
    for match in re.finditer(pattern, text, re.DOTALL):
        question = match.group('question')
        answer = match.group('answer')
        if match.group('label') is None:
            # No label directly after the first line: look for one further down
            # and move the text before it back into the question.
            split = late_label.search(answer)
            if split:
                question = question + '\n' + answer[:split.start()]
                answer = answer[split.end():]
        pairs.append((question.strip(), answer.strip()))
    return pairs
# Read the content of a PDF file as text
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The results of ``page.get_text()`` for each page, joined with no
        separator.
    """
    # The context manager closes the document even if extraction fails
    # part-way through; the original never closed it.
    with fitz.open(file_path) as doc:
        return ''.join(page.get_text() for page in doc)
def export_to_csv(data, file_out):
    """Write question/answer rows to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of rows; each row is written as one CSV record.
        file_out: Destination path. An existing file is overwritten.
    """
    header = ['question', 'answer']
    # newline='' lets the csv module control line endings itself.
    with open(file_out, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for record in data:
            out.writerow(record)

# File paths and processing: read the labour-law .docx, extract its
# (question, answer) pairs and write them to CSV.
text = read_docx('./data/origin_data/lao_dong_200.docx')
qa_pairs = extract_questions_answers(text)
export_to_csv(qa_pairs, './data/data_processed/laodong.csv')


### 1. Lao Động

In [5]:
# Load the labour-law Q&A pairs from the same relative path the extraction
# cell writes to, so the notebook does not depend on the name of its parent directory.
df_laodong=pd.read_csv("./data/data_processed/laodong.csv")

In [6]:
# Sanity check: (rows, columns) of the loaded labour-law Q&A frame.
df_laodong.shape

(253, 2)

### 2. An Toàn Giao Thông

In [7]:
# File paths and processing: read the traffic-safety (ATGT) .docx, extract
# its (question, answer) pairs and write them to CSV.
text = read_docx('./data/origin_data/ATGT.docx')
qa_pairs = extract_questions_answers(text)
export_to_csv(qa_pairs, './data/data_processed/atgt.csv')


In [8]:
# Reload the ATGT pairs from the CSV written by the extraction cell.
df_atgt=pd.read_csv("./data/data_processed/atgt.csv")

In [9]:
# Sanity check: (rows, columns) of the ATGT frame extracted from the .docx.
df_atgt.shape

(136, 2)

In [10]:
# File paths and processing: read the traffic-safety (ATGT) PDF, extract its
# (question, answer) pairs and write them to a second CSV.
text = read_pdf('./data/origin_data/tai lieu hoi dap Luat ATGT.pdf')
qa_pairs = extract_questions_answers(text)
export_to_csv(qa_pairs, './data/data_processed/atgt_2.csv')


In [11]:
# Reload the PDF-derived ATGT pairs from the CSV written by the extraction cell.
df_atgt_2=pd.read_csv("./data/data_processed/atgt_2.csv")

In [12]:
# Sanity check: (rows, columns) of the ATGT frame extracted from the PDF.
df_atgt_2.shape

(40, 2)

In [13]:
# Stack the two ATGT frames row-wise (PDF-derived rows first).
# NOTE(review): the variable says "antt" while both inputs are ATGT frames —
# presumably a typo; left unchanged because later cells reference this name.
df_merge_antt=pd.concat([df_atgt_2,df_atgt],axis=0)

In [14]:
# Sanity check: the row count should equal the sum of the two ATGT frames.
df_merge_antt.shape

(176, 2)

In [15]:
# Confirm the merged frame still has only the two expected columns.
df_merge_antt.columns

Index(['question', 'answer'], dtype='object')

In [16]:
# Persist the merged ATGT pairs; index=False keeps the row index out of the file.
df_merge_antt.to_csv("./data/data_processed/antt_merge.csv",index=False)

### 3. Giáo Dục

In [17]:
# File paths and processing: read the education .docx, extract its
# (question, answer) pairs and write them to CSV.
text = read_docx('./data/origin_data/giao_duc.docx')
qa_pairs = extract_questions_answers(text)
export_to_csv(qa_pairs, './data/data_processed/giaoduc.csv')


In [18]:
# Reload the .docx-derived education pairs and show (rows, columns) as a sanity check.
df_giaoduc=pd.read_csv("./data/data_processed/giaoduc.csv")
df_giaoduc.shape

(16, 2)

In [19]:
# Read the education PDF, extract its (question, answer) pairs and write
# them to a second CSV.
text=read_pdf("./data/origin_data/GiaoDuc.pdf")

qa_pairs = extract_questions_answers(text)
export_to_csv(qa_pairs, './data/data_processed/giaoduc_2.csv')

In [20]:
# Reload the PDF-derived education pairs and show (rows, columns) as a sanity check.
df_giaoduc_2=pd.read_csv("./data/data_processed/giaoduc_2.csv")
df_giaoduc_2.shape

(271, 2)

In [21]:
# Stack the two education frames row-wise (.docx-derived rows first).
df_merge_giaoduc=pd.concat([df_giaoduc,df_giaoduc_2],axis=0)

In [22]:
# Persist the merged education pairs; index=False keeps the row index out of the file.
df_merge_giaoduc.to_csv("./data/data_processed/giaoduc_merge.csv",index=False)


In [23]:
# Sanity check: the row count should equal the sum of the two education frames.
df_merge_giaoduc.shape

(287, 2)

### 4. Trách nhiệm và hình sự

In [24]:
# File paths and processing: read the liability / criminal-law .docx, extract
# its (question, answer) pairs and write them to CSV.
text = read_docx('./data/origin_data/TrachNhiem_HinhSu.docx')
qa_pairs = extract_questions_answers(text)
export_to_csv(qa_pairs, './data/data_processed/trachnhiem_hinhsu.csv')


In [25]:
# Reload the liability / criminal-law pairs and show (rows, columns) as a sanity check.
df_trachnhiem_hinhsu=pd.read_csv("./data/data_processed/trachnhiem_hinhsu.csv")
df_trachnhiem_hinhsu.shape

(140, 2)

In [26]:
# Load the "honnhan" Q&A pairs and show (rows, columns).
# NOTE(review): no cell visible in this notebook writes honnhan.csv, so the
# file must already exist on disk — confirm where it is generated, otherwise
# a fresh Restart & Run All fails here.
df_honnhan=pd.read_csv("./data/data_processed/honnhan.csv")
df_honnhan.shape

(198, 2)

In [27]:
# Stack all legal-topic frames row-wise into one law Q&A set. Each input keeps
# its own index labels, so the labels in the result are not unique.
df_final_law=pd.concat([df_honnhan,df_laodong,df_merge_antt,df_merge_giaoduc,df_trachnhiem_hinhsu],axis=0)

In [28]:
# Sanity check: the row count should equal the sum of the five topic frames.
df_final_law.shape

(1054, 2)

In [29]:
# Confirm the combined frame still has only the two expected columns.
df_final_law.columns

Index(['question', 'answer'], dtype='object')

### 5. Câu hỏi khác

In [30]:
# File paths and processing: read the "other questions" .docx, extract its
# (question, answer) pairs and write them to CSV.
text = read_docx('./data/origin_data/Cau_Hoi_Khac.docx')
qa_pairs = extract_questions_answers(text)
export_to_csv(qa_pairs, './data/data_processed/cauhoi_khac.csv')


In [31]:
# Reload the "other questions" pairs from the CSV written by the extraction cell.
df_cauhoikhac=pd.read_csv("./data/data_processed/cauhoi_khac.csv")

In [32]:
# Append the "other questions" rows to the law Q&A set. ignore_index=True
# renumbers the rows 0..n-1 so the final frame has no duplicated index labels
# carried over from the individual inputs.
df_final=pd.concat([df_final_law,df_cauhoikhac],axis=0,ignore_index=True)

In [33]:
# Persist the complete dataset; index=False keeps the row index out of the file.
# (A bare `df_final.shape` used to precede this line; it displayed nothing
# because it was not the last expression of the cell, so it was removed.)
df_final.to_csv("./data/data_processed/final_data_system.csv",index=False)

In [34]:
# Report the (rows, columns) of the final combined dataset.
print(df_final.shape)

(1204, 2)
