In [208]:
from docx import Document
import numpy as np
from transformers import BertModel, BertTokenizer
import torch 
import json

# Путь к файлу
infile = 'sample4.docx'

# Открываем документ
doc = Document(infile)

# Инициализация токенизатора и модели BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Сбор текста из документа
all_text = []
structured_tables = []  # Список для таблиц

for paragraph in doc.paragraphs:
    text = paragraph.text.strip()
    if text:
        all_text.append(text)

for table in doc.tables:
    table_data = []
    for row in table.rows:
        row_data = [cell.text.strip() for cell in row.cells]
        table_data.append(row_data)
    structured_tables.append({"data": table_data})

    # Конвертируем строки таблиц в текст
    table_text = [" | ".join(row) for row in table_data]
    all_text.extend(table_text)

# Объединяем текст для токенизации
combined_text = "\n".join(all_text)

# Токенизация текста
inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Получение эмбеддингов из BERT
with torch.no_grad():
    outputs = model(**inputs)

# Вывод скрытых состояний [CLS]-токена
cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()

# Сохранение таблиц без стилей в JSON
with open("tables_structure_no_style.json", "w", encoding="utf-8") as f:
    json.dump(structured_tables, f, ensure_ascii=False, indent=4)

print("Эмбеддинги текста из документа:")
print(combined_text)

Эмбеддинги текста из документа:
Video provides a powerful way to help you prove your point. When you click Online Video, you can paste in the embed code for the video you want to add. You can also type a keyword to search online for the video that best fits your document. To make your document look professionally produced, Word provides header, footer, cover page, and text box designs that complement each other. For example, you can add a matching cover page, header, and sidebar. Click Insert and then choose the elements you want from the different galleries. Themes and styles also help keep your document coordinated. When you click Design and choose a new Theme, the pictures, charts, and SmartArt graphics change to match your new theme. When you apply styles, your headings change to match the new theme. Save time in Word with new buttons that show up where you need them.
To change the way a picture fits in your document, click it and a button for layout options appears next to it. Whe