# In [13]:
import io
import logging
import re
from email import policy
from email.parser import BytesParser

import extract_msg
import nltk
import PyPDF2
from docx import Document

# In [15]:
def preprocess(text: str):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lower-cased, every character that is not a Latin or Cyrillic
    letter, whitespace or a hyphen is replaced with a space, and runs of
    whitespace are collapsed. The result is tokenized with
    ``nltk.word_tokenize``; tokens that are in ``stop_words`` or that fail
    ``str.isalpha`` are discarded, and each remaining token is replaced by
    the ``normal_form`` of its first ``morph.parse`` result.

    ``morph`` and ``stop_words`` are not defined in this block and must be
    available at module level (presumably a morphological analyzer and a
    stop-word collection -- confirm against where they are created).

    Args:
        text: Raw input text.

    Returns:
        The lemmas joined by single spaces.
    """
    cleaned = re.sub(r"[^a-zа-яё\s-]", " ", text.lower())
    cleaned = re.sub(r"\s+", " ", cleaned)

    lemmas = []
    for token in nltk.word_tokenize(cleaned):
        if token in stop_words or not token.isalpha():
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return " ".join(lemmas)

def strip_html(html):
    """Remove markup from an HTML string, keeping only its text content.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents; every other tag is replaced by a single space so neighbouring
    words do not run together. Tag names are matched case-insensitively, so
    ``<SCRIPT>`` and ``<Style>`` blocks are removed as well. Character
    references such as ``&amp;`` are left untouched and whitespace is not
    collapsed.

    Args:
        html: HTML source text.

    Returns:
        The input with script/style blocks removed and tags replaced by
        spaces.
    """
    # DOTALL lets a block span several lines; IGNORECASE covers upper- and
    # mixed-case tag names, which HTML permits.
    flags = re.S | re.I
    # ``\b`` keeps the pattern from matching tags that merely start with the
    # same letters; ``\s*`` tolerates whitespace before the closing ``>``.
    html = re.sub(r'<script\b.*?>.*?</script\s*>', '', html, flags=flags)
    html = re.sub(r'<style\b.*?>.*?</style\s*>', '', html, flags=flags)
    return re.sub(r'<[^>]+>', ' ', html)

def extract_links(text):
    """Return every ``http://`` or ``https://`` URL found in *text*.

    A URL runs from the scheme up to the next whitespace character, so any
    punctuation directly following it is part of the match. Matches are
    returned in order of appearance and duplicates are kept.
    """
    url_pattern = re.compile(r'https?://\S+')
    return [found.group(0) for found in url_pattern.finditer(text)]

def _decode_part(part):
    """Decode one non-multipart MIME part to text using its declared charset."""
    payload = part.get_payload(decode=True)
    if payload is None:
        # Nothing to decode for this part.
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # The declared charset is not known to Python; fall back to UTF-8.
        return payload.decode("utf-8", errors="ignore")


def extract_text(msg):
    """Return the textual body of an email message.

    For a multipart message every ``text/plain`` part is appended as-is and
    every ``text/html`` part is appended after ``strip_html``; other parts
    are skipped. For a single-part message the payload is decoded and
    returned unchanged, whatever its content type.

    Each part is decoded with the charset declared in its ``Content-Type``
    header; UTF-8 is used when none is declared or the declared one is
    unknown. Undecodable bytes are dropped.

    Args:
        msg: An ``email.message.Message`` (or subclass) instance.

    Returns:
        The concatenated text of the message body.
    """
    if not msg.is_multipart():
        return _decode_part(msg)

    chunks = []
    for part in msg.walk():
        ctype = part.get_content_type()
        if ctype == "text/plain":
            chunks.append(_decode_part(part))
        elif ctype == "text/html":
            chunks.append(strip_html(_decode_part(part)))
    return "".join(chunks)

def extract_text_from_pdf(binary_data):
    """Extract the text of every page of an in-memory PDF.

    Args:
        binary_data: The raw bytes of the PDF document.

    Returns:
        The text of all pages in order, each page followed by a newline.
    """
    stream = io.BytesIO(binary_data)
    pdf = PyPDF2.PdfReader(stream)
    return "".join(page.extract_text() + "\n" for page in pdf.pages)

def extract_text_from_docx(binary_data):
    """Extract the paragraph text of an in-memory .docx document.

    Only the entries of ``doc.paragraphs`` are read.

    Args:
        binary_data: The raw bytes of the .docx file.

    Returns:
        The text of each paragraph, joined with newlines.
    """
    document = Document(io.BytesIO(binary_data))
    lines = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(lines)

def _decode_attachment_text(part, payload):
    """Decode a text attachment's bytes using the part's declared charset."""
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # The declared charset is not known to Python; fall back to UTF-8.
        return payload.decode("utf-8", errors="ignore")


def extract_attachments(msg):
    """Collect a message's attachments and the text of the readable ones.

    Every attachment yielded by ``msg.iter_attachments()`` is recorded with
    its filename and decoded payload. Text is additionally extracted from
    attachments whose filename ends (case-insensitively) in ``.pdf``,
    ``.docx`` or ``.txt``. Extraction is best-effort: a failure on one
    attachment is logged as a warning and does not affect the others.

    Args:
        msg: A message object. If it has no ``iter_attachments`` method it
            is treated as having no attachments.

    Returns:
        A tuple ``(attachments, attachments_text)`` where ``attachments`` is
        a list of ``{"filename": ..., "content": ...}`` dicts and
        ``attachments_text`` is the extracted texts joined with newlines.
    """
    log = logging.getLogger(__name__)
    attachments = []
    attachments_text = []
    for part in getattr(msg, 'iter_attachments', lambda: [])():
        filename = part.get_filename()
        payload = part.get_payload(decode=True)
        attachments.append({"filename": filename, "content": payload})
        # Without a name the type cannot be determined, and without a payload
        # there is nothing to extract text from.
        if not filename or payload is None:
            continue
        lower = filename.lower()
        try:
            if lower.endswith(".pdf"):
                attachments_text.append(extract_text_from_pdf(payload))
            elif lower.endswith(".docx"):
                attachments_text.append(extract_text_from_docx(payload))
            elif lower.endswith(".txt"):
                attachments_text.append(_decode_attachment_text(part, payload))
        except Exception:
            # Keep going on a broken attachment, but leave a trace instead of
            # hiding the failure. Unlike a bare ``except``, this does not
            # swallow KeyboardInterrupt or SystemExit.
            log.warning(
                "Could not extract text from attachment %r",
                filename,
                exc_info=True,
            )
    return attachments, "\n".join(attachments_text)

def parse_eml_or_msg(file):
    """Parse an uploaded ``.eml`` or ``.msg`` email file.

    The format is chosen from the extension of ``file.name``
    (case-insensitive). For ``.eml`` the body text and attachments are
    extracted with ``extract_text`` and ``extract_attachments``. For
    ``.msg`` only the subject and body are read; attachments are not
    extracted, so ``attachments`` and ``attachments_text`` stay empty.

    Args:
        file: A binary file-like object with a ``name`` attribute.

    Returns:
        A tuple ``(subject, text, attachments_text, attachments, links,
        full_text)``. ``links`` are the URLs found in the body text and
        ``full_text`` is subject, body and attachment text joined with
        newlines and stripped.

    Raises:
        ValueError: If the file name does not end in ``.eml`` or ``.msg``.
    """
    text, attachments_text, subject = "", "", ""
    attachments = []

    name = getattr(file, 'name', '')
    name_lower = name.lower() if name else ''

    if name_lower.endswith(".eml"):
        msg = BytesParser(policy=policy.default).parse(file)
        subject = msg["Subject"] or ""
        text = extract_text(msg)
        attachments, attachments_text = extract_attachments(msg)
    elif name_lower.endswith(".msg"):
        msg = extract_msg.Message(file)
        try:
            subject = msg.subject or ""
            text = msg.body or ""
        finally:
            # Release the underlying file even if reading a field fails.
            msg.close()
    else:
        raise ValueError("Поддерживаются только .eml и .msg")

    links = extract_links(text)
    full_text = (subject + "\n" + text + "\n" + attachments_text).strip()
    return subject, text, attachments_text, attachments, links, full_text
