In [None]:
import docx
import re
import json
import os
import torch
from transformers import BertTokenizerFast, BertForTokenClassification

# Next free identifier for each output table; every sequence starts at 1
artifact_counter = artifact_type_counter = painter_counter = 1
location_counter = collection_counter = image_counter = chapter_counter = 1

# Name -> identifier lookups, used to avoid registering the same name twice
artifact_type_dict = dict()
painter_dict = dict()
location_dict = dict()
collection_dict = dict()
chapter_dict = dict()

# Tokenizer and token-classification model are both loaded from one directory
model_directory = "/content/drive/MyDrive/Trendall Project Files/Models"
tokenizer = BertTokenizerFast.from_pretrained(model_directory)
model = BertForTokenClassification.from_pretrained(model_directory)

def extract_chapter_names(full_text):
    """
    Collect every chapter heading found in the document text.

    A heading is the word "CHAPTER", some whitespace, a number followed by a
    period, and then whatever remains on that line.

    Parameters:
    full_text (str): The complete text from the document.

    Returns:
    list: The matched heading strings, in order of appearance.
    """
    return re.findall(r"CHAPTER\s+\d+\..*", full_text)

def extract_painter_names(text):
    """
    Find painter headings in the text.

    A candidate is a run of capitals, digits, whitespace and a few
    punctuation marks that has an empty line ("\\n\\n") directly before and
    after it. Candidates for which str.isupper() is false (for example,
    digits only) are discarded.

    Parameters:
    text (str): Text from which to extract painter names.

    Returns:
    list: The painter names, in order of appearance.
    """
    heading_re = re.compile(r"(?<=\n\n)([A-Z0-9\s\-/'’:]+)(?=\n\n)")
    candidates = (found.group(1) for found in heading_re.finditer(text))
    return list(filter(str.isupper, candidates))

def extract_artifact_types(text):
    """
    Find artifact-type headings in the text.

    A candidate starts with a capital letter, continues with lowercase
    letters, whitespace, hyphens, parentheses or commas, and has an empty
    line ("\\n\\n") directly before and after it. Candidates of 40 or more
    characters are dropped.

    Parameters:
    text (str): Text from which to extract artifact types.

    Returns:
    set: The distinct artifact types (unordered).
    """
    heading_re = re.compile(r"(?<=\n\n)([A-Z][a-z\s\-\(\),]*(?:\([a-z0-9\s]+\))?)(?=\n\n)")
    return {
        found.group(1)
        for found in heading_re.finditer(text)
        if len(found.group(1)) < 40
    }

def extract_dimensions(text):
    """
    Pull height / diameter measurements out of the text.

    Each "Ht." (or "ht.") followed by a number or number range produces one
    dictionary. When it is followed by ", diam." (or "Diam.") and a value,
    that value is added to the same dictionary. Keys are the labels as they
    appear in the text, without the period; values are left as strings.

    Parameters:
    text (str): Text from which to extract dimensions.

    Returns:
    list: One dictionary per height found, in order of appearance.
    """
    dimension_re = re.compile(r'(Ht\.|ht\.)\s*(\d+(?:-\d+)?)(?:,\s*(diam\.|Diam\.)\s*(\d+(?:-\d+)?(?:/\d+(?:-\d+)?)?)?)?')
    measurements = []
    for found in dimension_re.finditer(text):
        ht_label, ht_value, diam_label, diam_value = found.groups(default='')
        measurement = {ht_label.strip('.'): ht_value}
        # The diameter is only recorded when both its label and a value matched
        if diam_label and diam_value:
            measurement[diam_label.strip('.')] = diam_value
        measurements.append(measurement)
    return measurements

def extract_collection_location(text):
    """
    Extract collection and location from the text.

    The collection is the leading run of word characters / whitespace
    (optionally followed by one parenthesised group) that ends right before
    ", ", ". " or " from". The location is the run of word characters /
    whitespace that follows the first "from ".

    A leading catalogue-number line (e.g. "12", "*3a", "4/5"), as found at
    the top of the entries built by extract_artifact_entries, is skipped
    first. Otherwise the number would become part of the collection name,
    or, when it starts with "*", prevent the collection from matching.

    Parameters:
    text (str): Text from which to extract collection and location.

    Returns:
    tuple: (collection, location); each is a whitespace-trimmed string, or
    None when it was not found or is empty after trimming.
    """
    # Same number shape that extract_artifact_entries uses to start an entry
    body = re.sub(r'^\s*\*?\d+[\w/]*[ \t]*\n', '', text, count=1)

    collection_pattern = re.compile(r'^([\w\s]+(?:\([\w\s\d.,]+\))?)(?=[,\.]\s|\sfrom)')
    location_pattern = re.compile(r'from ([\w\s]+)')
    collection_match = collection_pattern.search(body)
    location_match = location_pattern.search(body)

    collection = collection_match.group(1).strip('., ') if collection_match else None
    # The capture usually ends on the space or newline before the next
    # punctuation mark; trim it so equal names compare equal.
    location = location_match.group(1).strip() if location_match else None
    return collection or None, location or None

def extract_artifact_entries(text):
    """
    Split the text into artifact entries.

    An entry begins after a blank line whose following line is a catalogue
    number (digits with an optional leading "*" and optional trailing word
    characters or "/"). The number line and every non-blank line after it
    are gathered, unmodified, until the next blank line or the end of text.

    Parameters:
    text (str): Text from which to extract artifact entries.

    Returns:
    list: The entries, each a newline-joined string, in order of appearance.
    """
    number_line = re.compile(r'^(\*?\d+[\w/]*)$')
    rows = text.splitlines()
    last_index = len(rows) - 1

    entries = []
    buffer = []
    active = False

    for index, row in enumerate(rows):
        if row.strip():
            # Non-blank rows only matter while an entry is being gathered
            if active:
                buffer.append(row)
            continue

        # Blank row: close the entry in progress, if any ...
        if active and buffer:
            entries.append("\n".join(buffer))
            buffer = []
            active = False

        # ... and open a new one when a catalogue number comes next
        if index < last_index and number_line.match(rows[index + 1].strip()):
            active = True

    if active and buffer:
        entries.append("\n".join(buffer))

    return entries

def read_artifact_files(folder_path):
    """
    Load every ".txt" file that sits directly in a folder.

    Files are read as UTF-8 and their content is stripped of surrounding
    whitespace. The stripped content is the dictionary key, so when two
    files hold the same content only one path is kept.

    Parameters:
    folder_path (str): Path to the folder containing artifact files.

    Returns:
    dict: A dictionary mapping file content to file paths.
    """
    content_to_path = {}
    text_names = (name for name in os.listdir(folder_path) if name.endswith(".txt"))
    for name in text_names:
        full_path = os.path.join(folder_path, name)
        with open(full_path, 'r', encoding='utf-8') as handle:
            content_to_path[handle.read().strip()] = full_path
    return content_to_path

def predict_entities(text):
    """
    Label the tokens of a text with the module-level BERT model.

    The text is split on whitespace, encoded with the module-level tokenizer
    (with truncation enabled) and passed through the model without gradient
    tracking. Every encoded token receives the label with the highest score.

    Parameters:
    text (str): Text to predict entities.

    Returns:
    tuple: (tokens, labels) - the tokenizer's tokens for the encoded input
    and, position by position, the label name taken from
    model.config.id2label.
    """
    encoding = tokenizer(
        text.split(),
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    with torch.no_grad():
        logits = model(**encoding).logits

    # Only one sequence is encoded, so index 0 of the batch is the whole input
    label_ids = logits.argmax(dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    labels = [model.config.id2label[label_id] for label_id in label_ids]
    return tokens, labels

def extract_references_and_image_descriptions(text):
    """
    Separate a text into reference text and image-description text.

    Tokens predicted as "LABEL_1" are collected as image description and
    tokens predicted as "LABEL_2" as reference; tokens with any other label
    are ignored. A token of one kind closes the span in progress of the
    other kind. "[CLS]" and "[SEP]" are never added to a span.

    Parameters:
    text (str): Text to extract references and image descriptions.

    Returns:
    tuple: (references, image_descriptions), each a single string made of
    the respective spans joined with spaces.
    """
    tokens, labels = predict_entities(text)

    skipped_tokens = ("[CLS]", "[SEP]")
    # label -> (kind of span the token joins, kind of span it closes)
    routing = {
        "LABEL_1": ("image", "reference"),
        "LABEL_2": ("reference", "image"),
    }
    open_span = {"reference": [], "image": []}
    closed_spans = {"reference": [], "image": []}

    def close(kind):
        # Turn the tokens gathered so far into text and start a fresh span
        if open_span[kind]:
            closed_spans[kind].append(tokenizer.convert_tokens_to_string(open_span[kind]))
            open_span[kind] = []

    for token, label in zip(tokens, labels):
        if label not in routing:
            continue
        joins, closes = routing[label]
        close(closes)
        if token not in skipped_tokens:
            open_span[joins].append(token)

    close("reference")
    close("image")

    return " ".join(closed_spans["reference"]), " ".join(closed_spans["image"])

def _ordered_sections(text, headings):
    """
    Cut text into one section per heading, following document order.

    Parameters:
    text (str): Text that contains the headings.
    headings (list): Heading strings in the order they occur in text.

    Returns:
    list: (heading, section_text) tuples. A section starts at its heading
    and stops where the next located heading starts (or at the end of text).
    """
    located = []
    cursor = 0
    for heading in headings:
        # Resume after the previous hit so that a repeated heading resolves
        # to its next occurrence instead of always to the first one.
        start = text.find(heading, cursor)
        if start == -1:
            continue
        located.append((start, heading))
        cursor = start + max(len(heading), 1)
    ends = [start for start, _ in located[1:]] + [len(text)]
    return [(heading, text[start:end]) for (start, heading), end in zip(located, ends)]


def _artifact_type_sections(text, artifact_types):
    """
    Cut text into one section per artifact-type heading occurrence.

    Parameters:
    text (str): Text that contains the headings.
    artifact_types (set): Heading strings, in no particular order.

    Returns:
    list: (artifact_type, section_text) tuples sorted by position. A section
    starts at its heading and stops where the nearest following heading of
    any type starts (or at the end of text).
    """
    located = []
    for artifact_type in artifact_types:
        # extract_artifact_types only reports headings with "\n\n" on both
        # sides, so look for that exact framing; every occurrence counts.
        needle = f"\n\n{artifact_type}\n\n"
        hit = text.find(needle)
        while hit != -1:
            located.append((hit + 2, artifact_type))
            hit = text.find(needle, hit + 1)
    located.sort()
    ends = [start for start, _ in located[1:]] + [len(text)]
    return [(artifact_type, text[start:end]) for (start, artifact_type), end in zip(located, ends)]


def _lookup_or_add(name, registry, rows, id_field, name_field, next_id):
    """
    Return (row_id, next_id) for name, adding a row the first time it is seen.
    """
    if name not in registry:
        registry[name] = next_id
        rows.append({id_field: next_id, name_field: name})
        next_id += 1
    return registry[name], next_id


def extract_artifacts_from_text(doc_path, artifact_files):
    """
    Extract artifacts from a document and structure them into a database format.

    The document text is cut into chapters, each chapter into painter
    sections, each painter section into artifact-type sections, and each of
    those into entries. Only entries whose stripped text equals a key of
    artifact_files become artifacts.

    Side effects: matched keys are removed from artifact_files, and the
    module-level counters and name dictionaries are updated. Because the
    name dictionaries outlive a call while the returned tables do not, a
    second call in the same process will not re-list names registered by
    the first call.

    Parameters:
    doc_path (str): Path to the document containing artifact data.
    artifact_files (dict): Dictionary mapping file content to file paths.

    Returns:
    dict: A dictionary representing the structured database.

    Raises:
    FileNotFoundError: If doc_path does not exist.
    """
    global artifact_counter, artifact_type_counter, painter_counter, location_counter, collection_counter, image_counter, chapter_counter

    if not os.path.exists(doc_path):
        raise FileNotFoundError(f"File not found: {doc_path}")

    doc = docx.Document(doc_path)
    full_text = "\n".join(para.text for para in doc.paragraphs)

    artifacts = []
    types_of_artifacts = []
    painters = []
    locations = []
    collections = []
    images = []
    chapters = []
    artifact_type_relations = []
    artifact_painter_relations = []
    artifact_location_relations = []
    artifact_collection_relations = []
    artifact_chapter_relations = []
    artifact_image_relations = []

    for chapter, chapter_text in _ordered_sections(full_text, extract_chapter_names(full_text)):
        chapter_name = chapter.split('\n')[0]

        for painter, painter_text in _ordered_sections(chapter_text, extract_painter_names(chapter_text)):
            type_sections = _artifact_type_sections(painter_text, extract_artifact_types(painter_text))

            for artifact_type, type_text in type_sections:
                for raw_entry in extract_artifact_entries(type_text):
                    entry = raw_entry.strip()
                    # Only entries backed by an artifact file are kept
                    if not entry or entry not in artifact_files:
                        continue

                    # Remove the matched file to prevent duplicates
                    file_path = artifact_files.pop(entry)

                    # extract_dimensions keeps the labels as written ("Ht"/"ht",
                    # "diam"/"Diam"), so look them up case-insensitively.
                    dimensions = extract_dimensions(entry)
                    first_dimensions = (
                        {label.lower(): value for label, value in dimensions[0].items()}
                        if dimensions else {}
                    )
                    height = first_dimensions.get('ht')
                    diameter = first_dimensions.get('diam')

                    collection, location = extract_collection_location(entry)
                    references, image_descriptions = extract_references_and_image_descriptions(entry)

                    # NOTE(review): "publications" receives the image
                    # descriptions and "image_description" the references.
                    # This looks crossed, but it may compensate for the
                    # model's label order - confirm before changing.
                    artifacts.append({
                        "artifact_id": artifact_counter,
                        "fabric_name": "Paestum",
                        "height": height,
                        "diameter": diameter,
                        "publications": image_descriptions,
                        "image_description": references,
                        "date": None
                    })

                    # Register each related name once and link it to the artifact
                    if artifact_type:
                        type_id, artifact_type_counter = _lookup_or_add(
                            artifact_type, artifact_type_dict, types_of_artifacts,
                            "artifact_type_id", "artifact_type_name", artifact_type_counter)
                        artifact_type_relations.append({
                            "artifact_id": artifact_counter,
                            "artifact_type_id": type_id
                        })

                    if painter:
                        painter_id, painter_counter = _lookup_or_add(
                            painter, painter_dict, painters,
                            "painter_id", "painter_name", painter_counter)
                        artifact_painter_relations.append({
                            "artifact_id": artifact_counter,
                            "painter_id": painter_id
                        })

                    if location:
                        location_id, location_counter = _lookup_or_add(
                            location, location_dict, locations,
                            "location_id", "location_name", location_counter)
                        artifact_location_relations.append({
                            "artifact_id": artifact_counter,
                            "location_id": location_id
                        })

                    if collection:
                        collection_id, collection_counter = _lookup_or_add(
                            collection, collection_dict, collections,
                            "collection_id", "collection_name", collection_counter)
                        artifact_collection_relations.append({
                            "artifact_id": artifact_counter,
                            "collection_id": collection_id
                        })

                    if chapter_name:
                        chapter_id, chapter_counter = _lookup_or_add(
                            chapter_name, chapter_dict, chapters,
                            "chapter_id", "chapter_name", chapter_counter)
                        artifact_chapter_relations.append({
                            "artifact_id": artifact_counter,
                            "chapter_id": chapter_id
                        })

                    # Handle images: same base name as the text file, with a
                    # .jpg extension. Only the extension is swapped, so a
                    # ".txt" elsewhere in the path is left alone.
                    image_path = os.path.splitext(file_path)[0] + '.jpg'
                    if os.path.exists(image_path):
                        images.append({
                            "image_id": image_counter,
                            "artifact_id": artifact_counter,  # Assuming each artifact has a unique image
                            "image_path": image_path
                        })
                        artifact_image_relations.append({
                            "artifact_id": artifact_counter,
                            "image_id": image_counter
                        })
                        image_counter += 1

                    artifact_counter += 1

    database = {
        "Artifacts": artifacts,
        "TypeOfArtifact": types_of_artifacts,
        "Painter": painters,
        "Locations": locations,
        "Collection": collections,
        "Images": images,
        "Chapters": chapters,
        "Artifact_TypeOfArtifact": artifact_type_relations,
        "Artifact_Painter": artifact_painter_relations,
        "Artifact_Provenances": artifact_location_relations,
        "Artifact_Collection": artifact_collection_relations,
        "Artifact_Chapter": artifact_chapter_relations,
        "Artifact_Image": artifact_image_relations
    }

    return database

# Input locations - adjust these to your environment
doc_path = "/content/Finalised Text Extraction.docx"
folder_path = "/content/drive/MyDrive/Trendall Project Files/Trendall Project/AllFiles"  # Replace with your actual folder path

# Load the per-artifact text files, keyed by their content
artifact_files = read_artifact_files(folder_path)

# Build the database structure from the document
database = extract_artifacts_from_text(doc_path, artifact_files)

# Serialize the database as indented JSON and write it out
with open("DataForMongoDB.json", "w") as json_file:
    json_file.write(json.dumps(database, indent=4))

print("Database data saved to DataForMongoDB.json")
