In [11]:
# Print the current working directory of the running session
# (the Estonian label reads "Current working directory:").
import os
print("Praegune töökaust:", os.getcwd())


Praegune töökaust: C:\Users\Kasutaja\Desktop\keeletehnoloogia_2024\eksam2


In [13]:
# Switch the process working directory to a hard-coded, machine-specific
# Windows path and print it (label: "New working directory:").
# NOTE(review): presumably done so that relative file names used later in the
# session resolve inside this folder — confirm, and adjust the path per machine.
os.chdir("C:\\Users\\Kasutaja\\Desktop\\helle")
print("Uus töökaust:", os.getcwd())

Uus töökaust: C:\Users\Kasutaja\Desktop\helle


In [None]:
import os
import json
import csv
import base64
import logging
import google.generativeai as genai
from datetime import datetime
from typing import Dict, List, Tuple
import io

# Set your API key as an environment variable (or directly in code for testing ONLY).
# setdefault() keeps a GOOGLE_API_KEY that is already exported in the environment,
# so the placeholder below never overwrites a real key; it is only used as a
# fallback when the variable is missing.
os.environ.setdefault("GOOGLE_API_KEY", "my-api-key-here")  # Replace with your actual key

# Configure the API client
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

class DictionaryCorrector:
    """Correct CSV transcriptions of dictionary pages by comparing them with page images.

    For every PNG page the matching slice of CSV rows is sent, together with
    the page image, to a Gemini model. The model's JSON answer is parsed into
    corrections, new entries and uncertain items, which are applied to the
    in-memory rows before the result is written to a new CSV file.
    """

    # Name of the Gemini model used for the comparison.
    model_name = "models/gemini-2.0-flash-exp"

    # Output contract appended to every prompt. The key names must stay in sync
    # with parse_gemini_response(), apply_corrections() and apply_new_entries().
    response_format = """Respond with a single JSON object and nothing else, using exactly this structure:
{
  "corrections": [
    {
      "original_headword": "<estonian_headword value of the CSV entry being corrected>",
      "corrected_entry": {"<CSV field name>": "<corrected value>"},
      "explanation": "<error found>",
      "confidence": "High|Medium|Low"
    }
  ],
  "new_entries": [
    {"new_entry": {"<CSV field name>": "<value>"}}
  ],
  "uncertain": ["<description of anything that needs human review>"]
}
Use empty lists for categories that have no items."""

    def __init__(self):
        """Configure logging and store the system prompt sent with every request."""
        self.setup_logging()
        self.system_prompt = """You are an expert in historical linguistics and lexicography, specifically focused on correcting entries from a 1732 Estonian-German dictionary. Compare the CSV entries with the original dictionary image and identify any discrepancies or errors. Focus on:

1. Spelling accuracy (including special characters and diacritics) - do not insert the letter "õ" where it is not originally present.
2. Completeness of entries
3. Correct segmentation of information across fields
4. Proper handling of examples and their translations
5. Accurate recording of grammatical information
6. Do not alter letter shapes - retain the original letter forms as recorded in the dictionary.

For each correction:
1. Document the original entry
2. Explain the error found
3. Provide the corrected version
4. Note the confidence level of the correction (High/Medium/Low)"""

    def setup_logging(self):
        """Set up root logging (file + console) once and bind ``self.logger``.

        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so the handlers are only constructed in that case too;
        otherwise every new instance would open another handle on the log file.
        The log file is written as UTF-8 so that historical letter forms do
        not fail to encode under a legacy platform code page.
        """
        log_format = '%(asctime)s - %(levelname)s\nLocation: %(pathname)s:%(lineno)d\nMessage: %(message)s\n'
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format=log_format,
                handlers=[
                    logging.FileHandler('correction-log.log', encoding='utf-8'),
                    logging.StreamHandler(),
                ],
            )
        self.logger = logging.getLogger(__name__)

    def get_entries_for_page(self, entries: List[Dict], png_filename: str,
                             avg_entries_per_page: int = 50, overlap: int = 15) -> List[Dict]:
        """Return the slice of ``entries`` estimated to belong to one page image.

        The page number is the text after the last ``_`` in the file name,
        with the extension removed (any letter case). The slice starts at
        ``(page - 1) * avg_entries_per_page`` and extends ``overlap`` rows past
        the nominal page end.

        Args:
            entries: All CSV rows.
            png_filename: Image file name such as ``scan_12.png``.
            avg_entries_per_page: Estimated number of rows per page.
            overlap: Extra rows included after the estimated page end.

        Returns:
            The estimated rows for the page. If the page number cannot be
            determined, the error is logged and all ``entries`` are returned
            as a best-effort fallback.
        """
        try:
            # splitext() removes the extension regardless of case (.png / .PNG).
            page_str = os.path.splitext(png_filename)[0].split('_')[-1]
            if not page_str.isdigit():
                raise ValueError(f"Invalid page number format in filename: {png_filename}")
            page_num = int(page_str)

            start_idx = max(0, (page_num - 1) * avg_entries_per_page)
            end_idx = min(len(entries), page_num * avg_entries_per_page + overlap)

            page_entries = entries[start_idx:end_idx]

            if page_entries:
                # .get() keeps a missing column from turning a log line into
                # the "return everything" fallback below.
                first_headword = page_entries[0].get('estonian_headword')
                last_headword = page_entries[-1].get('estonian_headword')
                self.logger.info(f"Page {page_num}: Processing entries {start_idx} to {end_idx}\nFirst entry: {first_headword}\nLast entry: {last_headword}")
            else:
                self.logger.warning(f"No entries found for page {page_num}")

            return page_entries

        except (ValueError, TypeError, AttributeError) as e:
            self.logger.error(f"Error determining page entries: {str(e)}")
            return entries

    def read_csv(self, filepath: str) -> List[Dict]:
        """Read a CSV file with a header row into a list of row dicts.

        The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
        is dropped instead of becoming part of the first column name.

        Raises:
            Exception: Any error from opening or parsing is logged and re-raised.
        """
        try:
            # newline='' lets the csv module handle line endings itself.
            with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
                return list(csv.DictReader(f))
        except Exception as e:
            self.logger.error(f"Error reading CSV file: {str(e)}")
            raise

    def encode_image(self, image_path: str) -> str:
        """Return the content of ``image_path`` as a base64-encoded ASCII string.

        Raises:
            Exception: Any error from reading the file is logged and re-raised.
        """
        try:
            with open(image_path, 'rb') as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            self.logger.error(f"Error encoding image: {str(e)}")
            raise

    def _build_comparison_prompt(self, csv_entries: List[Dict]) -> str:
        """Assemble the text prompt: system prompt, task, response format and CSV rows."""
        entries_json = json.dumps(csv_entries, indent=2, ensure_ascii=False)
        return (
            f"{self.system_prompt}\n\n"
            "Please compare these CSV entries with the dictionary page image and identify any discrepancies. "
            "If entries are missing, add them to the list in the format provided.\n\n"
            f"{self.response_format}\n\n"
            f"CSV Entries:\n{entries_json}"
        )

    def compare_entries(self, csv_entries: List[Dict], image_path: str) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        """Ask the model to compare CSV rows with a page image.

        The page image is sent together with the prompt as an inline blob.

        Args:
            csv_entries: Rows to check against the page.
            image_path: Path of the page image.

        Returns:
            ``(corrections, new_entries, uncertain)`` as produced by
            ``parse_gemini_response``.

        Raises:
            Exception: Any failure (file access, API error, empty response)
                is logged and re-raised.
        """
        import mimetypes  # local import: only needed here

        try:
            # The blob is passed to the SDK as raw bytes, so the base64 text
            # produced by encode_image() is decoded back before sending.
            image_bytes = base64.b64decode(self.encode_image(image_path))
            mime_type = mimetypes.guess_type(image_path)[0] or "image/png"

            comparison_prompt = self._build_comparison_prompt(csv_entries)

            model = genai.GenerativeModel(self.model_name)
            chat = model.start_chat()
            response = chat.send_message(
                content=[comparison_prompt, {"mime_type": mime_type, "data": image_bytes}]
            )

            if response and response.text:
                return self.parse_gemini_response(response.text)
            raise RuntimeError("API request failed: No response or no text in response.")

        except Exception as e:
            self.logger.error(f"Error in compare_entries: {str(e)}")
            raise

    @staticmethod
    def _as_list(value) -> List:
        """Return ``value`` if it is a list, otherwise an empty list."""
        return value if isinstance(value, list) else []

    def parse_gemini_response(self, response: str) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        """Extract corrections, new entries and uncertain items from model text.

        The whole text is tried as JSON first. If that fails, the span from
        the first ``{`` to the last ``}`` is tried, which covers answers
        wrapped in Markdown fences or surrounding prose.

        Returns:
            ``(corrections, new_entries, uncertain)``. Every element is a list;
            three empty lists are returned when no JSON object can be
            recovered. This method logs problems instead of raising.
        """
        if not isinstance(response, str):
            self.logger.error(f"Error parsing Gemini response: expected str, got {type(response).__name__}")
            return [], [], []

        try:
            result = json.loads(response)
        except json.JSONDecodeError as e:
            self.logger.warning(f"Failed to parse full JSON: {e}")
            json_start = response.find('{')
            json_end = response.rfind('}') + 1
            if json_start == -1 or json_end <= json_start:
                self.logger.error("No JSON found in response")
                return [], [], []
            try:
                result = json.loads(response[json_start:json_end])
            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to extract and parse JSON: {e}")
                return [], [], []
            self.logger.info(f"Extracted JSON: {result}")

        # Valid JSON is not necessarily an object (e.g. a bare list).
        if not isinstance(result, dict):
            self.logger.error(f"Error parsing Gemini response: expected a JSON object, got {type(result).__name__}")
            return [], [], []

        corrections = self._as_list(result.get('corrections'))
        new_entries = self._as_list(result.get('new_entries'))
        uncertain = self._as_list(result.get('uncertain'))
        return corrections, new_entries, uncertain

    def apply_corrections(self, original_entries: List[Dict], corrections: List[Dict]) -> List[Dict]:
        """Return a corrected copy of ``original_entries``.

        Each correction is matched to the first row whose
        ``estonian_headword`` equals the correction's ``original_headword``;
        that row is updated with the fields in ``corrected_entry``. Malformed
        corrections and corrections without a matching row are skipped with a
        warning. Neither the input list nor its row dicts are modified.

        Args:
            original_entries: Current rows.
            corrections: Items with ``original_headword``, ``corrected_entry``
                and optionally ``confidence``.

        Returns:
            A new list of new row dicts with the corrections applied.
        """
        # Copy every row: a shallow list copy would let update() below change
        # the caller's dicts.
        corrected_entries = [dict(entry) for entry in original_entries]
        changes = []

        for correction in corrections:
            if (not isinstance(correction, dict)
                    or 'original_headword' not in correction
                    or not isinstance(correction.get('corrected_entry'), dict)):
                self.logger.warning(f"Skipping malformed correction: {correction!r}")
                continue

            target_headword = correction['original_headword']
            entry_index = next(
                (i for i, entry in enumerate(corrected_entries)
                 if entry.get('estonian_headword') == target_headword),
                None,
            )
            if entry_index is None:
                self.logger.warning(f"No entry found for correction of headword: {target_headword!r}")
                continue

            old_entry = corrected_entries[entry_index].copy()
            corrected_entries[entry_index].update(correction['corrected_entry'])
            changes.append({
                'original': old_entry,
                'corrected': corrected_entries[entry_index],
                'confidence': correction.get('confidence', 'Unknown'),
            })

        self.log_changes(changes)
        return corrected_entries

    def apply_new_entries(self, original_entries: List[Dict], new_entries: List[Dict]) -> List[Dict]:
        """Append the ``new_entry`` payload of each item to ``original_entries``.

        The list is extended in place and also returned. Items that are not a
        dict carrying a dict under ``new_entry`` are skipped with a warning.
        """
        for item in new_entries:
            payload = item.get('new_entry') if isinstance(item, dict) else None
            if not isinstance(payload, dict):
                self.logger.warning(f"Skipping malformed new entry: {item!r}")
                continue
            original_entries.append(payload)
        return original_entries

    def log_changes(self, changes: List[Dict]):
        """Log every applied correction with its original row, new row and confidence."""
        for change in changes:
            self.logger.info(f"\nCorrection applied:\nOriginal entry: {json.dumps(change['original'], ensure_ascii=False)}\nCorrected entry: {json.dumps(change['corrected'], ensure_ascii=False)}\nConfidence: {change['confidence']}\n")

    def process_dictionary(self, csv_path: str, png_folder: str, output_csv: str):
        """Run the correction pipeline over every PNG page in ``png_folder``.

        Pages are processed in sorted file-name order. For each page the
        estimated rows are compared with the image, the returned corrections
        and new entries are applied, and uncertain items are logged for
        review. The final rows are written to ``output_csv`` after the last
        page.

        Raises:
            Exception: Any error is logged and re-raised; in that case the
                output file is not written.
        """
        try:
            entries = self.read_csv(csv_path)
            png_files = sorted(f for f in os.listdir(png_folder) if f.lower().endswith('.png'))

            for png_file in png_files:
                image_path = os.path.join(png_folder, png_file)
                page_entries = self.get_entries_for_page(entries, png_file)
                corrections, new_entries, uncertain = self.compare_entries(page_entries, image_path)

                entries = self.apply_corrections(entries, corrections)
                entries = self.apply_new_entries(entries, new_entries)

                for entry in uncertain:
                    self.logger.warning(f"Uncertain entry needs review: {entry}")

            self.write_csv(entries, output_csv)

        except Exception as e:
            self.logger.error(f"Error processing dictionary: {str(e)}")
            raise

    def write_csv(self, entries: List[Dict], output_path: str):
        """Write ``entries`` to ``output_path`` as UTF-8 CSV with a header row.

        The header is the union of all keys in first-seen order, so rows that
        gained extra fields do not make the writer fail; fields missing from a
        row are written as empty strings. With no entries an empty file is
        created.

        Raises:
            Exception: Any error from opening or writing is logged and re-raised.
        """
        try:
            with open(output_path, 'w', newline='', encoding='utf-8') as f:
                if entries:
                    # dict.fromkeys() de-duplicates while preserving order.
                    fieldnames = list(dict.fromkeys(key for entry in entries for key in entry))
                    writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
                    writer.writeheader()
                    writer.writerows(entries)
        except Exception as e:
            self.logger.error(f"Error writing CSV file: {str(e)}")
            raise

def main():
    """Run the correction pipeline on the default CSV, page folder and output file."""
    settings = {
        "csv_path": "CSV-input.csv",
        "png_folder": "Helle-12-PNG",  # Make sure this folder exists and contains PNGs
        "output_csv": "output-Gem1-mj.csv",
    }
    DictionaryCorrector().process_dictionary(**settings)

# Run the pipeline only when this module is executed as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()