<a href="https://colab.research.google.com/github/anokhina-rgb/Multilingual-Corpus-for-EU-Studies/blob/main/Plagiarism_Checker_GUI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import docx
import openpyxl
from bs4 import BeautifulSoup
import re
import os
from difflib import SequenceMatcher

class TranslationFileAligner(tk.Tk):
    """
    A GUI application for aligning, formatting translation files, and checking for plagiarism.
    """
    def __init__(self):
        """
        Builds the main window: title, size, a notebook, and its three tabs.
        """
        super().__init__()

        self.title("Translation File Aligner and Formatter")
        self.geometry("800x600")

        # The notebook fills the whole window; each builder adds one tab to it.
        self.notebook = ttk.Notebook(self)
        self.notebook.pack(fill=tk.BOTH, expand=True)

        tab_builders = (
            self.create_auto_format_tab,
            self.create_align_files_tab,
            self.create_plagiarism_checker_tab,
        )
        for build_tab in tab_builders:
            build_tab()

    def create_auto_format_tab(self):
        """
        Builds the "Auto Format" tab: input/output folder pickers, a run
        button, a progress bar and a status label.
        """
        tab = ttk.Frame(self.notebook)
        self.auto_format_tab = tab
        self.notebook.add(tab, text="Auto Format")

        # One row per folder: caption, entry bound to a StringVar, Browse button.
        self.input_folder_path = tk.StringVar()
        self.output_folder_path = tk.StringVar()
        folder_rows = (
            ("Input Folder:", self.input_folder_path, self.select_input_folder),
            ("Output Folder:", self.output_folder_path, self.select_output_folder),
        )
        for caption, variable, on_browse in folder_rows:
            row = ttk.Frame(tab)
            row.pack(pady=10, padx=10, fill=tk.X)
            ttk.Label(row, text=caption).pack(side=tk.LEFT)
            entry = ttk.Entry(row, textvariable=variable, width=50)
            entry.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            ttk.Button(row, text="Browse", command=on_browse).pack(side=tk.LEFT)

        # Starts the batch conversion.
        ttk.Button(tab, text="Run Conversion", command=self.run_conversion).pack(pady=20)

        # Batch progress, reset to empty.
        self.progress_bar = ttk.Progressbar(tab, mode='determinate', length=300)
        self.progress_bar.pack(pady=10)
        self.progress_bar.configure(value=0)

        # Textual status shown below the progress bar.
        self.status_label = ttk.Label(tab, text="Ready")
        self.status_label.pack(pady=10)

    def create_align_files_tab(self):
        """
        Builds the "Align Two Files" tab: source/target file pickers, an
        align-and-export button, a status label and a progress bar.
        """
        tab = ttk.Frame(self.notebook)
        self.align_files_tab = tab
        self.notebook.add(tab, text="Align Two Files")

        # One row per file: caption, entry bound to a StringVar, Browse button.
        self.source_file_path = tk.StringVar()
        self.target_file_path = tk.StringVar()
        file_rows = (
            ("Source File:", self.source_file_path, self.select_source_file),
            ("Target File:", self.target_file_path, self.select_target_file),
        )
        for caption, variable, on_browse in file_rows:
            row = ttk.Frame(tab)
            row.pack(pady=10, padx=10, fill=tk.X)
            ttk.Label(row, text=caption).pack(side=tk.LEFT)
            entry = ttk.Entry(row, textvariable=variable, width=50)
            entry.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            ttk.Button(row, text="Browse", command=on_browse).pack(side=tk.LEFT)

        # Runs the alignment and writes the export files.
        ttk.Button(tab, text="Align and Export", command=self.align_and_export).pack(pady=20)

        # Textual status for the alignment run.
        self.align_status_label = ttk.Label(tab, text="Ready")
        self.align_status_label.pack(pady=10)

        # Alignment progress; value and maximum both start at zero.
        self.align_progress_bar = ttk.Progressbar(tab, mode='determinate', length=300)
        self.align_progress_bar.pack(pady=10)
        self.align_progress_bar.configure(value=0)
        self.align_progress_bar.configure(maximum=0)

    def create_plagiarism_checker_tab(self):
        """
        Builds the "Plagiarism Checker" tab: two file pickers, a check button,
        a result label and a progress bar.
        """
        tab = ttk.Frame(self.notebook)
        self.plagiarism_checker_tab = tab
        self.notebook.add(tab, text="Plagiarism Checker")

        # One row per text: caption, entry bound to a StringVar, Browse button.
        self.text1_path = tk.StringVar()
        self.text2_path = tk.StringVar()
        file_rows = (
            ("Text 1 File:", self.text1_path, self.select_text1_file),
            ("Text 2 File:", self.text2_path, self.select_text2_file),
        )
        for caption, variable, on_browse in file_rows:
            row = ttk.Frame(tab)
            row.pack(pady=10, padx=10, fill=tk.X)
            ttk.Label(row, text=caption).pack(side=tk.LEFT)
            entry = ttk.Entry(row, textvariable=variable, width=50)
            entry.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            ttk.Button(row, text="Browse", command=on_browse).pack(side=tk.LEFT)

        # Runs the comparison of the two selected files.
        ttk.Button(tab, text="Check Plagiarism", command=self.check_plagiarism).pack(pady=20)

        # Shows the similarity result or an error message.
        self.plagiarism_result_label = ttk.Label(tab, text="Ready")
        self.plagiarism_result_label.pack(pady=10)

        # Progress indicator; value and maximum both start at zero.
        self.plagiarism_progress_bar = ttk.Progressbar(tab, mode='determinate', length=300)
        self.plagiarism_progress_bar.pack(pady=10)
        self.plagiarism_progress_bar.configure(value=0)
        self.plagiarism_progress_bar.configure(maximum=0)

    def select_input_folder(self):
        """
        Opens a dialog to select the input folder.

        The chosen path is stored in ``self.input_folder_path``. If the dialog
        is cancelled, the previously selected folder is left untouched.
        """
        folder_path = filedialog.askdirectory()
        # A cancelled dialog yields an empty value; do not wipe the current choice.
        if folder_path:
            self.input_folder_path.set(folder_path)

    def select_output_folder(self):
        """
        Opens a dialog to select the output folder.

        The chosen path is stored in ``self.output_folder_path``. If the dialog
        is cancelled, the previously selected folder is left untouched.
        """
        folder_path = filedialog.askdirectory()
        # A cancelled dialog yields an empty value; do not wipe the current choice.
        if folder_path:
            self.output_folder_path.set(folder_path)

    def select_source_file(self):
        """
        Opens a dialog to select the source file.

        The chosen path is stored in ``self.source_file_path``. If the dialog
        is cancelled, the previously selected file is left untouched.
        """
        file_path = filedialog.askopenfilename(filetypes=[("All Files", "*.*")])
        # A cancelled dialog yields an empty value; do not wipe the current choice.
        if file_path:
            self.source_file_path.set(file_path)

    def select_target_file(self):
        """
        Opens a dialog to select the target file.

        The chosen path is stored in ``self.target_file_path``. If the dialog
        is cancelled, the previously selected file is left untouched.
        """
        file_path = filedialog.askopenfilename(filetypes=[("All Files", "*.*")])
        # A cancelled dialog yields an empty value; do not wipe the current choice.
        if file_path:
            self.target_file_path.set(file_path)

    def select_text1_file(self):
        """
        Opens a dialog to select the first text file.

        The chosen path is stored in ``self.text1_path``. If the dialog is
        cancelled, the previously selected file is left untouched.
        """
        file_path = filedialog.askopenfilename(filetypes=[("All Files", "*.*")])
        # A cancelled dialog yields an empty value; do not wipe the current choice.
        if file_path:
            self.text1_path.set(file_path)

    def select_text2_file(self):
        """
        Opens a dialog to select the second text file.

        The chosen path is stored in ``self.text2_path``. If the dialog is
        cancelled, the previously selected file is left untouched.
        """
        file_path = filedialog.askopenfilename(filetypes=[("All Files", "*.*")])
        # A cancelled dialog yields an empty value; do not wipe the current choice.
        if file_path:
            self.text2_path.set(file_path)

    def run_conversion(self):
        """
        Runs the batch conversion process.

        Every ``.docx``, ``.txt`` and ``.pdf`` file directly inside the input
        folder is passed to ``process_file``. The output folder is created if
        it does not exist. A file that fails to convert is recorded and
        reported at the end instead of aborting the remaining files.
        """
        input_folder = self.input_folder_path.get()
        output_folder = self.output_folder_path.get()

        if not input_folder or not output_folder:
            messagebox.showerror("Error", "Please select both input and output folders.")
            return

        # The entry fields are editable, so the path may not exist.
        if not os.path.isdir(input_folder):
            messagebox.showerror("Error", f"Input folder does not exist: {input_folder}")
            return

        try:
            os.makedirs(output_folder, exist_ok=True)
        except OSError as e:
            messagebox.showerror("Error", f"Cannot create output folder: {e}")
            return

        # Sorted so the processing order is deterministic across platforms.
        files = sorted(f for f in os.listdir(input_folder) if f.endswith(('.docx', '.txt', '.pdf')))
        total_files = len(files)
        if total_files == 0:
            self.status_label.config(text="No supported files found.")
            messagebox.showinfo("Nothing to Convert", "No .docx, .txt or .pdf files were found in the input folder.")
            return

        self.progress_bar['maximum'] = total_files
        self.progress_bar['value'] = 0
        self.status_label.config(text=f"Processing 0 of {total_files} files...")
        self.update_idletasks()

        failures = []
        for i, file_name in enumerate(files):
            input_file_path = os.path.join(input_folder, file_name)
            try:
                self.process_file(input_file_path, output_folder)
            except Exception as e:
                # UI boundary: remember the failure and keep converting the rest.
                failures.append(f"{file_name}: {e}")
            self.progress_bar['value'] = i + 1
            self.status_label.config(text=f"Processed {i + 1} of {total_files} files...")
            self.update_idletasks()

        if failures:
            self.status_label.config(text=f"Conversion finished with {len(failures)} error(s).")
            messagebox.showwarning(
                "Conversion Finished",
                f"{total_files - len(failures)} of {total_files} files were processed.\n\n"
                "Failed:\n" + "\n".join(failures),
            )
        else:
            self.status_label.config(text="Conversion complete.")
            messagebox.showinfo("Conversion Complete", "All files have been processed.")
        self.progress_bar['value'] = 0

    def process_file(self, input_file_path, output_folder):
        """
        Processes a single file, applying formatting and saving the output.

        Args:
            input_file_path (str): Path to the input file.
            output_folder (str): Path to the output folder.
        """
        if input_file_path.endswith('.docx'):
            text = self.read_docx(input_file_path)
        elif input_file_path.endswith('.txt'):
            text = self.read_txt(input_file_path)
        elif input_file_path.endswith('.pdf'):
            text = self.read_pdf(input_file_path)
        else:
            return

        formatted_text = self.format_text(text)
        base_name = os.path.splitext(os.path.basename(input_file_path))[0]
        docx_path = os.path.join(output_folder, f"{base_name}_formatted.docx")
        txt_path = os.path.join(output_folder, f"{base_name}_formatted.txt")
        tmx_path = os.path.join(output_folder, f"{base_name}_formatted.tmx")
        xlsx_path = os.path.join(output_folder, f"{base_name}_formatted.xlsx")

        self.save_docx(formatted_text, docx_path)
        self.save_txt(formatted_text, txt_path)
        self.save_tmx(formatted_text, formatted_text, tmx_path)
        self.save_xlsx(formatted_text, formatted_text, xlsx_path)

    def read_docx(self, file_path):
        """
        Returns the text of a DOCX file, one line per document paragraph.
        """
        document = docx.Document(file_path)
        # Paragraph texts are joined with single newlines.
        return '\n'.join(para.text for para in document.paragraphs)

    def read_txt(self, file_path):
        """
        Reads text from a UTF-8 encoded TXT file.

        Args:
            file_path (str): Path to the text file.

        Returns:
            str: The file content, without a leading byte-order mark.
        """
        # 'utf-8-sig' decodes plain UTF-8 identically but drops a leading BOM,
        # which would otherwise end up in the text as '\ufeff'.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()

    def read_pdf(self, file_path):
        """
        Reads text from a PDF file.
        Requires PyPDF2.

        Args:
            file_path (str): Path to the PDF file.

        Returns:
            str: The extracted text of all pages concatenated, or an empty
            string (after showing an error dialog) when PyPDF2 is not installed.
        """
        # Only the import is guarded, so an ImportError raised while reading
        # a file is not misreported as "PyPDF2 is missing".
        try:
            import PyPDF2
        except ImportError:
            messagebox.showerror("Error", "PyPDF2 is required to read PDF files. Please install it (pip install PyPDF2).")
            return ""

        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # extract_text() may yield nothing for a page; treat that as "".
            return "".join(page.extract_text() or "" for page in reader.pages)

    def format_text(self, text):
        """
        Applies the formatting rules to the text.

        Steps, in order: strip HTML markup (best effort), straighten curly
        quotes, turn ``[...]`` into ``「...」`` and ``(...)`` into ``（...）``,
        turn ``--`` into an em dash, put a newline before every ``「`` and
        ``」``, collapse runs of three or more newlines to two, and trim
        surrounding whitespace.

        Args:
            text (str): The raw text.

        Returns:
            str: The formatted text.
        """
        try:
            text = BeautifulSoup(text, "html.parser").get_text()
        except Exception:
            # Best effort: if markup stripping fails, format the raw text.
            # (Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.)
            pass

        # Curly quotes -> straight quotes in a single pass.
        text = text.translate(str.maketrans({"’": "'", "‘": "'", "“": '"', "”": '"'}))
        text = re.sub(r"\[(.*?)\]", r"「\1」", text)
        text = re.sub(r"\((.*?)\)", r"（\1）", text)
        text = text.replace("--", "—")
        # NOTE(review): the newline is inserted BEFORE both brackets, so a
        # closing 」 starts its own line; confirm this is intended and not a
        # newline meant to follow the closing bracket.
        text = re.sub(r"([「」])", r"\n\1", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    def save_docx(self, text, file_path):
        """
        Writes the text to a DOCX file, one Word paragraph per
        blank-line-separated block.
        """
        output_doc = docx.Document()
        blocks = text.split('\n\n')
        for block in blocks:
            output_doc.add_paragraph(block)
        output_doc.save(file_path)

    def save_txt(self, text, file_path):
        """
        Writes the text to a UTF-8 encoded TXT file, replacing any existing file.
        """
        with open(file_path, mode='w', encoding='utf-8') as out_file:
            out_file.write(text)

    def save_tmx(self, source_text, target_text, file_path, source_lang="en", target_lang="uk"):
        """
        Saves aligned text to a TMX file.

        Both texts are split on blank lines and paired by position; when the
        segment counts differ, only the first ``min(len)`` pairs are written
        and a warning is printed.

        Args:
            source_text (str): Source text, segments separated by blank lines.
            target_text (str): Target text, segments separated by blank lines.
            file_path (str): Path of the TMX file to write.
            source_lang (str): Language code for source segments (default "en").
            target_lang (str): Language code for target segments (default "uk").
        """
        # Characters outside the XML 1.0 character range (e.g. form feeds from
        # PDF extraction) would make the whole file ill-formed, so drop them.
        illegal_xml_chars = re.compile(
            r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
        )

        def to_xml(value, attribute=False):
            # '&' must be replaced first so the other entities are not re-escaped.
            value = illegal_xml_chars.sub("", value)
            value = value.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            if attribute:
                value = value.replace('"', '&quot;')
            return value

        src_lang = to_xml(source_lang, attribute=True)
        tgt_lang = to_xml(target_lang, attribute=True)

        source_segments = source_text.split('\n\n')
        target_segments = target_text.split('\n\n')
        min_segments = min(len(source_segments), len(target_segments))

        if len(source_segments) != len(target_segments):
            print(f"Warning: Source and target texts have different number of segments in {file_path}.  Attempting to align the first {min_segments} segments.")

        # The TMX 1.4 header requires creationtoolversion, adminlang and srclang
        # in addition to the attributes that were already written.
        parts = [
            '<?xml version="1.0" encoding="UTF-8"?>\n',
            '<tmx version="1.4">\n',
            '  <header creationtool="TranslationFileAligner" creationtoolversion="1.0" '
            'datatype="plaintext" segtype="sentence" adminlang="en" '
            f'srclang="{src_lang}" o-tmf="UTF-8"/>\n',
            '  <body>\n',
        ]
        # zip() stops at the shorter list, i.e. after min_segments pairs.
        for source_segment, target_segment in zip(source_segments, target_segments):
            parts.append(
                '    <tu>\n'
                f'      <tuv xml:lang="{src_lang}">\n'
                f'        <seg>{to_xml(source_segment)}</seg>\n'
                '      </tuv>\n'
                f'      <tuv xml:lang="{tgt_lang}">\n'
                f'        <seg>{to_xml(target_segment)}</seg>\n'
                '      </tuv>\n'
                '    </tu>\n'
            )
        parts.append('  </body>\n</tmx>\n')

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write("".join(parts))

    def save_xlsx(self, source_text, target_text, file_path):
        """
        Writes source/target segments side by side into an Excel workbook.

        Both texts are split on blank lines and paired by position; surplus
        segments of the longer text are not written.
        """
        book = openpyxl.Workbook()
        alignment_sheet = book.active
        alignment_sheet.title = "Alignment"
        alignment_sheet.append(["Source", "Target"])

        # zip() stops at the shorter of the two segment lists.
        paired_segments = zip(source_text.split('\n\n'), target_text.split('\n\n'))
        for source_segment, target_segment in paired_segments:
            alignment_sheet.append([source_segment, target_segment])

        book.save(file_path)

    def align_and_export(self):
        """
        Aligns the selected source and target files and exports the alignment.

        Reads both files, runs ``align_segments`` over their blank-line
        separated segments, and writes ``aligned.tmx`` and ``aligned.xlsx``
        into the current working directory. Read and export errors are shown
        in a dialog and in the status label instead of propagating.
        """
        source_file = self.source_file_path.get()
        target_file = self.target_file_path.get()

        if not source_file or not target_file:
            messagebox.showerror("Error", "Please select both source and target files.")
            self.align_status_label.config(text="Error: Please select both source and target files.")
            return

        try:
            source_text = self.read_file_content(source_file)
            target_text = self.read_file_content(target_file)
        except Exception as e:
            messagebox.showerror("Error", f"Error reading files: {e}")
            self.align_status_label.config(text=f"Error: {e}")
            return

        # Perform the alignment (using sequence matching)
        source_segments = self.get_segments(source_text)
        target_segments = self.get_segments(target_text)

        self.align_progress_bar['maximum'] = len(source_segments) + len(target_segments)
        self.align_progress_bar['value'] = 0
        self.align_status_label.config(text=f"Aligning 0 of {self.align_progress_bar['maximum']} segments...")
        self.update_idletasks()

        # Absolute paths, so the messages tell the user where the files went.
        base_name = "aligned"
        tmx_path = os.path.abspath(f"{base_name}.tmx")
        xlsx_path = os.path.abspath(f"{base_name}.xlsx")

        try:
            # NOTE(review): the pairs returned here are not exported; the
            # writers below pair segments by position in the original texts.
            # Confirm whether the similarity-based pairs should be exported.
            self.align_segments(source_segments, target_segments)

            self.save_tmx(source_text, target_text, tmx_path)
            self.save_xlsx(source_text, target_text, xlsx_path)
        except Exception as e:
            # UI boundary: report the failure instead of letting it escape
            # into the Tk callback handler.
            messagebox.showerror("Error", f"Error exporting alignment: {e}")
            self.align_status_label.config(text=f"Error: {e}")
            return
        finally:
            self.align_progress_bar['value'] = 0

        self.align_status_label.config(text=f"Alignment complete. Exported to {tmx_path} and {xlsx_path}")
        messagebox.showinfo("Alignment Complete", f"Files aligned and exported to {tmx_path} and {xlsx_path}")

    def read_file_content(self, file_path):
        """
        Reads the content of a file, handling different file types.

        Args:
            file_path (str): Path to the file.

        Returns:
            str: The content of the file.
        """
        if file_path.endswith('.docx'):
            return self.read_docx(file_path)
        elif file_path.endswith('.txt'):
            return self.read_txt(file_path)
        elif file_path.endswith('.pdf'):
            return self.read_pdf(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_path}")

    def get_segments(self, text):
        """
        Returns the blank-line-separated segments of the text, in order.

        Empty segments are kept, so an empty text yields one empty segment.
        """
        return text.split('\n\n')

    def align_segments(self, source_segments, target_segments):
        """
        Aligns source and target segments based on similarity.
        """
        aligned_pairs = []
        source_indices = list(range(len(source_segments)))
        target_indices = list(range(len(target_segments)))

        while source_indices and target_indices:
            best_match_score = 0
            best_source_index = None
            best_target_index = None

            for i in source_indices:
                for j in target_indices:
                    similarity = SequenceMatcher(None, source_segments[i], target_segments[j]).ratio()
                    self.align_progress_bar['value'] += 1
                    self.align_status_label.config(text=f"Aligning {self.align_progress_bar['value']} of {self.align_progress_bar['maximum']} segments...")
                    self.update_idletasks()
                    if similarity > best_match_score:
                        best_match_score = similarity
                        best_source_index = i
                        best_target_index = j

            if best_source_index is not None and best_target_index is not None and best_match_score > 0.2:
                aligned_pairs.append((source_segments[best_source_index], target_segments[best_target_index]))
                source_indices.remove(best_source_index)
                target_indices.remove(best_target_index)
            else:
                if source_indices:
                  aligned_pairs.append((source_segments[source_indices.pop(0)], ""))
                if target_indices:
                   aligned_pairs.append(("", target_segments[target_indices.pop(0)]))
        while source_indices:
             aligned_pairs.append((source_segments[source_indices.pop(0)], ""))
        while target_indices:
             aligned_pairs.append(("", target_segments[target_indices.pop(0)]))
        return aligned_pairs

    def check_plagiarism(self):
        """
        Checks for plagiarism between two text files.

        Reads both selected files and reports the character-level
        ``SequenceMatcher`` ratio as a percentage in the result label and in
        a dialog. If either file yields no text, a warning is shown instead
        of a similarity value.
        """
        text1_file = self.text1_path.get()
        text2_file = self.text2_path.get()

        if not text1_file or not text2_file:
            messagebox.showerror("Error", "Please select both text files.")
            self.plagiarism_result_label.config(text="Error: Please select both text files.")
            return

        try:
            text1_content = self.read_file_content(text1_file)
            text2_content = self.read_file_content(text2_file)
        except Exception as e:
            messagebox.showerror("Error", f"Error reading files: {e}")
            self.plagiarism_result_label.config(text=f"Error: {e}")
            return

        # Two empty texts compare as 100% similar, which would be a false
        # positive (e.g. when no text could be extracted from a PDF).
        if not text1_content.strip() or not text2_content.strip():
            messagebox.showwarning("Warning", "No text could be extracted from one or both files; nothing to compare.")
            self.plagiarism_result_label.config(text="Error: No text to compare.")
            return

        # The bar is created with maximum 0; give it a real scale first.
        self.plagiarism_progress_bar['maximum'] = 100

        # Perform plagiarism check (using sequence matching)
        # NOTE(review): difflib's default autojunk heuristic ignores items
        # that are very frequent in long sequences, which can skew the ratio
        # for long texts; consider autojunk=False or word-level comparison
        # after checking the runtime cost.
        similarity = SequenceMatcher(None, text1_content, text2_content).ratio()
        self.plagiarism_progress_bar['value'] = 100
        self.plagiarism_result_label.config(text=f"Similarity: {similarity:.2%}")
        self.update_idletasks()
        messagebox.showinfo("Plagiarism Check Result", f"Similarity: {similarity:.2%}")
        self.plagiarism_progress_bar['value'] = 0

if __name__ == "__main__":
    # Launch the GUI only when executed directly: build the main window and
    # hand control to Tk's event loop.
    TranslationFileAligner().mainloop()