### Comparar con DIFFLIB

In [1]:
import PyPDF2
import difflib
import re

def read_pdf(file_path):
    """Read a PDF file and return the text of all its pages as one string.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The results of ``page.extract_text()`` for every page, concatenated
        in page order with no separator between pages.
    """
    with open(file_path, 'rb') as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Join while the file is still open: pages are read from the stream.
        return ''.join(page.extract_text() for page in pages)

def normalize_text(text):
    """Lower-case ``text`` after replacing symbol runs with single spaces.

    Every run of non-word characters or underscores becomes one space, any
    remaining whitespace run is collapsed to one space, and the result is
    lower-cased. Leading/trailing spaces produced this way are kept.
    """
    return re.sub(r'\s+', ' ', re.sub(r'[\W_]+', ' ', text)).lower()

def compare_texts(text1, text2):
    """Diff two texts and measure how similar they are.

    The diff is computed line by line when the inputs contain line breaks.
    When neither input has more than one line (e.g. text whose whitespace,
    including newlines, was already collapsed to single spaces), a line diff
    would treat each whole document as one unit, so the diff falls back to
    whitespace-separated words instead.

    Args:
        text1: Reference text.
        text2: Text compared against the reference.

    Returns:
        A tuple ``(additions, similarity)`` where ``additions`` is the list of
        ``difflib.Differ`` entries prefixed with ``'+ '`` (units present only
        in ``text2``) and ``similarity`` is the character-level
        ``SequenceMatcher`` ratio scaled to 0-100. Removed units (``'- '``)
        are not returned.
    """
    lines1 = text1.splitlines()
    lines2 = text2.splitlines()
    if len(lines1) <= 1 and len(lines2) <= 1:
        # No line structure to diff against: compare word by word.
        units1, units2 = text1.split(), text2.split()
    else:
        units1, units2 = lines1, lines2

    diff = difflib.Differ().compare(units1, units2)
    # Keep only '+ ' entries; '- ', '  ' and '? ' hint lines are dropped.
    additions = [entry for entry in diff if entry.startswith('+ ')]

    # NOTE(review): SequenceMatcher's default autojunk heuristic kicks in for
    # sequences of 200+ items and may skew this ratio on long texts — confirm
    # whether autojunk=False (slower) is acceptable for these documents.
    matcher = difflib.SequenceMatcher(None, text1, text2)
    similarity = matcher.ratio() * 100

    return additions, similarity

def save_differences(additions, similarity, output_path):
    """Write the similarity score and the added entries to a text file.

    Args:
        additions: Iterable of diff entries; each is written on its own line.
        similarity: Similarity percentage, written with two decimals.
        output_path: Destination file; it is created or overwritten.
    """
    # Explicit UTF-8 so accented/special characters are written the same way
    # on every platform instead of depending on the locale's default codec.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(f"Similarity: {similarity:.2f}%\n")
        file.write("Additions:\n")
        for line in additions:
            file.write(f"{line}\n")

In [2]:
# Paths to the PDF files
official_pdf = "transcripcion/dinaboluarte/2023_transcripcion_literal.pdf"
transcribed_pdf = "transcripcion/dinaboluarte/2023_transcripcion_ia.pdf"
output_txt = "transcripcion/dinaboluarte/diff_output.txt"

# Read the PDFs
official_text = read_pdf(official_pdf)
transcribed_text = read_pdf(transcribed_pdf)

# Normalize the texts
normalized_official_text = normalize_text(official_text)
normalized_transcribed_text = normalize_text(transcribed_text)

# Compare the normalized texts. A previous extra comparison of the raw texts
# was dropped: its diff was never used and its similarity was overwritten
# right away by this call, so it only cost run time.
additions, similarity = compare_texts(normalized_official_text, normalized_transcribed_text)

# Save the differences and similarity to a txt file
save_differences(additions, similarity, output_txt)

print(f"Comparison complete. Results saved to {output_txt}")

Comparison complete. Results saved to transcripcion/dinaboluarte/diff_output.txt


### Comparar con SpaCy

In [30]:
import PyPDF2
import spacy
from difflib import SequenceMatcher
import re

# Load the Spanish spaCy pipeline once; compare_texts_spacy reads it through
# the module-level name `nlp`.
# NOTE(review): the saved output of the comparison cell shows a warning raised
# at the `doc1.similarity(doc2)` line. spaCy's documentation says the small
# (`sm`) packages ship without word vectors, which would make that similarity
# score unreliable — confirm, and consider a package with vectors.
nlp = spacy.load('es_core_news_sm')

def read_pdf(file_path):
    """Return the text extracted from every page of the PDF at ``file_path``.

    The file is opened in binary mode and each page's ``extract_text()``
    result is appended in page order, with nothing inserted between pages.
    """
    chunks = []
    with open(file_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        for page in reader.pages:
            chunks.append(page.extract_text())
    return ''.join(chunks)

def normalize_text(text):
    """Normalize text for comparison.

    Applies, in order, two substitutions that each replace a matching run
    with a single space (non-word/underscore runs, then whitespace runs) and
    returns the lower-cased result.
    """
    for pattern in (r'[\W_]+', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text.lower()

def compare_texts_spacy(text1, text2):
    """Score two texts with spaCy and list their character-level differences.

    Args:
        text1: Reference text.
        text2: Text compared against the reference.

    Returns:
        A tuple ``(additions, subtractions, similarity)``:
        ``additions`` are the slices of ``text2`` covered by ``insert`` or
        ``replace`` opcodes, ``subtractions`` are the slices of ``text1``
        covered by ``delete`` or ``replace`` opcodes, and ``similarity`` is
        ``Doc.similarity`` of the two parsed texts multiplied by 100.
    """
    # Both texts are run through the module-level spaCy pipeline `nlp`.
    similarity = nlp(text1).similarity(nlp(text2)) * 100

    additions, subtractions = [], []
    opcodes = SequenceMatcher(None, text1, text2).get_opcodes()
    for tag, a_start, a_end, b_start, b_end in opcodes:
        # A 'replace' contributes to both lists; 'equal' to neither.
        if tag in ('delete', 'replace'):
            subtractions.append(text1[a_start:a_end])
        if tag in ('insert', 'replace'):
            additions.append(text2[b_start:b_end])

    return additions, subtractions, similarity

def save_differences(additions, subtractions, similarity, output_path):
    """Write the similarity score, additions and subtractions to a UTF-8 file.

    Args:
        additions: Iterable of added fragments, one written per line.
        subtractions: Iterable of removed fragments, one written per line.
        similarity: Similarity percentage, written with two decimals.
        output_path: Destination file; it is created or overwritten.
    """
    sections = (
        ("Additions:\n", additions),
        ("\nSubtractions:\n", subtractions),
    )
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(f"Similarity: {similarity:.2f}%\n\n")
        for heading, entries in sections:
            file.write(heading)
            file.writelines(f"{entry}\n" for entry in entries)

In [31]:
# Paths to the PDF files
# NOTE(review): this section reads "2023_transcripcion_ia_2.pdf", while the
# difflib section reads "2023_transcripcion_ia.pdf" — confirm the two
# sections are meant to compare different AI transcripts.
official_pdf_path = "transcripcion/dinaboluarte/2023_transcripcion_literal.pdf"
transcribed_pdf_path = "transcripcion/dinaboluarte/2023_transcripcion_ia_2.pdf"
# NOTE(review): same output path as the difflib section, so running both
# sections leaves only this section's report on disk — confirm that is wanted.
output_txt_path = "transcripcion/dinaboluarte/diff_output.txt"

# Read the PDFs
official_text = read_pdf(official_pdf_path)
transcribed_text = read_pdf(transcribed_pdf_path)

# Normalize the texts
normalized_official_text = normalize_text(official_text)
normalized_transcribed_text = normalize_text(transcribed_text)

# Compare the texts using Spacy
additions, subtractions, similarity = compare_texts_spacy(normalized_official_text, normalized_transcribed_text)

# Save the differences and similarity to a txt file
save_differences(additions, subtractions, similarity, output_txt_path)

print(f"Comparison complete. Results saved to {output_txt_path}")


  similarity = doc1.similarity(doc2) * 100


Comparison complete. Results saved to transcripcion/dinaboluarte/diff_output.txt


### Comparar con NLTK

In [8]:
import PyPDF2
import nltk
# Fetch the 'punkt' tokenizer data; nltk.word_tokenize is called in
# preprocess_text below.
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK stopword lists before importing the corpus reader
nltk.download('stopwords')
from nltk.corpus import stopwords

# Extract the text of a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened in binary mode and the ``extract_text()`` result of
    each page is concatenated in page order without any separator.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        pdf_pages = PyPDF2.PdfReader(pdf_stream).pages
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_pages]
    return ''.join(page_texts)

# Preprocess the text
def preprocess_text(text):
    """Tokenize Spanish text and drop punctuation and stopwords.

    Args:
        text: Raw text to preprocess.

    Returns:
        A single string with the kept tokens lower-cased and joined by one
        space. A token is kept when it is alphanumeric (``str.isalnum``) and
        its lower-cased form is not in NLTK's Spanish stopword list.
    """
    stop_words = set(stopwords.words('spanish'))
    # Tokenize with the Spanish Punkt model; word_tokenize defaults to
    # English, which does not match the Spanish stopword list used here.
    tokens = nltk.word_tokenize(text, language='spanish')
    tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]
    return ' '.join(tokens)

# Compute the similarity between two documents
def calculate_similarity(doc1, doc2):
    """Return the cosine similarity of the TF-IDF vectors of two documents.

    A fresh ``TfidfVectorizer`` is fitted on exactly these two documents, and
    the off-diagonal entry ``[0, 1]`` of their pairwise cosine-similarity
    matrix is returned.
    """
    tfidf_vectors = TfidfVectorizer().fit_transform([doc1, doc2])
    pairwise_scores = cosine_similarity(tfidf_vectors)
    return pairwise_scores[0, 1]

def find_differences(text1, text2, output_file):
    """Write the word-level differences between two texts to a file.

    Both texts are split on whitespace and compared with ``difflib.ndiff``.
    Only entries prefixed with ``'- '`` (word only in ``text1``) or ``'+ '``
    (word only in ``text2``) are written, one per line.

    Args:
        text1: Reference text.
        text2: Text compared against the reference.
        output_file: Destination file; it is created or overwritten.
    """
    # Imported here because this cell's import block does not bring in
    # difflib; without it the function only works if another cell did.
    import difflib

    diff = difflib.ndiff(text1.split(), text2.split())
    differences = [line for line in diff if line.startswith(('- ', '+ '))]
    # Explicit UTF-8 so accented words are written identically on every
    # platform instead of depending on the locale's default codec.
    with open(output_file, 'w', encoding='utf-8') as file:
        for line in differences:
            file.write(line + '\n')
            
# Paths to the PDF files and to the diff report
pdf_path1 = 'transcripcion/ppk/transcripcion_literal.pdf'
pdf_path2 = 'transcripcion/ppk/transcripcion_ia.pdf'
output_file = "transcripcion/ppk/diff_output.txt"

# Extract and preprocess the text of both PDFs
text1 = extract_text_from_pdf(pdf_path1)
text2 = extract_text_from_pdf(pdf_path2)
preprocessed_text1 = preprocess_text(text1)
preprocessed_text2 = preprocess_text(text2)

# Compute the similarity (calculate_similarity returns a 0-1 score, shown as %)
similarity = calculate_similarity(preprocessed_text1, preprocessed_text2)
print(f'La similitud entre los dos documentos es: {similarity*100:.2f}%')

# Find the differences and save them to a file
find_differences(preprocessed_text1, preprocessed_text2, output_file)
print(f'Las diferencias se han guardado en {output_file}')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Brillitt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Brillitt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


La similitud entre los dos documentos es: 92.29%
Las diferencias se han guardado en transcripcion/ppk/diff_output.txt
