In [2]:
pip install pdfplumber pytesseract pdf2image pandas openpyxl




In [3]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [4]:
poppler_path = r"C:\Users\Arthur\Documents\Python Packages\poppler-24.08.0\Library\bin"

In [8]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path


# üîπ Carregar dicion√°rios para o New Dale-Chall Score

with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())
    
    
    # üîπ Remover t√≠tulos, cap√≠tulos, se√ß√µes, subse√ß√µes e artigos antes de contar senten√ßas
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',  # T√çTULO I, T√çTULO II, etc.
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',  # CAP√çTULO I, CAP√çTULO II, etc.
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',  # SE√á√ÉO I, SE√á√ÉO II, etc.
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',  # SUBSE√á√ÉO I, SUBSE√á√ÉO II, etc.
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',  # Art. 1¬∫, Art. 2, etc.
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',  # ¬ß 1¬∫, ¬ß 2¬∫, etc.
        r'^\s*Par√°grafo √∫nico\.',  # "Par√°grafo √∫nico."
    ]

    # Aplicar a remo√ß√£o das senten√ßas irrelevantes
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]

    # Atualizar a contagem de senten√ßas ap√≥s a filtragem
    S = max(len(sentences), 1)

    # C√°lculo das palavras longas e complexas
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    # üîπ Contagem de par√°grafos corrigida
    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    # üîπ √çndices de legibilidade
    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    # üîπ New Dale-Chall Score
    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    # üîπ Type-Token Ratio (TTR)
    ttr = round(len(set(words)) / W, 4) if W > 0 else 0

    # üîπ Herdan‚Äôs C (Complexidade Lexical)
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar T√çTULO, CAP√çTULO, SE√á√ÉO e ARTIGOS
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Escolha do Ano
year = 2016  # Altere conforme necess√°rio

# üîπ Diret√≥rios baseados no ano
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

# üîπ Processar os PDFs
data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)

# üîπ Organizar colunas com "File Name" e "CNPJ" na frente
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")

print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")

SyntaxError: 'return' outside function (1009751755.py, line 70)

EXTRA√á√ÉO - 2016

In [9]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path

# üîπ Carregar dicion√°rio de palavras dif√≠ceis para New Dale-Chall Score
with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())

# üîπ Fun√ß√£o para calcular √≠ndices de legibilidade
def calculate_readability_indices(text):
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',
        r'^\s*Par√°grafo √∫nico\.',
    ]

    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]
    S = max(len(sentences), 1)

    words = text.split()
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    ttr = round(len(set(words)) / W, 4) if W > 0 else 0
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar se√ß√µes, cap√≠tulos e artigos
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Processar os PDFs
year = 2016
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")
print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")


‚úÖ Processamento conclu√≠do! Dados salvos em: F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final\estatutos_resultados_2016.xlsx


EXTRA√á√ÉO - 2017

In [10]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path

# üîπ Carregar dicion√°rio de palavras dif√≠ceis para New Dale-Chall Score
with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())

# üîπ Fun√ß√£o para calcular √≠ndices de legibilidade
def calculate_readability_indices(text):
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',
        r'^\s*Par√°grafo √∫nico\.',
    ]

    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]
    S = max(len(sentences), 1)

    words = text.split()
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    ttr = round(len(set(words)) / W, 4) if W > 0 else 0
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar se√ß√µes, cap√≠tulos e artigos
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Processar os PDFs
year = 2017
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")
print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")


‚úÖ Processamento conclu√≠do! Dados salvos em: F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final\estatutos_resultados_2017.xlsx


EXTRA√á√ÉO - 2018

In [12]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path

# üîπ Carregar dicion√°rio de palavras dif√≠ceis para New Dale-Chall Score
with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())

# üîπ Fun√ß√£o para calcular √≠ndices de legibilidade
def calculate_readability_indices(text):
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',
        r'^\s*Par√°grafo √∫nico\.',
    ]

    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]
    S = max(len(sentences), 1)

    words = text.split()
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    ttr = round(len(set(words)) / W, 4) if W > 0 else 0
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar se√ß√µes, cap√≠tulos e artigos
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Processar os PDFs
year = 2018
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")
print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")


‚úÖ Processamento conclu√≠do! Dados salvos em: F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final\estatutos_resultados_2018.xlsx


EXTRA√á√ÉO - 2019

In [13]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path

# üîπ Carregar dicion√°rio de palavras dif√≠ceis para New Dale-Chall Score
with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())

# üîπ Fun√ß√£o para calcular √≠ndices de legibilidade
def calculate_readability_indices(text):
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',
        r'^\s*Par√°grafo √∫nico\.',
    ]

    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]
    S = max(len(sentences), 1)

    words = text.split()
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    ttr = round(len(set(words)) / W, 4) if W > 0 else 0
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar se√ß√µes, cap√≠tulos e artigos
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Processar os PDFs
year = 2019
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")
print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")


‚úÖ Processamento conclu√≠do! Dados salvos em: F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final\estatutos_resultados_2019.xlsx


EXTRA√á√ÉO - 2020

In [14]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path

# üîπ Carregar dicion√°rio de palavras dif√≠ceis para New Dale-Chall Score
with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())

# üîπ Fun√ß√£o para calcular √≠ndices de legibilidade
def calculate_readability_indices(text):
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',
        r'^\s*Par√°grafo √∫nico\.',
    ]

    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]
    S = max(len(sentences), 1)

    words = text.split()
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    ttr = round(len(set(words)) / W, 4) if W > 0 else 0
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar se√ß√µes, cap√≠tulos e artigos
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Processar os PDFs
year = 2020
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")
print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")


‚úÖ Processamento conclu√≠do! Dados salvos em: F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final\estatutos_resultados_2020.xlsx


EXTRA√á√ÉO - 2021

In [15]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path

# üîπ Carregar dicion√°rio de palavras dif√≠ceis para New Dale-Chall Score
with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())

# üîπ Fun√ß√£o para calcular √≠ndices de legibilidade
def calculate_readability_indices(text):
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',
        r'^\s*Par√°grafo √∫nico\.',
    ]

    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]
    S = max(len(sentences), 1)

    words = text.split()
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    ttr = round(len(set(words)) / W, 4) if W > 0 else 0
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar se√ß√µes, cap√≠tulos e artigos
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Processar os PDFs
year = 2021
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")
print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")


‚úÖ Processamento conclu√≠do! Dados salvos em: F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final\estatutos_resultados_2021.xlsx


EXTRA√á√ÉO - 2022

In [16]:
import os
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
from math import sqrt
from pathlib import Path

# üîπ Carregar dicion√°rio de palavras dif√≠ceis para New Dale-Chall Score
with open("F:\\BACKUP GERAL 01.06.2020\\UFSC\\2025\\Editais\\Concurso COLLABCOOP\\DICION√ÅRIO\\dicionario_palavras_dificeis.txt", "r", encoding="utf-8") as f:
    palavras_dificeis = set(f.read().splitlines())

# üîπ Fun√ß√£o para calcular √≠ndices de legibilidade
def calculate_readability_indices(text):
    patterns_to_remove = [
        r'^\s*T√çTULO\s+[IVXLCDM\d]+',
        r'^\s*CAP√çTULO\s+[IVXLCDM\d]+',
        r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*SUBSE√á√ÉO\s+[IVXLCDM\d]+',
        r'^\s*Art\.?\s*\d+[¬∫¬∞]?',
        r'^\s*¬ß\s*\d+[¬∫¬∞]?',
        r'^\s*Par√°grafo √∫nico\.',
    ]

    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    sentences = [
        sentence.strip() for sentence in sentences
        if sentence.strip() and not any(re.match(pattern, sentence.strip(), re.IGNORECASE) for pattern in patterns_to_remove)
    ]
    S = max(len(sentences), 1)

    words = text.split()
    long_words = [w for w in words if len(w) > 6]
    complex_words = [w for w in words if len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{2,}', w, re.I)) > 1]
    syllables = sum(len(re.findall(r'[aeiou√°√©√≠√≥√∫√¢√™√Æ√¥√ª√£√µ√†]{1,2}', w, re.I)) for w in words)

    W = len(words)
    LW = len(long_words)
    CW = len(complex_words)
    C = sum(len(word) for word in words if word.isalpha())

    paragraph_count = len(re.findall(r'\n\s*\n', text))
    if paragraph_count == 0:
        paragraph_count = max(S // 3, 1)

    flesch = round(0.39 * (W / S) + 11.8 * (syllables / W) - 15.59, 2) if W > 0 and S > 0 else 0
    ari = round(4.71 * (C / W) + 0.5 * (W / S) - 21.43, 2) if W > 0 and S > 0 else 0
    lix = round((100 * LW / W) + (W / S), 2) if W > 0 else 0
    rix = round(LW / S, 2) if S > 0 else 0
    fog = round(0.4 * ((W / S) + (40 * CW / W)), 2) if W > 0 and S > 0 else 0
    smog = round(1.043 * sqrt(30 * (CW / S)) + 3.1291, 2) if S > 0 and CW > 0 else 0

    difficult_words = sum(1 for w in words if w.lower() in palavras_dificeis)
    ndc = round(0.1579 * (difficult_words / W * 100) + 0.0496 * (W / S) + 3.6365, 2) if W > 0 and S > 0 else 0

    ttr = round(len(set(words)) / W, 4) if W > 0 else 0
    herdan_c = round(sqrt(len(set(words)) / W), 4) if W > 0 else 0

    return {
        'LIX Index': lix,
        'RIX Index': rix,
        'ARI Index': ari,
        'Flesch-Kincaid': flesch,
        'Fog Index': fog,
        'SMOG Index': smog,
        'New Dale-Chall': ndc,
        'Type-Token Ratio (TTR)': ttr,
        'Herdan‚Äôs C': herdan_c,
        'Paragraph Count': paragraph_count
    }

# üîπ Fun√ß√£o para contar se√ß√µes, cap√≠tulos e artigos
def count_sections(text):
    title_count = len(re.findall(r'^\s*T√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    chapter_count = len(re.findall(r'^\s*CAP√çTULO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    section_count = len(re.findall(r'^\s*SE√á√ÉO\s+[IVXLCDM\d]+\s*$', text, re.MULTILINE | re.IGNORECASE))
    article_count = len(re.findall(r'^\s*Art\.?\s*\d+', text, re.MULTILINE | re.IGNORECASE))

    return {
        'Title Count': title_count,
        'Chapter Count': chapter_count,
        'Section Count': section_count,
        'Article Count': article_count
    }

# üîπ Fun√ß√£o para extrair CNPJ
def extract_cnpj(text):
    match = re.search(r'CNPJ\D*(\d{2}\.?\d{3}\.?\d{3}/?\d{4}-?\d{2})', text)
    return match.group(1).replace(".", "").replace("/", "").replace("-", "") if match else None

# üîπ Fun√ß√£o para calcular estat√≠sticas do texto
def get_text_stats(text):
    words = text.split()
    sentences = re.split(r'(?<=[.!?])\s+|\nArt\.\s*\d+', text)
    character_count = sum(len(word) for word in words)
    unique_words = len(set(words))
    readability_indices = calculate_readability_indices(text)
    section_counts = count_sections(text)

    return {
        'Word Count': len(words),
        'Sentence Count': len(sentences),
        'Character Count': character_count,
        'Unique Words': unique_words,
        **readability_indices,
        **section_counts
    }

# üîπ Processar os PDFs
year = 2022
directory_path = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2024\Artigo 2 - TESE\Estatutos\{year}"
output_directory = fr"F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, f"estatutos_resultados_{year}.xlsx")

data = []
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if filename.endswith(".pdf") and os.path.isfile(file_path):
        text = extract_text_from_pdf(file_path)
        if not text:
            continue
        cnpj = extract_cnpj(text)
        stats = get_text_stats(text)
        stats['File Name'] = filename
        stats['CNPJ'] = cnpj
        stats['Page Count'] = len(pdfplumber.open(file_path).pages)
        stats['File Size (KB)'] = round(Path(file_path).stat().st_size / 1024, 2)
        data.append(stats)

result_df = pd.DataFrame(data)
column_order = ['File Name', 'CNPJ'] + [col for col in result_df.columns if col not in ['File Name', 'CNPJ']]
result_df = result_df[column_order]

# üîπ Salvar no Excel
result_df.to_excel(output_file, index=False, engine="openpyxl")
print(f"‚úÖ Processamento conclu√≠do! Dados salvos em: {output_file}")


‚úÖ Processamento conclu√≠do! Dados salvos em: F:\BACKUP GERAL 01.06.2020\UFSC\2025\Editais\Concurso COLLABCOOP\Output final\estatutos_resultados_2022.xlsx


In [3]:
jupyter nbconvert --to docx "Extract final.ipynb"


SyntaxError: invalid syntax (4079094119.py, line 1)