In [None]:
!pip install PyPDF2 nltk scikit-learn matplotlib

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import numpy as np
import matplotlib.pyplot as plt

# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept so older releases keep working as before.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

ModuleNotFoundError: No module named 'nltk'

In [None]:
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).

    Returns:
        str: The text of all pages, in page order, separated by newlines.
        An empty string when the document has no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # A page with no extractable text is treated as empty so that a
        # missing result (None) cannot break the concatenation.
        page_texts = [page.extract_text() or '' for page in reader.pages]

    # Join with a newline: gluing pages together directly would fuse the last
    # word of one page with the first word of the next.
    return '\n'.join(page_texts)

def preprocess_text(text, language='portuguese', extra_stop_words=('cars',)):
    """Lower-case and tokenize ``text``, then drop punctuation and stop words.

    Args:
        text: Raw text to process.
        language: NLTK language name used both for tokenization and for the
            stop-word list. Defaults to ``'portuguese'``.
        extra_stop_words: Iterable of additional (lower-case) words to discard
            on top of the NLTK list. Defaults to ``('cars',)``.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    import unicodedata  # local import: keeps this function self-contained

    def _is_punctuation(token):
        # A token counts as punctuation when *every* character is punctuation.
        # (A plain ``token in string.punctuation`` is a substring test and lets
        # multi-character tokens such as '...' or '``' through.) The Unicode
        # category check also covers typographic quotes and dashes.
        return all(
            ch in string.punctuation or unicodedata.category(ch).startswith('P')
            for ch in token
        )

    # Tokenize with the same language as the stop-word list.
    tokens = word_tokenize(text.lower(), language=language)

    stop_words = set(stopwords.words(language))
    stop_words.update(extra_stop_words)

    return [
        token
        for token in tokens
        if not _is_punctuation(token) and token not in stop_words
    ]

def calculate_tfidf(processed_text):
    """Score the terms of one pre-processed document with ``TfidfVectorizer``.

    Args:
        processed_text: The document as a single string of space-separated
            tokens.

    Returns:
        dict: Maps each vocabulary term to its score in the document. Empty
        when the vectorizer's analyzer finds no term in ``processed_text``.
    """
    vectorizer = TfidfVectorizer()

    # With nothing to index (empty text, e.g. an image-only PDF), fitting
    # would fail with an "empty vocabulary" ValueError; report no scores
    # instead of crashing.
    if not vectorizer.build_analyzer()(processed_text):
        return {}

    # NOTE(review): only one document is fitted, so the IDF factor cannot
    # differentiate terms -- scores presumably track term frequency. Confirm
    # whether cross-document weighting was intended.
    tfidf_matrix = vectorizer.fit_transform([processed_text])
    feature_names = vectorizer.get_feature_names_out()

    # A single document was fitted, so row 0 holds all the scores.
    tfidf_scores = dict(zip(feature_names, tfidf_matrix.toarray()[0]))
    return tfidf_scores

def preprocess_pdf(text):
    """Tokenize ``text``, score its terms and print a short report.

    The report contains the number of tokens left after pre-processing and
    the ten highest-scoring terms.

    Args:
        text: Raw text, passed to ``preprocess_text``.

    Returns:
        tuple: ``(tokens, tfidf_scores)`` -- the token list produced by
        ``preprocess_text`` and the term-to-score mapping produced by
        ``calculate_tfidf``.
    """
    tokens = preprocess_text(text)
    print(f"Total de palavras após pré-processamento: {len(tokens)}")

    # calculate_tfidf expects one string, so the tokens are re-joined with
    # single spaces before scoring.
    tfidf_scores = calculate_tfidf(' '.join(tokens))

    # Highest score first; the sort is stable, so equally scored terms keep
    # the order in which the mapping lists them.
    ranking = sorted(
        tfidf_scores.items(),
        key=lambda item: item[1],
        reverse=True,
    )

    print("\nTop 10 palavras por pontuação TF-IDF:")
    top_ten = ranking[:10]
    for word, score in top_ten:
        print(f"{word}: {score}")

    return tokens, tfidf_scores

In [None]:
# Load the PDF file
pdf_path = 'seu_arquivo.pdf'  # Replace with the path to your PDF file
pdf_text = read_pdf(pdf_path)

# Pre-process the text extracted from the PDF; prints a short report and
# returns the token list together with the term-to-score mapping
tokens, tfidf_scores = preprocess_pdf(pdf_text)

In [None]:
# Take the 20 highest-scoring words
top_words = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:20]

# zip(*[]) yields nothing to unpack, so fall back to empty tuples when no
# word was scored instead of failing with a ValueError.
words, scores = zip(*top_words) if top_words else ((), ())

if words:
    plt.figure(figsize=(12, 6))
    plt.bar(words, scores)
    # Slanted, right-aligned labels keep long words from overlapping
    plt.xticks(rotation=45, ha='right')
    plt.title('Top 20 Palavras por Pontuação TF-IDF')
    plt.xlabel('Palavras')
    plt.ylabel('Pontuação TF-IDF')
    plt.tight_layout()
    plt.show()
else:
    print("Nenhuma palavra com pontuação TF-IDF para exibir.")