In [1]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from pypdf import PdfReader

def get_topics_from_pdf(file, num_topics, words_per_topic, random_state=None):
    """
    Discover topics in a PDF using the LDA algorithm.

    Every page of the PDF is treated as one document. Page text is tokenised
    with ``simple_preprocess`` (accents removed), then English and Spanish
    NLTK stop words and tokens of three characters or fewer are discarded
    before the model is trained.

    Args:
        file: Passed unchanged to ``PdfReader``.
        num_topics: Number of topics the LDA model is trained with.
        words_per_topic: Number of top words returned for each topic.
        random_state: Optional seed forwarded to ``LdaModel``. LDA training
            is stochastic, so pass a fixed value to get repeatable topics.
            The default (``None``) keeps the previous unseeded behaviour.

    Returns:
        A nested list: one inner list per topic (``num_topics`` in total),
        each holding that topic's top words ordered from most to least
        relevant.

    Raises:
        ValueError: If no tokens survive preprocessing (for example a PDF
            whose pages contain no extractable text), since a model cannot
            be trained on an empty vocabulary.
    """
    min_token_length = 4  # same filter as the original "len(token) > 3"

    loader = PdfReader(file)

    # One document per page; fall back to "" if a page yields no text.
    documents = [page.extract_text() or "" for page in loader.pages]

    # Load the stop words, downloading the corpus only when it is missing so
    # that repeated calls do not hit the network every time.
    languages = ['english', 'spanish']
    try:
        stop_words = set(stopwords.words(languages))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words(languages))

    def preprocess(text):
        return [
            token
            for token in simple_preprocess(text, deacc=True)
            if token not in stop_words and len(token) >= min_token_length
        ]

    processed_documents = [preprocess(doc) for doc in documents]

    # Create a dictionary and a bag-of-words corpus
    dictionary = corpora.Dictionary(processed_documents)
    if len(dictionary) == 0:
        raise ValueError(
            "No usable text could be extracted from the PDF; cannot build topics."
        )
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )

    # Read the words straight from the model instead of parsing the formatted
    # string from print_topics(): that call returns at most 20 topics unless
    # told otherwise, and string splitting depends on the display format.
    topics_ls = [
        [word for word, _weight in lda_model.show_topic(topic_id, topn=words_per_topic)]
        for topic_id in range(lda_model.num_topics)
    ]

    return topics_ls

In [2]:
from src import get_topics_from_pdf
# Alternative input kept for reference; only the assignment below is active.
# (Previously both were assigned and the first value was silently overwritten.)
# file = "Philip Kotler, Gary Armstrong - Marketing_ Versión latinoamerica (2007).pdf"
file = "PROGRAMA ANALITICO MÓDULO.pdf"
num_topics = 7
words_per_topic = 30

# Nested list: one list of top words per discovered topic.
list_of_topicwords = get_topics_from_pdf(file, num_topics, words_per_topic)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton_glfk00f\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from pypdf import PdfReader

#file = "Philip Kotler, Gary Armstrong - Marketing_ Versión latinoamerica (2007).pdf"
file = "Philip Kotler, Gary Armstrong, Sridhar Balasubramanian - Principles of Marketing-Pearson (2023).pdf"

# Tunable settings, gathered in one place instead of inline magic numbers.
NUM_TOPICS = 7
WORDS_PER_TOPIC = 30
MIN_TOKEN_LENGTH = 4  # same filter as the original "len(token) > 3"
RANDOM_STATE = 42     # LDA training is stochastic; a fixed seed makes re-runs comparable

loader = PdfReader(file)

# One document per PDF page; fall back to "" if a page yields no text.
documents = [page.extract_text() or "" for page in loader.pages]

# Preprocess the documents
nltk.download('stopwords')
stop_words = set(stopwords.words(['english','spanish']))

def preprocess(text):
    """Tokenise text (accents removed) and drop stop words and short tokens."""
    return [
        token
        for token in simple_preprocess(text, deacc=True)
        if token not in stop_words and len(token) >= MIN_TOKEN_LENGTH
    ]

processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary and a corpus
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Build the LDA model
lda_model = LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_STATE,
)

# Print the topics and their corresponding words.
# formatted=False yields (topic_id, [(word, weight), ...]) pairs, so the words
# are read directly rather than parsed out of a display string; num_topics=-1
# asks for every topic.
topics = lda_model.show_topics(num_topics=-1, num_words=WORDS_PER_TOPIC, formatted=False)
topics_ls = []
for _topic_id, weighted_words in topics:
    topic_words = [word for word, _weight in weighted_words]
    topics_ls.append(topic_words)  # previously created but never filled
    print(topic_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton_glfk00f\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['marketing', 'brand', 'advertising', 'sales', 'media', 'campaign', 'product', 'company', 'people', 'world', 'indd', 'costs', 'cost', 'percent', 'digital', 'million', 'market', 'social', 'example', 'brands', 'also', 'even', 'year', 'consumers', 'products', 'content', 'time', 'consumer', 'like', 'customer']
['price', 'pricing', 'prices', 'product', 'market', 'products', 'company', 'value', 'customers', 'companies', 'consumers', 'business', 'buying', 'indd', 'example', 'many', 'customer', 'competitors', 'consumer', 'even', 'high', 'must', 'marketing', 'based', 'buyers', 'brand', 'strategy', 'costs', 'demand', 'also']
['customer', 'sales', 'customers', 'brand', 'product', 'products', 'online', 'marketing', 'consumers', 'company', 'brands', 'store', 'digital', 'service', 'example', 'indd', 'value', 'consumer', 'companies', 'many', 'also', 'retailers', 'media', 'mobile', 'social', 'selling', 'time', 'services', 'stores', 'people']
['https', 'accessed', 'marketing', 'september', 'news', 'dat

In [None]:
from src import get_topics_from_pdf
# Alternative input kept for reference; only the assignment below is active.
# (Previously both were assigned and the first value was silently overwritten.)
# file = "Philip Kotler, Gary Armstrong - Marketing_ Versión latinoamerica (2007).pdf"
file = "PROGRAMA ANALITICO MÓDULO.pdf"
num_topics = 7
words_per_topic = 30

# Nested list: one list of top words per discovered topic.
list_of_topicwords = get_topics_from_pdf(file, num_topics, words_per_topic)

In [None]:
# NOTE(review): the original read `str(my_list)`, but `my_list` is not defined
# in any cell of this notebook, so the cell failed with NameError on a fresh
# kernel. `list_of_topicwords` is the list produced by the cell above and is
# presumably what was meant — confirm.
my_string = str(list_of_topicwords)