## Topic Modeling

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Open the article dataset stored as Parquet files in the GCS bucket.
ddf = dd.read_parquet(
    "gcs://extreme-lore-398917-bzg/latest2/",
    storage_options={'token': token},
)

In [None]:
# Strip every run of digits from each article body.
_digit_run = re.compile(r'\d+')
docs = ddf.body.map(
    lambda text: _digit_run.sub('', text),
    meta=pd.Series(dtype="string"),
)

In [None]:
# The mapped series was built with an unnamed `meta`, so name it "body" again.
docs.name="body"

### TF-IDF

In [None]:
from src.topic_modeling.helpers import tokenize, keyword_filter
import numpy as np

In [None]:
# Three keyword groups used for the rough keyword filtering below.
keywords_list1 = [
    "launch", "business", "strategy", "management", "product",
    "service", "app", "customer", "merge",
]
keywords_list2 = [
    "upgrade", "downgrade", "raise", "cut", "buy", "sell", "hold",
    "outperform", "underperform", "analyst", "estimate",
]
keywords_list3 = [
    "ebit", "eps", "earnings", "report", "financial", "quarter",
    "annual", "year", "ended", "net", "income",
]
# Concatenate the groups in order; the last expression shows the total count.
total_keywords = [*keywords_list1, *keywords_list2, *keywords_list3]
len(total_keywords)

In [None]:
# Tokenize each document, then apply the rough keyword filter to its tokens.
tfidf_docs = docs.map(
    lambda text: keyword_filter(tokenize(text), total_keywords),
    meta=pd.Series(dtype="object"),
)
# Turn the index into a regular column and spread the rows over 5 partitions.
tfidf_docs = tfidf_docs.reset_index().repartition(npartitions=5)
tfidf_docs.columns = ["index", "body"]
# Persist the filtered tokens so the following cells can start from here.
tfidf_docs.to_parquet(
    "gcs://extreme-lore-398917-bzg/tfidf-tokens",
    storage_options={'token': token},
)

In [None]:
# Reload the persisted tokens, requesting pyarrow-backed dtypes.
tfidf_docs = dd.read_parquet(
    "gcs://extreme-lore-398917-bzg/tfidf-tokens/",
    dtype_backend="pyarrow",
    storage_options={'token': token},
)

In [None]:
# Materialize the Dask DataFrame; the cells below index it with .loc/.iloc.
tfidf_docs = tfidf_docs.compute()

In [None]:
# The stored strings look like NumPy array output, i.e. the elements are
# separated by spaces only. Insert a comma after every space so that each
# string becomes a list literal that can be converted back to a list.
body_with_commas = tfidf_docs["body"].map(lambda text: text.replace(" ", ", "))
tfidf_docs.loc[:, "body"] = body_with_commas

In [None]:
def f(tokens):
    """Return *tokens* as a NumPy array, never an empty one.

    When *tokens* has no elements, a one-element array holding a fixed
    placeholder word is returned instead; as the placeholder's text says,
    this prevents empty documents when the TF-IDF weights are calculated.
    """
    if len(tokens) != 0:
        return np.array(tokens)
    return np.array(["wordtopreventemptydocumentswhencalculatingtfidf"])
        
import ast

# Convert each list-literal string back into a token array via f().
# The strings come from external storage, so they are parsed with
# ast.literal_eval, which only accepts Python literals, instead of eval,
# which would execute arbitrary expressions embedded in the data.
tfidf_docs.loc[:, "body"] = tfidf_docs.body.map(lambda x: f(ast.literal_eval(x)))

In [None]:
# Inspect the parsed tokens of the first document.
tfidf_docs["body"].iloc[0]

In [None]:
# Build one lazy TF-IDF computation per topic; the delayed objects collected
# here are only executed when they are passed to dask.compute.
topic_tfidfs_list = []
# NOTE(review): keywords1/keywords2/keywords3 are not defined in the visible
# cells (only keywords_list1/2/3 are), and TfidfVectorizer's `tokenizer`
# argument expects a callable, not a list of words. Confirm where these come from.
for topic_tokenizer in [keywords1, keywords2, keywords3]:
    # Fit and transform the document bodies with this topic's tokenizer;
    # lowercase=False turns off the vectorizer's own lowercasing.
    lazy_result = dask.delayed(TfidfVectorizer(tokenizer=topic_tokenizer, lowercase=False).fit_transform)(tfidf_docs.body)
    # Densify the result and sum along axis 1 (one total per row).
    lazy_sums = dask.delayed(np.apply_along_axis)(np.sum, 1, lazy_result.todense())
    topic_tfidfs_list.append(lazy_sums)

In [None]:
# Execute all delayed per-topic computations in a single dask.compute call.
topic_indicator_list = dask.compute(*topic_tfidfs_list)

In [None]:
# Build a frame from the computed results and flip it with a transpose, so
# that each computed result ends up along the columns instead of the rows.
df = pd.DataFrame(topic_indicator_list).T

In [None]:
# Inspect the tokens of the fifth document (last row of the head).
tfidf_docs["body"].head().iloc[4]

In [None]:
# Preview the first five rows of the per-topic sums.
df.head(5)

### LDA

In [None]:
from gensim.corpora import Dictionary
from gensim.models import Phrases
import nltk

In [None]:
# `word_tokenize` is not imported anywhere else in the visible notebook, so
# import it here to keep this cell runnable on its own.
from nltk.tokenize import word_tokenize

# Lower-case each document and split it into tokens.
lda_docs = docs.apply(lambda x: word_tokenize(x.lower()), meta=pd.Series(dtype="object"))

In [None]:
# `WordNetLemmatizer` is not imported anywhere else in the visible notebook,
# so import it here to keep this cell runnable on its own.
from nltk.stem import WordNetLemmatizer

# Remove words that are only one character.
# Lemmatize the documents.
lemmatizer = WordNetLemmatizer()
lda_docs = lda_docs.apply(
    lambda x: [lemmatizer.lemmatize(token) for token in x if len(token) > 1],
    meta=pd.Series(dtype="object"),
)

In [None]:
# Preview the first five tokenized, lemmatized documents.
lda_docs.head(5)

In [None]:
# Add bigrams to docs (only ones that appear 20 times or more).
# The phrase model is trained on the tokenized documents in `lda_docs`; the
# raw strings in `docs` would be iterated character by character.
bigram = Phrases(lda_docs, min_count=20)


def f(doc):
    """Return the tokens of *doc* followed by the bigrams detected in it.

    Tokens produced by the phrase model that contain '_' are treated as
    bigrams and added after the original tokens. A new list is returned;
    *doc* itself is not modified.
    """
    return list(doc) + [token for token in bigram[doc] if '_' in token]


# Pass `meta` explicitly, as the tokenizing and lemmatizing cells do, so Dask
# does not have to infer the output type by calling f on placeholder data.
lda_docs = lda_docs.apply(f, meta=pd.Series(dtype="object"))

In [None]:
# lda_docs.name = "body"
# lda_docs = lda_docs.to_frame().reset_index()
# lda_docs = lda_docs.reset_index().repartition()

In [None]:
# lda_docs.to_parquet(cwd + "/data/lda_docs", schema={"body": pa.string(),  "index":pa.int32()})
# lda_docs = dd.read_parquet(cwd + "/data/lda_docs")

In [None]:
# Remove rare and common tokens.

# Create a dictionary representation of the documents. Dictionary expects
# every document to be a list of tokens, so it is built from the tokenized
# `lda_docs` rather than from the raw strings in `docs`.
dictionary = Dictionary(lda_docs)

# Filter out words that occur in fewer than 1000 documents, or in more than
# 60% of the documents.
dictionary.filter_extremes(no_below=1000, no_above=0.6)

In [None]:
# Bag-of-words representation of the documents. doc2bow takes a list of
# tokens, so iterate over the tokenized `lda_docs`, not the raw `docs` strings.
corpus = [dictionary.doc2bow(doc) for doc in lda_docs]

In [None]:
# Report the vocabulary size and the number of documents in the corpus.
vocabulary_size = len(dictionary)
print('Number of unique tokens: %d' % vocabulary_size)
document_count = len(corpus)
print('Number of documents: %d' % document_count)

In [None]:
# Fit the LDA topic model on the bag-of-words corpus.
from gensim.models import LdaModel

# Training hyperparameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
# Model perplexity is not evaluated during training because it takes too much time.
eval_every = None

# Index the dictionary once so that it is "loaded" before its id -> token
# mapping is read.
temp = dictionary[0]
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    alpha='auto',
    eta='auto',
    eval_every=eval_every,
)

In [None]:
from pprint import pprint

top_topics = model.top_topics(corpus)

# The average topic coherence is the total of the per-topic coherence values
# (the second item of every entry) divided by the number of topics.
coherence_total = sum(coherence for _, coherence in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)