# NLP Ansatz, um Posts zu Differenzieren

Folgend werden Posts geclustert und unterschiedlichen Kategorien zugewiesen.

### Requirements

In [4]:
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as pdt
import re

### Get Data from Subreddit DB

In [None]:
df = pd.read_pickle('data/dfr.pkl')
df = df[["id, title"]].copy()

# NLP STARTS HERE

In [8]:
df.head()

Unnamed: 0,id,title
0,101,[PRO/CHEF] Texas Barbecue Platter
1,102,[homemade] Whole lemon cake doughnuts with whi...
2,103,[Homemade] Calamari
3,104,[homemade] raspberry cream cake covered in Legos
4,105,[homemade] “yogurt eggs and apple fries”


In [9]:
# Auf Windows ausführen (einmalig)
# !pip install tmtoolkit
# !pip install nltk
# !pip install pyLDAvis
# !pip install
# !pip install tqdm
# !pip install spacy==2.3.5
#!python3 -m spacy download en_core_web_sm # sonst googlen (kommt auf das env setup an)

# one time run to make this notebook work
# import nltk
# nltk.download('averaged_perceptron_tagger')

In [10]:
# Auf Linux ausführen (einmalig)
"""
!pip3 install tmtoolkit
!pip3 install nltk
!pip3 install pyLDAvis
!pip3 install
!pip3 install tqdm
!pip3 install -U pip setuptools wheel
!pip3 install -U spacy
!python -m spacy download en_core_web_sm # sonst googlen (kommt auf das env setup an)
"""
# one time run to make this notebook work
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jakob/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:
import os
import sys

# maths
import numpy as np

# regular expressions to manipulate text
import re
import string

# NLP
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.corpus import Corpus
# from tmtoolkit.defaults import language
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results
from tmtoolkit.topicmod.model_io import save_ldamodel_summary_to_excel
from tmtoolkit.topicmod.visualize import parameters_for_ldavis
from tmtoolkit.topicmod.model_io import save_ldamodel_to_pickle
from tmtoolkit.topicmod.model_io import ldamodel_top_topic_words
from tmtoolkit.topicmod.model_stats import marginal_topic_distrib
from tmtoolkit.bow.bow_stats import doc_lengths
from tmtoolkit.topicmod.model_stats import generate_topic_labels_from_top_words

import spacy

#import en_core_web_sm

import pyLDAvis

import logging
import warnings

# progress bars
from tqdm import tqdm

### Cleaning

In [12]:
## simple preprocessing

## Notiz an mich selbst 03.04.2020
## Nachschauen, wie regular expressions jetzt funktionieren
## 3.5. falscher Iterator

print('amount of data rows before: {}'.format(len(df)))

def cleaning(text):
    # remove url's
    # from https://docs.microsoft.com/en-us/previous-versions/msp-n-p/ff650303(v=pandp.10)?redirectedfrom=MSDN
    text = re.sub("^(ht|f)tp(s?)\:\/\/[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$",                   '', text)
    # mail adresses
    text = re.sub('\S+@\S+', '', text)
    # web pages
    text = re.sub('\S+.com', '', text)
    # digits
    text = re.sub('\w*\d\w*', '', text)
    # non-sensical text
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', ' ', text)
    text = re.sub('\\xa0', ' ', text)
    # german letters
    text = re.sub('ö','oe',text)
    text = re.sub('Ö','Oe',text)
    text = re.sub('ü','ue',text)
    text = re.sub('Ü','Ue',text)
    text = re.sub('ä','ae',text)
    text = re.sub('Ä','Ae',text)    
    text = re.sub('ß','ss',text)
    # removing extra white spaces
    text = " ".join(text.split())
    return text
df["title"] = df["title"].apply(cleaning)
    
df = df.dropna()
print('amount of data rows after: {}'.format(len(df)))

  text = re.sub("^(ht|f)tp(s?)\:\/\/[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$",                   '', text)
  text = re.sub('\S+@\S+', '', text)
  text = re.sub('\S+.com', '', text)
  text = re.sub('\w*\d\w*', '', text)


amount of data rows before: 360300
amount of data rows after: 360300


In [13]:
df.head()

Unnamed: 0,id,title
0,101,[PRO/CHEF] Texas Barbecue Platter
1,102,[homemade] Whole lemon cake doughnuts with whi...
2,103,[Homemade] Calamari
3,104,[homemade] raspberry cream cake covered in Legos
4,105,[homemade] “yogurt eggs and apple fries”


### Training

In [14]:
## globals

parent_path = os.path.dirname(os.getcwd())
language = 'english'
maximum_rows_used = 10000

# suppress the "INFO" messages and warnings from lda
logger = logging.getLogger('lda')
logger.addHandler(logging.NullHandler())
logger.propagate = False

warnings.filterwarnings('ignore')

stop = ['homemade']

In [15]:
parent_path

'/home/jakob/Dokumente/2_uni/sem6/data_warehouse'

In [16]:
## formatting for tmtoolkit
d = df.set_index('id')['title'].head(maximum_rows_used).to_dict()

In [17]:
d

{101: '[PRO/CHEF] Texas Barbecue Platter',
 102: '[homemade] Whole lemon cake doughnuts with white chocolate ganache, lemon curd drizzle and white chocolate shavings',
 103: '[Homemade] Calamari',
 104: '[homemade] raspberry cream cake covered in Legos',
 105: '[homemade] “yogurt eggs and apple fries”',
 106: '[Homemade] Strawberry cheesecake',
 107: '[Homemade] Eggs and Everything',
 108: '[I ate] Korean corndogs!',
 109: '[HOMEMADE] Fudge brownies with dark chocolate chunks, M&Ms, and flaked smoked sea salt',
 110: '[I ate] Chicken wings, falafel, mozzarella sticks and bettered cod',
 111: '[Homemade] Bacon Jalapeno Cheddar Cheese melt',
 112: '[Homemade] Post Oak Smoked Beef Brisket',
 113: '[Homemade] Powdered lemon bars',
 114: '[Homemade] Flemish beef stew with fries',
 115: '[homemade] Buffalo Wings',
 116: '[Homemade] tear and share cakes by the misses.',
 117: '[Homemade] Chicken Tenders',
 118: '[Homemade] Chocolate cake with nutella buttercream and nutella ganache',
 119: '[

In [18]:
#tmtoolkit preprocessing
## multiple preprocessing steps to be computed and compared against each other

corpus = Corpus(docs=d)
print(corpus)
#corpus = corpus.sample(50)

Corpus with 10000 documents


In [19]:
from tmtoolkit.preprocess import init_for_language, tokenize

doc_labels = corpus.doc_labels   # save the document labels as list for later use

init_for_language('en')   # we use an English corpus
docs = tokenize(list(corpus.values()))

In [20]:
import logging

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
# set the minimum log level to display, for instance also logging.DEBUG
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

In [21]:
pp0 = TMPreproc(corpus, language='en')
# del corpus

old_vocab = pp0.vocabulary
# print('vocabulary size before processing: {}'.format(pp0.vocabulary_size))

INFO:tmtoolkit:init with 10000 documents
INFO:tmtoolkit:init with max. 4 workers
INFO:tmtoolkit:distributing work via greedy partitioning
INFO:tmtoolkit:setting up 4 worker processes
INFO:tmtoolkit:tokenizing 2500 documents
INFO:tmtoolkit:tokenizing 2500 documents
INFO:tmtoolkit:tokenizing 2500 documentsINFO:tmtoolkit:tokenizing 2500 documents

Process _PreprocWorker#1:
Process _PreprocWorker#3:
Process _PreprocWorker#0:
Traceback (most recent call last):
Process _PreprocWorker#2:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/jakob/.local/lib/python3.6/site-packages/tmtoolkit/preprocess/_preprocworker.py", line 54, in run
    q_item =

In [22]:
pp0.vocabulary_counts.most_common()[:10]

[('[', 10000),
 (']', 9808),
 ('Homemade', 6776),
 ('and', 3203),
 (',', 2879),
 ('with', 1890),
 ('homemade', 1699),
 ('.', 1456),
 ('chicken', 802),
 ('I', 797)]

In [23]:
## general simple preprocessing

pp0.pos_tag().lemmatize().tokens_to_lowercase().remove_special_chars_in_tokens()

general_vocab = pp0.vocabulary

print('vocabulary size after processing: {}'.format(pp0.vocabulary_size))
print('vocabulary, that was cut during this step:')
print(set(old_vocab) - set(general_vocab))
# del old_vocab

INFO:tmtoolkit:POS tagging tokens
INFO:tmtoolkit:lemmatizing tokens
INFO:tmtoolkit:transforming tokens to lowercase
INFO:tmtoolkit:removing characters in tokens


vocabulary size after processing: 796
vocabulary, that was cut during this step:
{'Calamari', 'Strawberry', 'Oatmeal', 'Pastor', 'is', 'Popper', 'am', 'Tartar', 'Ghost', 'Walnut', 'LT', 'Irish', 'Dip', 'Legos', 'Turkey', 'days', 'Wing', 'Tuna', 'Sugar', 'Almond', 'Hamburger', 'Meatball', 'Avocado', 'Sandwich', 'Tartlets', 'Filipino', 'Shortbread', 'Nut', 'slices', 'shallots', 'Colby', 'Maryland', 'Berry', 'Balsamic', 'made', 'potstickers', 'styled', 'Dinner', 'Beignets', 'Seasoned', 'Layered', 'Spicy', 'Huli', '!', 'Lentil', 'Cheesecake', 'Hanetsuki', 'Twix', 'Barbecue', 'shared', 'sharing', 'crunchies', 'Pilaf', 'Free', 'Ring', 'Blueberry', 'minutes', 'Hot', 'Chicken', 'Dill', 'Homemade]Cauliflower', 'Tilapia', 'dumplings', 'Fresh', 'Prime', 'Wood', 'Coppa', 'Bowls', 'Surf', 'Homemade', 'Popeye', 'Fiesta', 'coated', 'Brie', 'Mongolian', 'Roast', 'Grilled', 'Antiguan', 'Cooked', 'Seared', 'Slice', 'Smash', 'Nathan', 'Chirashi', 'Rangoon', 'Homemade]Buttermilk', 'Minecraft', '+', 'wraps

In [24]:
## specific preprocessing

pp_aggressive = pp_nouns = pp_noadditionalstopwords = pp_staticstopwords = pp_dynamicstopwords = pp0.copy()
doc_labels = np.array(pp0.doc_labels)


pp_dynamicstopwords.clean_tokens(remove_numbers=True, remove_shorter_than=2) \
    .remove_common_tokens(df_threshold=0.9) \
    .remove_uncommon_tokens(df_threshold=0.1)

pp_nouns.filter_for_pos('N') \
    .add_stopwords(stop) \
    .clean_tokens(remove_numbers=True, remove_shorter_than=3) \
    .remove_common_tokens(df_threshold=0.85) \
    .remove_uncommon_tokens(df_threshold=0.05)

pp_aggressive.filter_for_pos('N') \
    .add_stopwords(stop) \
    .clean_tokens(remove_numbers=True, remove_shorter_than=3) \
    .remove_common_tokens(df_threshold=0.9) \
    .remove_uncommon_tokens(df_threshold=0.1)

INFO:tmtoolkit:init with max. 4 workers
INFO:tmtoolkit:loading state from object
INFO:tmtoolkit:setting up 4 worker processes with initial states
INFO:tmtoolkit:cleaning tokens
INFO:tmtoolkit:filtering tokens by mask
INFO:tmtoolkit:filtering tokens for POS tag `N`
INFO:tmtoolkit:cleaning tokens
INFO:tmtoolkit:filtering tokens for POS tag `N`
INFO:tmtoolkit:cleaning tokens


<TMPreproc [10000 documents / en]>

INFO:tmtoolkit:creating sparse DTM for 2500 documents
INFO:tmtoolkit:creating sparse DTM for 2500 documentsINFO:tmtoolkit:creating sparse DTM for 2500 documentsINFO:tmtoolkit:creating sparse DTM for 2500 documents


Process _PreprocWorker#1:
Process _PreprocWorker#3:
Traceback (most recent call last):
Process _PreprocWorker#0:
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Process _PreprocWorker#2:
  File "/home/jakob/.local/lib/python3.6/site-packages/tmtoolkit/preprocess/_preprocworker.py", line 54, in run
    q_item = self.tasks_queue.get()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 94, in get
    res = self._recv_bytes()
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._rec

In [25]:
#vocab
vocab_dynamicstopwords = np.array(pp_dynamicstopwords.vocabulary)
vocab_nouns = np.array(pp_nouns.vocabulary)
vocab_aggressive = np.array(pp_aggressive.vocabulary)

#dtm
dtm_dynamicstopwords = pp_dynamicstopwords.dtm
dtm_nouns = pp_nouns.dtm
dtm_aggressive = pp_aggressive.dtm
del pp_dynamicstopwords, pp_nouns, pp_aggressive

INFO:tmtoolkit:generating DTM


In [26]:
# set data and hyperparameters to be computed

dtms = {
    'model_dynamicstopwords': dtm_dynamicstopwords,
    'model_nouns': dtm_nouns,
    'model_aggressive': dtm_aggressive
}

# var_params = [{'n_topics': k, 'alpha': 1/k, 'eta': 0.028*k,'n_iter': 200*k} for k in range(3, 20)]
# var_paramsas = [{'n_topics': k, 'alpha': 1/k, 'eta': 0.028*k,'n_iter': 200*k} for k in range(20, 3, -1)]
# var_params.extend(var_paramsas)

var_params = [{'n_topics': k, 'alpha': 1/k, 'eta': 0.028*k} for k in range(3, 5)]

const_params = {
    'n_iter': 1000,
    'random_state': 69  # to make results reproducible
}

var_params

[{'n_topics': 3, 'alpha': 0.3333333333333333, 'eta': 0.084},
 {'n_topics': 4, 'alpha': 0.25, 'eta': 0.112}]

In [27]:
df.head()

Unnamed: 0,id,title
0,101,[PRO/CHEF] Texas Barbecue Platter
1,102,[homemade] Whole lemon cake doughnuts with whi...
2,103,[Homemade] Calamari
3,104,[homemade] raspberry cream cake covered in Legos
4,105,[homemade] “yogurt eggs and apple fries”


In [31]:
dtms.items()

dict_items([('model_dynamicstopwords', <10000x1 sparse matrix of type '<class 'numpy.int32'>'
	with 1361 stored elements in Compressed Sparse Row format>), ('model_nouns', <10000x1 sparse matrix of type '<class 'numpy.int32'>'
	with 1361 stored elements in Compressed Sparse Row format>), ('model_aggressive', <10000x1 sparse matrix of type '<class 'numpy.int32'>'
	with 1361 stored elements in Compressed Sparse Row format>)])

In [34]:
for dtm_name, dtm in dtms.items():
    print(dtm)

  (9, 0)	1
  (16, 0)	1
  (20, 0)	1
  (29, 0)	1
  (30, 0)	1
  (33, 0)	1
  (41, 0)	1
  (82, 0)	2
  (84, 0)	1
  (86, 0)	1
  (91, 0)	1
  (109, 0)	1
  (116, 0)	1
  (120, 0)	1
  (129, 0)	1
  (130, 0)	1
  (135, 0)	1
  (141, 0)	1
  (182, 0)	2
  (184, 0)	1
  (186, 0)	1
  (190, 0)	1
  (209, 0)	1
  (216, 0)	1
  (220, 0)	1
  :	:
  (9847, 0)	1
  (9864, 0)	1
  (9870, 0)	1
  (9873, 0)	1
  (9876, 0)	1
  (9879, 0)	1
  (9880, 0)	1
  (9891, 0)	1
  (9893, 0)	1
  (9897, 0)	1
  (9905, 0)	1
  (9927, 0)	1
  (9931, 0)	1
  (9942, 0)	1
  (9945, 0)	1
  (9949, 0)	1
  (9964, 0)	1
  (9970, 0)	1
  (9973, 0)	1
  (9978, 0)	1
  (9983, 0)	1
  (9984, 0)	1
  (9991, 0)	1
  (9993, 0)	1
  (9994, 0)	1
  (9, 0)	1
  (16, 0)	1
  (20, 0)	1
  (29, 0)	1
  (30, 0)	1
  (33, 0)	1
  (41, 0)	1
  (82, 0)	2
  (84, 0)	1
  (86, 0)	1
  (91, 0)	1
  (109, 0)	1
  (116, 0)	1
  (120, 0)	1
  (129, 0)	1
  (130, 0)	1
  (135, 0)	1
  (141, 0)	1
  (182, 0)	2
  (184, 0)	1
  (186, 0)	1
  (190, 0)	1
  (209, 0)	1
  (216, 0)	1
  (220, 0)	1
  :	:
  (9847, 0)	

In [None]:
## compute all topic models and plot evaluation results

# for dtm_name, dtm in dtms.items():
dtm = dtms["model_aggressive"]   
print('current dtm: {}'.format(dtm_name))

eval_results = evaluate_topic_models(dtm,
                                    varying_parameters=var_params,
                                    constant_parameters=const_params,
                                    return_models=False)

#     print('plotting...')
    
#     eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')
#     plot_eval_results(eval_results_by_topics, xaxislabel=dtm_name + ' | n_topics');
    

#     eval_results_by_topics = results_by_parameter(eval_results, 'n_iter')
#     plot_eval_results(eval_results_by_topics, xaxislabel=dtm_name + ' | n_iter');
    

#     eval_results_by_topics = results_by_parameter(eval_results, 'alpha')
#     plot_eval_results(eval_results_by_topics, xaxislabel=dtm_name + ' | alpha');
    

#     eval_results_by_topics = results_by_parameter(eval_results, 'eta')
#     plot_eval_results(eval_results_by_topics, xaxislabel=dtm_name + ' | eta');

#     print('{} done - waiting for plots'.format(dtm_name))

INFO:tmtoolkit:initializing evaluation with sparse matrix of format `coo` and shape 10000x1
INFO:tmtoolkit:init with 2 workers


current dtm: model_aggressive


INFO:tmtoolkit:multiproc models: starting with 2 parameter sets on 1 documents (= 2 tasks) and 2 processes
INFO:tmtoolkit:fitting LDA model from package `lda` with parameters: {'n_topics': 3, 'alpha': 0.3333333333333333, 'eta': 0.084, 'n_iter': 1000, 'random_state': 69}
INFO:tmtoolkit:fitting LDA model from package `lda` with parameters: {'n_topics': 4, 'alpha': 0.25, 'eta': 0.112, 'n_iter': 1000, 'random_state': 69}
INFO:tmtoolkit:> evaluation result with metric "cao_juan_2009": 1.000000
INFO:tmtoolkit:> evaluation result with metric "arun_2010": 61.677187
Process <class 'tmtoolkit.topicmod.tm_lda.MultiprocEvaluationWorkerLDA'>#0:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/jakob/.local/lib/python3.6/site-packages/tmtoolkit/topicmod/parallel.py", line 284, in run
    results = self.fit_model(data, params)
  File "/home/jakob/.local/lib/python3.6/site-packages/tmtoolkit/topicmod/tm_lda.p

### Model Selection

In [None]:
# ## best parameter sets for our different dtms 
# '''

# --- notes ---
# alpha: per document topic distribution (higher)
# eta: per topic word distribution (lower)

# '''
# # select dtm for following steps 
# dtm = dtms['model_staticstopwords']
# vocab = vocab_staticstopwords
# print('selected model removed the following vocabulary:')
# print(set(general_vocab) - set(vocab))
# #del general_vocab, dtms

In [None]:
# # recompute chosen model
# from tmtoolkit.topicmod.tm_lda import compute_models_parallel

# topicn = 10

# lda_params = {
#     'n_topics': topicn,
#     'n_iter': 3750,
#     'alpha': 0.1,
#     'eta': 0.5,
#     'random_state': 20200701
# }

# model = compute_models_parallel(dtm, constant_parameters=lda_params)
# model

In [None]:
# ## select model from nested list

# model = model[0][1]
# model

In [None]:
# ## naming topics
# ## most 'important' topics for the whole corpus determined by the marginal topic distribution
# ## many more options to have a look into topics

# doc_lengths = doc_lengths(dtm)
# topic_labels = generate_topic_labels_from_top_words(
#     model.topic_word_,
#     model.doc_topic_,
#     doc_lengths,
#     vocab,
#     lambda_=0.6
# )

# top_topic_word = ldamodel_top_topic_words(model.topic_word_,
#                                           vocab,
#                                           row_labels=topic_labels)

# marg_topic = marginal_topic_distrib(model.doc_topic_, doc_lengths)

# # np.argsort() gives ascending order, hence reverse via [::-1]
# print('most important topics: \n{}'.format(topic_labels[np.argsort(marg_topic)[::-1][:topicn]]))