In [1]:
import re
import pandas as pd
from wordcloud import WordCloud
import nltk
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
def clean(text):
    text = text.lower()
    text = re.sub(
        r"https?:\/\/(www\.)? ?[-a-zA-Z0-9@:%._\+~#=]{1,256}\. ?[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)",
        "",
        text,
    )
    text = re.sub(r"(it | he | she | that)'s", '\1 is', text)
    text = re.sub(r"(they | we | you)'re", '\1 are', text)
    text = re.sub(r"youre", 'you are', text)
    text = re.sub(r"(they | we | you)'ve", '\1 have', text)
    text = re.sub(r"this'", 'this is', text)
    text = re.sub(r"http", ' ', text)
    text = re.sub(r"i[']m", 'i am', text)
    text = re.sub(r"didn[']t", 'did not', text)
    text = re.sub(r"don[']t", 'do not', text)
    text = re.sub(r"don[']t", 'do not', text)
    text = re.sub(r"can't", 'cannot', text)
    text = re.sub(r"&.+;", ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [3]:
df = pd.read_csv('irony-labeled.csv')

In [4]:
df['comment_text'] = df['comment_text'].apply(clean)

In [5]:
df.tail(3)

Unnamed: 0,comment_text,label
1946,[... what? ](,-1
1947,does anybody remember during one of the debate...,-1
1948,the pope is meeting a cruel dictator. likely w...,1


In [6]:
from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['im', 'sourcehttpwww'])
STOPWORDS = set(STOPWORDS)

In [7]:
def preprocess(sent):
    def convert(word):
        word = re.sub(r"\W+", "", word)
        # Verifica se é um número.
        try:
            _ = float(word)
            return '<num>'
        except:
            pass

        # Verifica se é uma palavra.
        if word.isalpha():
            lower = word.lower()
            return '<stop>' if lower in STOPWORDS else lower

        # Caso contrário, é pontuação ou estranho.
        return '<weird>'

    processed = [convert(word) for word in sent]
    forbidden_words = set(('<num>', '<stop>', '<weird>'))
    return [word for word in processed if word not in forbidden_words]

In [8]:
sents_all = [preprocess(item.strip().split()) for item in df.comment_text]
df['word_list'] = sents_all
df.head(3)

Unnamed: 0,comment_text,label,word_list
0,i suspect atheists are projecting their desire...,-1,"[suspect, atheists, projecting, desires, imagi..."
1,it's funny how the arguments the shills are ma...,-1,"[funny, arguments, shills, making, still, clos..."
2,we are truly following the patterns of how the...,-1,"[truly, following, patterns, mandarins, took, ..."


### LDA

In [9]:
from gensim.corpora import Dictionary
import gensim
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [10]:
#Generate LDA MODEL
from gensim.models.ldamulticore import LdaMulticore, LdaModel
def model_dict(sents, NUM_TOPICS):
    dictionary = Dictionary(sents)
    corpus = [dictionary.doc2bow(sent) for sent in sents]

    ldamodel = LdaMulticore(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=10, random_state=100, iterations=25)

    return corpus, dictionary, ldamodel

In [11]:
#Check topics

def check_topics(corpus, lda_model):
    topics = lda_model.get_document_topics(corpus, per_word_topics=True)


    doc_topics, word_topics, phi_values = topics[10]
    print('Document topic:', doc_topics, "\n")

    for topic, strength in doc_topics:
        print(f'Topico: {topic}')
        print(lda_model.print_topic(topic, topn=20))

    topics = lda_model.print_topics(num_words=10)
    for topic in topics:
        print(topic)

In [12]:
def display_model(corpus, dictionary, ldamodel):  
    lda_display = pyLDAvis.gensim_models.prepare(
        ldamodel,
        corpus,
        dictionary,
        sort_topics=False,
    )
    return pyLDAvis.display(lda_display)

In [13]:
from gensim.models import CoherenceModel

def compute_coherence_values(dictionary, corpus, texts,  k, a, b,start=2, step=3):
    # adaptado de https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

    model = LdaMulticore(corpus = corpus, num_topics=k, id2word=dictionary, random_state=100, passes=10, iterations=25, alpha=a, eta=b, chunksize=100)

    coherencemodel = CoherenceModel(model=model, corpus=corpus, texts=texts, dictionary=dictionary, coherence='u_mass')

    return coherencemodel.get_coherence()

In [14]:
dictionary_all = Dictionary(sents_all)
corpus_all = [dictionary_all.doc2bow(sent) for sent in sents_all]

In [15]:
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 124
max_topics = 125
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus_all)
corpus_sets = [corpus_all]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(dictionary=dictionary_all, corpus=corpus_sets[i], texts=sents_all,k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

  0%|                                                                                          | 0/540 [00:00<?, ?it/s]ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "c:\users\mathe\appdata\local\programs\python\python38\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-552963b5a2b5>", line 40, in <module>
    cv = compute_coherence_values(dictionary=dictionary_all, corpus=corpus_sets[i], texts=sents_all,k=k, a=a, b=b)
  File "<ipython-input-13-28060efcce8f>", line 6, in compute_coherence_values
    model = LdaMulticore(corpus = corpus, num_topics=k, id2word=dictionary, random_state=100, passes=10, iterations=25, alpha=a, eta=b, chunksize=100)
  File "c:\users\mathe\appdata\local\programs\python\python38\lib\site-packages\gensim\models\ldamulticore.py", line 179, in __init__
    super(LdaMulticore, self).__init__(
  File "c:\users\mathe\appdata\local\programs\python\python38\lib\site-packages\gensim\models\ldamodel.py", line 523, in __init__
    self.update(corpus, chunks_as_numpy=use_numpy)
  File "c:\

KeyboardInterrupt: 

interrompido, pois já temos o csv