In [1]:
import re
import pandas as pd
from wordcloud import WordCloud
import nltk
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
def clean(text):
    text = text.lower()
    text = re.sub(
        r"https?:\/\/(www\.)? ?[-a-zA-Z0-9@:%._\+~#=]{1,256}\. ?[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)",
        "",
        text,
    )
    text = re.sub(r"(it | he | she | that)'?s", '\1 is', text)
    text = re.sub(r"(they | we | you)'re", '\1 are', text)
    text = re.sub(r"(they | we | you)'?ve", '\1 have', text)
    text = re.sub(r"this'", 'this is', text)
    text = re.sub(r"http", ' ', text)
    text = re.sub(r"html", ' ', text)
    text = re.sub(r"didn'?t", 'did not', text)
    text = re.sub(r"don'?t", 'do not', text)
    text = re.sub(r"don'?t", 'do not', text)
    text = re.sub(r"can'?t", 'cannot', text)
    text = re.sub(r"&.+;", ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [3]:
df = pd.read_csv('irony-labeled.csv')
df['comment_text'] = df['comment_text'].apply(clean)
df.tail(3)

Unnamed: 0,comment_text,label
1946,[... what? ](,-1
1947,does anybody remember during one of the debate...,-1
1948,the pope is meeting a cruel dictator. likely w...,1


In [4]:
from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')
STOPWORDS = set(STOPWORDS)

In [5]:
def preprocess(sent):
    def convert(word):
        word = re.sub(r"\W+", "", word)
        # Verifica se é um número.
        try:
            _ = float(word)
            return '<num>'
        except:
            pass

        # Verifica se é uma palavra.
        if word.isalpha():
            lower = word.lower()
            return '<stop>' if lower in STOPWORDS else lower

        # Caso contrário, é pontuação ou estranho.
        return '<weird>'

    processed = [convert(word) for word in sent]
    forbidden_words = set(('<num>', '<stop>', '<weird>'))
    return [word for word in processed if word not in forbidden_words]

In [6]:
sents_all = [preprocess(item.strip().split()) for item in df.comment_text]
df['word_list'] = sents_all
df.head(3)

Unnamed: 0,comment_text,label,word_list
0,i suspect atheists are projecting their desire...,-1,"[suspect, atheists, projecting, desires, imagi..."
1,it's funny how the arguments the shills are ma...,-1,"[funny, arguments, shills, making, still, clos..."
2,we are truly following the patterns of how the...,-1,"[truly, following, patterns, mandarins, took, ..."


In [7]:
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore

In [8]:
from gensim.models import CoherenceModel

# O random_state foi usado aqui para manter consistência na apresentação dos resultados. Ao remover o estado fixado, no entanto, os valores dos modelos de tópicos se mantém com a característica decrescente observada. 
# Ou seja, não importa quando estado fixado, os resultados sempre demonstram que modelos de 2 tópicos são os mais coerentes, seguidos dos de 3 tópicos, etc.

def compute_coherence_values(dictionary, corpus, texts,  k, a, b,start=2, step=3):

    model = LdaMulticore(corpus = corpus, num_topics=k, id2word=dictionary, random_state=100, passes=10, iterations=25, alpha=a, eta=b, chunksize=100)

    coherencemodel = CoherenceModel(model=model, corpus=corpus, texts=texts, dictionary=dictionary, coherence='u_mass')

    return coherencemodel.get_coherence()

In [9]:
dictionary_all = Dictionary(sents_all)
corpus_all = [dictionary_all.doc2bow(sent) for sent in sents_all]

In [10]:
import numpy as np
from tqdm import tqdm
# Topicos
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha 
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta 
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

num_of_docs = len(corpus_all)
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# itera sobre topicos
for k in tqdm(topics_range):
    # itera sobre alpha
    for a in alpha:
        # itera sobre beta
        for b in beta:
            # obtem coerencia dados os parametros
            cv = compute_coherence_values(dictionary=dictionary_all, corpus=corpus_all, texts=sents_all, k=k, a=a, b=b)
            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(cv)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [30:41<00:00, 204.60s/it]


In [11]:
pd.read_csv('lda_tuning_results.csv').groupby(['Topics']).max()

Unnamed: 0_level_0,Alpha,Beta,Coherence
Topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,symmetric,symmetric,-2.228871
3,symmetric,symmetric,-2.240554
4,symmetric,symmetric,-2.538752
5,symmetric,symmetric,-2.509785
6,symmetric,symmetric,-3.212903
7,symmetric,symmetric,-3.580729
8,symmetric,symmetric,-4.300689
9,symmetric,symmetric,-3.969757
10,symmetric,symmetric,-4.588944
