# Topic Modeling - OLID Training (OFF)

## Data Cleaning 
- Tokenization
- Remove Stop Words
- Remove special characters - "?", "." , ".", "!", "*", ";", ":", "-"
- Lemmatization

In [146]:
# Importing the required libraries
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wordnet
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diptanu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diptanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [147]:
# Tokenize the tweets using Spacy English Tokenizer
import re
def tokenize(text, parser=English()):
    word_tokens_list = []
    tokens = parser(text)
    for token in tokens:
        # print(str(token.orth_).)
        if token.orth_.startswith('@') or token.orth_.isspace():
            continue
        elif token.like_url:
            word_tokens_list.append('URL')
        else:
            word_tokens_list.append(token.lower_)
    return word_tokens_list

In [148]:
# Remove the stopwords
def english_stop_words():
    stop_words_set = set(nltk.corpus.stopwords.words('english'))
    # Add common punctuations to remove
    stop_words_set.update(["?", "." , ".", "!", "*", ";", ":", "-"])
    return stop_words_set

In [149]:
# Lemmatize the text
def lemmatize(word):
    lemma = wordnet.morphy(word)
    if lemma is None:
        return word
    return lemma

In [150]:
# Calling all the methods to clean the data
def clean_data(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if token not in english_stop_words()]
    tokens = [lemmatize(token) for token in tokens]
    return tokens

### Import the Dataset and Apply cleaning steps

In [151]:
# Apply data cleaning steps on OLID - Training Dataset
import csv as tsv
text_data = []
with open('olid-training-v1.0.tsv') as input_file:
    tsv_reader = tsv.reader(input_file, delimiter='\t')
    for row in tsv_reader:
        if (row[2] == 'OFF'):
            tokens = clean_data(row[1])
            text_data.append(tokens)

## Topic Modeling on the dataset

In [159]:
# Importing the required libraries
import gensim
from gensim import corpora

# Number of topics
num_of_topics = 10

In [162]:
len(text_data)

4400

In [160]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [161]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_of_topics, id2word=dictionary, passes=5)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.029*"😂" + 0.016*"shit" + 0.013*"people" + 0.010*"ass" + 0.009*"human" + 0.009*"️" + 0.008*"get" + 0.008*"’s" + 0.007*"nigga" + 0.007*"liberal"')
(1, '0.108*""" + 0.059*"," + 0.012*"liberal" + 0.012*"🇺" + 0.011*"🇸" + 0.010*"\'s" + 0.007*"sick" + 0.007*"go" + 0.007*"trump" + 0.007*"conservative"')
(2, '0.057*"gun" + 0.045*"control" + 0.015*"fuck" + 0.012*"n’t" + 0.011*"’s" + 0.010*"url" + 0.009*"like" + 0.008*"laws" + 0.008*"bitch" + 0.007*"law"')
(3, '0.030*"..." + 0.021*"\'s" + 0.018*"n\'t" + 0.014*"like" + 0.013*"one" + 0.012*"liberal" + 0.012*"people" + 0.011*"antifa" + 0.011*".." + 0.010*"say"')
(4, '0.032*"liberal" + 0.014*"good" + 0.014*"n\'t" + 0.010*"\'s" + 0.010*"think" + 0.009*"like" + 0.008*"really" + 0.008*"way" + 0.008*"ass" + 0.007*"call"')
(5, '0.023*"”" + 0.023*"“" + 0.018*"’s" + 0.014*"liberal" + 0.012*"vote" + 0.010*"pay" + 0.009*"money" + 0.008*"shit" + 0.007*"time" + 0.007*"get"')
(6, '0.023*"&" + 0.023*"amp" + 0.019*"n’t" + 0.014*"know" + 0.010*"’s" + 0.010*"

In [163]:
# Visualize the topics
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)