# Topic Modeling - OLID Training (OFF)

## Data Cleaning 
- Tokenization
- Remove Stop Words
- Remove special characters - "?", ".", "!", "*", ";", ":", "-"
- Lemmatization

In [1]:
# Importing the required libraries
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wordnet
# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# (read in english_stop_words) and WordNet (used by lemmatize).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diptanu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diptanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Tokenize the tweets using Spacy English Tokenizer
import re
def tokenize(text, parser=English()):
    """Split a tweet into normalized word tokens.

    Args:
        text: Raw tweet text.
        parser: Callable that turns text into an iterable of spaCy tokens.
            The default English() pipeline is created once, when the function
            is defined, and reused on every call.

    Returns:
        list: Tokens in their original order, where
            - tokens starting with '@' and whitespace-only tokens are dropped,
            - URL-like tokens are replaced by the literal 'URL',
            - every other token is lowercased.
    """
    kept = []
    for tok in parser(text):
        surface = tok.orth_
        # Skip tokens beginning with '@' and pure whitespace.
        if surface.startswith('@') or surface.isspace():
            continue
        kept.append('URL' if tok.like_url else tok.lower_)
    return kept

In [3]:
# Remove the stopwords
def english_stop_words():
    """Return a fresh set of tokens to discard during cleaning.

    Returns:
        set: NLTK's English stop words plus a handful of punctuation marks.
    """
    # NOTE(review): "." appears twice in this list; harmless in a set, but a ","
    # may have been intended — confirm.
    punctuation_marks = ["?", "." , ".", "!", "*", ";", ":", "-"]
    return set(nltk.corpus.stopwords.words('english')).union(punctuation_marks)

In [4]:
# Lemmatize the text
def lemmatize(word):
    """Return the WordNet base form of `word`, or `word` itself if none is found.

    Args:
        word: A single token.

    Returns:
        The result of wordnet.morphy(word) unless that is None, in which case
        the input word is returned unchanged.
    """
    base_form = wordnet.morphy(word)
    return word if base_form is None else base_form

In [5]:
# Calling all the methods to clean the data
def clean_data(text):
    """Turn one raw tweet into a list of cleaned, lemmatized tokens.

    Pipeline: tokenize(text) -> drop every token contained in
    english_stop_words() -> lemmatize() each remaining token.

    Args:
        text: Raw tweet text.

    Returns:
        list: The surviving tokens after lemmatization, in their original order.
    """
    # Build the stop-word set once per call. Evaluating english_stop_words()
    # inside the comprehension condition rebuilt the whole set for every token.
    stop_words = english_stop_words()
    return [lemmatize(token) for token in tokenize(text) if token not in stop_words]

### Import the Dataset and Apply cleaning steps

In [6]:
# Apply data cleaning steps on OLID - Training Dataset
import csv as tsv
# Cleaned token lists, one per tweet whose third column equals 'OFF'.
text_data = []
# Decode explicitly as UTF-8 (the tweets contain emoji and curly quotes) instead of
# relying on the platform default, and pass newline='' as the csv module recommends
# for file objects handed to csv.reader.
with open('olid-training-v1.0.tsv', encoding='utf-8', newline='') as input_file:
    tsv_reader = tsv.reader(input_file, delimiter='\t')
    for row in tsv_reader:
        # Rows with fewer than three columns (e.g. blank lines) are skipped rather
        # than raising IndexError.
        # presumably column 1 is the tweet text and column 2 the label — verify against the TSV header
        if len(row) > 2 and row[2] == 'OFF':
            text_data.append(clean_data(row[1]))

## Topic Modeling on the dataset

In [9]:
# Importing the required libraries
import gensim
from gensim import corpora

# Number of topics requested from the LDA model trained below
num_of_topics = 10

In [10]:
# Build the token dictionary from the cleaned tweets, then convert each tweet
# into its bag-of-words representation with doc2bow.
dictionary = corpora.Dictionary(text_data)
corpus = list(map(dictionary.doc2bow, text_data))

In [11]:
# Train the LDA model. LDA training is stochastic, so the seed is fixed to make
# a Restart & Run All reproduce the same topics; without it every run yields
# different topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_of_topics,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# Print each topic with its 10 highest-weighted words.
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.069*"gun" + 0.055*"control" + 0.012*"(" + 0.012*")" + 0.012*"ass" + 0.011*"n’t" + 0.010*"url" + 0.009*"laws" + 0.009*"people" + 0.007*"human"')
(1, '0.031*"🤣" + 0.018*"liberal" + 0.012*"fake" + 0.009*"medium" + 0.008*"beto" + 0.008*"would" + 0.008*"freak" + 0.007*"cnn" + 0.007*"bottom" + 0.006*"blame"')
(2, '0.202*"#" + 0.042*"maga" + 0.036*"url" + 0.007*"\'s" + 0.006*"liberal" + 0.006*"..." + 0.006*"conservative" + 0.005*"walkaway" + 0.005*"need" + 0.005*"democrat"')
(3, '0.022*"liberal" + 0.017*"\'s" + 0.017*"trump" + 0.011*"disgust" + 0.010*"president" + 0.009*"woman" + 0.009*"conservative" + 0.008*"n\'t" + 0.007*"url" + 0.007*"democrat"')
(4, '0.042*"shit" + 0.026*"fuck" + 0.025*"..." + 0.025*"😂" + 0.014*"n’t" + 0.012*".." + 0.012*"like" + 0.010*"...." + 0.009*"lol" + 0.008*"️"')
(5, '0.025*"n\'t" + 0.024*"liberal" + 0.020*"people" + 0.016*"\'s" + 0.013*"like" + 0.012*"’s" + 0.011*"antifa" + 0.010*"n’t" + 0.010*"one" + 0.010*"white"')
(6, '0.028*"&" + 0.028*"amp" + 0.014*"ge

### Visualize the topics and words

In [12]:
# Visualize the topics
import pyLDAvis.gensim
# Prepare the pyLDAvis data from the trained model, the bag-of-words corpus and
# the dictionary. sort_topics=False is passed so pyLDAvis does not re-sort the
# topics (per the pyLDAvis docs this keeps the model's original topic order — confirm).
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
# Last expression of the cell: renders the interactive visualization inline.
pyLDAvis.display(lda_display)