# Topic Modeling - OLID Training (OFF)

## Data Cleaning 
- Tokenization
- Remove Stop Words
- Remove special characters - "?", "." , ".", "!", "*", ";", ":", "-"
- Lemmatization

In [146]:
# Importing the required libraries
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wordnet
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diptanu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diptanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [147]:
# Tokenize the tweets using Spacy English Tokenizer
import re
def tokenize(text, parser=English()):
    word_tokens_list = []
    tokens = parser(text)
    for token in tokens:
        # print(str(token.orth_).)
        if token.orth_.startswith('@') or token.orth_.isspace():
            continue
        elif token.like_url:
            word_tokens_list.append('URL')
        else:
            word_tokens_list.append(token.lower_)
    return word_tokens_list

In [148]:
# Remove the stopwords
def english_stop_words():
    stop_words_set = set(nltk.corpus.stopwords.words('english'))
    # Add common punctuations to remove
    stop_words_set.update(["?", "." , ".", "!", "*", ";", ":", "-"])
    return stop_words_set

In [149]:
# Lemmatize the text
def lemmatize(word):
    lemma = wordnet.morphy(word)
    if lemma is None:
        return word
    return lemma

In [150]:
# Calling all the methods to clean the data
def clean_data(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if token not in english_stop_words()]
    tokens = [lemmatize(token) for token in tokens]
    return tokens

### Import the Dataset and Apply cleaning steps

In [151]:
# Apply data cleaning steps on OLID - Training Dataset
import csv as tsv
text_data = []
with open('olid-training-v1.0.tsv') as input_file:
    tsv_reader = tsv.reader(input_file, delimiter='\t')
    for row in tsv_reader:
        if (row[2] == 'OFF'):
            tokens = clean_data(row[1])
            text_data.append(tokens)

## Topic Modeling on the dataset

In [159]:
# Importing the required libraries
import gensim
from gensim import corpora

# Number of topics
num_of_topics = 10

In [162]:
len(text_data)

4400

In [160]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [164]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_of_topics, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

  and should_run_async(code)


(0, '0.049*"&" + 0.045*"amp" + 0.024*"ass" + 0.020*"bitch" + 0.007*"\'s" + 0.007*"even" + 0.006*"right" + 0.006*"joe" + 0.006*"go" + 0.006*"could"')
(1, '0.016*"..." + 0.016*"\'s" + 0.014*"’s" + 0.014*"antifa" + 0.013*"like" + 0.012*"fuck" + 0.012*"n\'t" + 0.012*"fucking" + 0.011*"n’t" + 0.011*"url"')
(2, '0.036*"liberal" + 0.021*"n\'t" + 0.019*".." + 0.014*"get" + 0.010*"sick" + 0.009*"trump" + 0.008*"suck" + 0.008*"president" + 0.007*"shit" + 0.007*"hear"')
(3, '0.018*"..." + 0.011*"people" + 0.010*"would" + 0.009*"gun" + 0.008*"’s" + 0.008*"control" + 0.008*"️" + 0.007*"u" + 0.007*"idiot" + 0.007*"("')
(4, '0.091*""" + 0.048*"," + 0.020*"😂" + 0.019*"\'s" + 0.015*"conservative" + 0.015*"n\'t" + 0.014*"liberal" + 0.009*"use" + 0.009*"\'" + 0.008*"woman"')
(5, '0.026*"know" + 0.020*"liberal" + 0.011*"shit" + 0.009*"people" + 0.009*"n’t" + 0.009*"think" + 0.008*"nigga" + 0.008*"disgust" + 0.008*"dumb" + 0.007*"full"')
(6, '0.046*"gun" + 0.036*"control" + 0.011*"liberal" + 0.009*"url" + 

In [165]:
# Visualize the topics
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)
