In [12]:
import pandas as pd
import nltk
from nltk import word_tokenize, download
from nltk.corpus import stopwords

from gensim import models
from gensim.corpora import Dictionary, MmCorpus

import pickle

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrienawong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adrienawong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

In [4]:
def prep_corpus(docs, additional_stopwords=set(), no_below=1, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus

### With the Kaggle dataset

In [18]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [19]:
def preprocess_wiki(comment):
    comment = word_tokenize(comment)  # Split into words.
    for c in comment:
        if '\n' in c:
            c = c.replace('\n', '')
    comment = [w for w in comment if not w in stop_words]  # Remove stopwords.
    comment = [w for w in comment if w.isalpha()]  # Remove numbers and punctuation.
    return comment

In [20]:
df['comment_text'] = df['comment_text'].fillna('fillna').str.lower()
df['text_tokens'] = df['comment_text'].apply(preprocess_wiki)
df['text_tokens']

0         [explanation, edits, made, username, hardcore,...
1         [matches, background, colour, seemingly, stuck...
2         [hey, man, really, trying, edit, war, guy, con...
3         [ca, make, real, suggestions, improvement, won...
4                       [sir, hero, chance, remember, page]
                                ...                        
159566    [second, time, asking, view, completely, contr...
159567          [ashamed, horrible, thing, put, talk, page]
159568    [spitzer, umm, theres, actual, article, prosti...
159569    [looks, like, actually, put, speedy, first, ve...
159570    [really, think, understand, came, idea, bad, r...
Name: text_tokens, Length: 159571, dtype: object

In [21]:
dictionary_wiki, corpus_wiki = prep_corpus(df['text_tokens'])
lda_wiki = models.ldamodel.LdaModel(corpus=corpus_wiki, id2word=dictionary_wiki, num_topics=5, passes=50)

Building dictionary...
Building corpus...


In [22]:
file = open('lda_wiki.pkl', 'wb')
pickle.dump(lda_wiki, file)
file.close()

file = open('dictionary_wiki.pkl', 'wb')
pickle.dump(dictionary_wiki, file)
file.close()

In [23]:
# Reading dictionary and model files:

# with open('lda_wiki.pkl', 'rb') as f:
#     lda_wiki = pickle.load(f)
# with open('dictionary_wiki.pkl', 'rb') as f:
#     dictionary_wiki = pickle.load(f)

In [24]:
mm = [dictionary_wiki.doc2bow(text) for text in df['text_tokens']]
topics = pd.DataFrame(dict(lda_wiki[x]) for x in mm)
df['topics'] = topics.values.tolist()

In [25]:
#Write to CSV
print('Writing to CSV')
df.to_csv("topic_modelling_wiki.csv")

Writing to CSV


In [26]:
lda_wiki.show_topics(formatted=False)

[(0,
  [('http', 0.0063881697),
   ('also', 0.0063691023),
   ('one', 0.006343619),
   ('article', 0.0055740587),
   ('name', 0.004725555),
   ('new', 0.004707772),
   ('first', 0.004024193),
   ('list', 0.003901345),
   ('would', 0.003892216),
   ('used', 0.003563209)]),
 (1,
  [('article', 0.014565899),
   ('would', 0.009152092),
   ('one', 0.007617213),
   ('page', 0.0074247657),
   ('like', 0.0072036837),
   ('think', 0.0069368244),
   ('wikipedia', 0.0063149254),
   ('know', 0.006283714),
   ('talk', 0.0057841027),
   ('see', 0.005445318)]),
 (2,
  [('fuck', 0.030818222),
   ('lol', 0.026561284),
   ('fat', 0.014609777),
   ('go', 0.014600924),
   ('hate', 0.013998529),
   ('faggot', 0.013663317),
   ('hey', 0.013011995),
   ('fucking', 0.012422295),
   ('u', 0.01149223),
   ('dont', 0.011053541)]),
 (3,
  [('people', 0.008179867),
   ('aids', 0.007787954),
   ('jew', 0.0059157857),
   ('freedom', 0.004266559),
   ('us', 0.0041607697),
   ('war', 0.0040959264),
   ('state', 0.0040

### With the Twitter dataset

In [5]:
twitter_df = pd.read_csv('../data/sample_with_label.csv')
twitter_df.head()

Unnamed: 0.1,Unnamed: 0,TweetID,UserName,TweetText,TweetDateTime,Followers,UserID,WasDeleted
0,0,1244721273793646594,ShannePanne,b'Rhoa',2020-03-30 20:20:52,287,990004076829200385,0
1,1,1244721275232301058,25_ShadesOfK,b'@HisTemp_TAYtion Lol I learned from that shi...,2020-03-30 20:20:52,1189,624244930,0
2,2,1244721275936878593,PlagueJesterSky,b'Trying to set up the stream and I keep getti...,2020-03-30 20:20:52,415,755613447702847488,0
3,3,1244721278650650624,spitbull1963,b'@EricksonReal @Ilhan And you can test as muc...,2020-03-30 20:20:53,16,47425986,0
4,4,1244721278931664896,kamanfrancis,b'Me taking notes for the future when I\xe2\x8...,2020-03-30 20:20:53,1928,269295980,0


In [6]:
def preprocess_twitter(comment):
    comment = word_tokenize(comment)  # Split into words.
    for c in comment:
        if '\'b' in c:
            c = c.replace('\'b', '')
        if '\\x' in c:
            c = c.replace('\\x', ' \\x')
    comment = [w for w in comment if not w in stop_words]  # Remove stopwords.
    comment = [w for w in comment if w.isalpha()]  # Remove numbers and punctuation.
    return comment

In [8]:
twitter_df['TweetText'] = twitter_df['TweetText'].fillna('fillna').str.lower()
twitter_df['text_tokens'] = twitter_df['TweetText'].apply(preprocess_twitter)
twitter_df['text_tokens']

0                                                       []
1        [b, lol, learned, shit, putting, eggs, one, ba...
2        [set, stream, keep, getting, error, code, come...
3        [b, ericksonreal, ilhan, test, much, like, fuc...
4        [taking, notes, future, rush, got, ta, daughte...
                               ...                        
29444    [b, mertipekcix, bu, realde, sadece, fuck, ne,...
29445    [girls, onlyfans, making, bread, rn, world, come]
29446    [b, forlornjunkheap, brujahistorica, serena, s...
29447     [b, whos, favorite, animal, crossing, character]
29448    [b, torqueaboutit, nothing, go, absolutely, no...
Name: text_tokens, Length: 29449, dtype: object

In [9]:
dictionary_twitter, corpus_twitter = prep_corpus(twitter_df['text_tokens'])
lda_twitter = models.ldamodel.LdaModel(corpus=corpus_twitter, id2word=dictionary_twitter, num_topics=5, passes=50)

Building dictionary...
Building corpus...


In [13]:
file = open('lda_twitter.pkl', 'wb')
pickle.dump(lda_twitter, file)
file.close()

file = open('dictionary_twitter.pkl', 'wb')
pickle.dump(dictionary_twitter, file)
file.close()

In [14]:
# Reading dictionary and model files:

# with open('lda_twitter.pkl', 'rb') as f:
#     lda_twitter = pickle.load(f)
# with open('dictionary_twitter.pkl', 'rb') as f:
#     dictionary_twitter = pickle.load(f)

In [15]:
mm = [dictionary_twitter.doc2bow(text) for text in twitter_df['text_tokens']]
topics = pd.DataFrame(dict(lda_twitter[x]) for x in mm)
twitter_df['topics'] = topics.values.tolist()

In [16]:
#Write to CSV
print('Writing to CSV')
twitter_df.to_csv("topic_modelling_twitter.csv")

Writing to CSV


In [17]:
lda_twitter.show_topics(formatted=False)

[(0,
  [('cunt', 0.0298522),
   ('pay', 0.008277647),
   ('sorry', 0.006177196),
   ('called', 0.0049548536),
   ('kill', 0.0047159223),
   ('parents', 0.0038456004),
   ('birthday', 0.0028735774),
   ('three', 0.0028130827),
   ('wait', 0.002510068),
   ('video', 0.002494603)]),
 (1,
  [('fuck', 0.08043622),
   ('like', 0.019773507),
   ('people', 0.012822449),
   ('shit', 0.010465467),
   ('get', 0.010316776),
   ('fucking', 0.009629459),
   ('know', 0.0091151),
   ('go', 0.008082516),
   ('would', 0.007985728),
   ('bitch', 0.0075952075)]),
 (2,
  [('bread', 0.06674111),
   ('hands', 0.020240443),
   ('https', 0.01815345),
   ('wash', 0.0177393),
   ('stay', 0.011291202),
   ('home', 0.00843354),
   ('make', 0.008258244),
   ('banana', 0.007285472),
   ('amp', 0.0070819524),
   ('get', 0.0061140424)]),
 (3,
  [('crossing', 0.1228715),
   ('animal', 0.11893973),
   ('https', 0.023913952),
   ('play', 0.014366834),
   ('new', 0.0121812085),
   ('switch', 0.0116302),
   ('island', 0.01