In [1]:
# Dataset preprocessing

import json
import pandas as pd 
import numpy as np

# read json file as dataframe
parler_df_1 = pd.read_json('D:\\bachelors_thesis\Datasets\parler_data\parler_data000000000001.ndjson', lines = True) 

In [2]:
print(parler_df_1.columns)

Index(['comments', 'body', 'bodywithurls', 'createdAt', 'createdAtformatted',
       'creator', 'datatype', 'depth', 'depthRaw', 'followers', 'following',
       'hashtags', 'id', 'lastseents', 'links', 'media', 'posts', 'sensitive',
       'shareLink', 'upvotes', 'urls', 'username', 'verified', 'article',
       'impressions', 'preview', 'reposts', 'state', 'parent', 'color',
       'commentDepth', 'controversy', 'downvotes', 'post', 'score',
       'isPrimary', 'conversation', 'replyingTo'],
      dtype='object')


In [3]:
# remove columns  

print(parler_df_1.columns)
print('Dimension of whole dataframe: ' + str(parler_df_1.shape)) # df.shape -> (rows, columns)

# final_df_1 = pd.DataFrame()
# final_df_1['body'] = df['body'].copy()
# final_df_1['createdAtformatted'] = df['createdAtformatted'].copy()
parler_df_1.drop(parler_df_1.iloc[:, 2:38], inplace = True, axis = 1) # remove all columns between column index 2 to 38
parler_df_1.drop(['comments'], inplace = True, axis = 1) # remove first column

print(parler_df_1.columns)
print('Dimension of final dataframe: ' + str(parler_df_1.shape))

Index(['comments', 'body', 'bodywithurls', 'createdAt', 'createdAtformatted',
       'creator', 'datatype', 'depth', 'depthRaw', 'followers', 'following',
       'hashtags', 'id', 'lastseents', 'links', 'media', 'posts', 'sensitive',
       'shareLink', 'upvotes', 'urls', 'username', 'verified', 'article',
       'impressions', 'preview', 'reposts', 'state', 'parent', 'color',
       'commentDepth', 'controversy', 'downvotes', 'post', 'score',
       'isPrimary', 'conversation', 'replyingTo'],
      dtype='object')
Dimension of whole dataframe: (1095287, 38)
Index(['body'], dtype='object')
Dimension of final dataframe: (1095287, 1)


In [4]:
# filtering out null values

print('Dimension of final whole dataframe: ' + str(parler_df_1.shape)) 
print(parler_df_1['body'])

parler_df_1['body'].replace("", np.nan, inplace=True)
parler_df_1.dropna(subset=['body'], inplace=True)

print('Dimension of final dataframe after preprocessing: ' + str(parler_df_1.shape)) 

print(parler_df_1['body'])

Dimension of final whole dataframe: (1095287, 1)
0          ‪ CHILLS! \n\nI loved hearing Airforce Technic...
1                                                           
2          Justice Department Zeroes In On Cuomo's COVID ...
3          Interesting how ultra-liberal, hysterical, Hol...
4          Petraeus Says Trump May Have Restored U.S. 'De...
                                 ...                        
1095282    I just shared this to "Chrissy" Wallace's FB m...
1095283                           What have you got to lose?
1095284                                                     
1095285    So... are we gonna talk about the fact that on...
1095286                                                     
Name: body, Length: 1095287, dtype: object
Dimension of final dataframe after preprocessing: (636482, 1)
0          ‪ CHILLS! \n\nI loved hearing Airforce Technic...
2          Justice Department Zeroes In On Cuomo's COVID ...
3          Interesting how ultra-liberal, hysterical,

In [5]:
# 2. Prerequisites – Download nltk stopwords and spacy model

# Run in python console
import nltk; nltk.download('stopwords')

# Run in terminal or command prompt
# python3 -m spacy download en

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adela\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# 3. Import Packages

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim # !!! name changed from gensim to gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
# 5. Prepare Stopwords

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [11]:
# 7. Remove emails and newline characters

# Convert to list
data = parler_df_1.body.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['\u202a CHILLS! I loved hearing Airforce Technical SGT. Nalani Quintello '
 'singing the National Anthem in front of President Trump & First Lady, '
 'Melania Trump & all the fans at #DAYTONA500 A huge shout out to the amazing '
 'Thunderbirds for the fly over! This is the America I Love🇺🇸 ECHO if you feel '
 'the same! \u202c']


In [12]:
# 8. Tokenize words and Clean-up text

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['chills', 'loved', 'hearing', 'airforce', 'technical', 'sgt', 'nalani', 'quintello', 'singing', 'the', 'national', 'anthem', 'in', 'front', 'of', 'president', 'trump', 'first', 'lady', 'melania', 'trump', 'all', 'the', 'fans', 'at', 'daytona', 'huge', 'shout', 'out', 'to', 'the', 'amazing', 'thunderbirds', 'for', 'the', 'fly', 'over', 'this', 'is', 'the', 'america', 'love', 'echo', 'if', 'you', 'feel', 'the', 'same']]


In [13]:
# 9. Creating Bigram and Trigram Models

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['chills', 'loved', 'hearing', 'airforce', 'technical', 'sgt', 'nalani', 'quintello', 'singing', 'the', 'national_anthem', 'in', 'front', 'of', 'president', 'trump', 'first', 'lady_melania', 'trump', 'all', 'the', 'fans', 'at', 'daytona', 'huge', 'shout', 'out', 'to', 'the', 'amazing', 'thunderbirds', 'for', 'the', 'fly', 'over', 'this', 'is', 'the', 'america', 'love', 'echo', 'if', 'you', 'feel', 'the', 'same']


In [14]:
# 10. Remove Stopwords, Make Bigrams and Lemmatize

# Define functions for stopwords, bigrams, trigrams and lemmatization

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en 
# !!! As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full pipeline package name 'en_core_web_sm' instead.
# nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['chill', 'love', 'hearing', 'airforce', 'technical', 'nalani', 'quintello', 'singe', 'front', 'trump', 'fan', 'daytona', 'huge', 'shout', 'amazing', 'thunderbird', 'fly', 'feel']]


In [15]:
# 11. Create the Dictionary and Corpus needed for Topic Modeling

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

id2word[0]

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]


[[('airforce', 1),
  ('amazing', 1),
  ('chill', 1),
  ('daytona', 1),
  ('fan', 1),
  ('feel', 1),
  ('fly', 1),
  ('front', 1),
  ('hearing', 1),
  ('huge', 1),
  ('love', 1),
  ('nalani', 1),
  ('quintello', 1),
  ('shout', 1),
  ('singe', 1),
  ('technical', 1),
  ('thunderbird', 1),
  ('trump', 1)]]

In [17]:
from numba import jit

In [19]:
# 12. Building the Topic Model
# https://radimrehurek.com/gensim/models/ldamodel.html

# Build LDA model
@jit
def LDA():
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
LDA()

Compilation is falling back to object mode WITH looplifting enabled because Function "LDA" failed type inference due to: [1mUntyped global name 'id2word':[0m [1m[1mCannot determine Numba type of <class 'gensim.corpora.dictionary.Dictionary'>[0m
[1m
File "..\..\Users\adela\AppData\Local\Temp\ipykernel_23204\465458224.py", line 8:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m[0m
  @jit
[1m
File "..\..\Users\adela\AppData\Local\Temp\ipykernel_23204\465458224.py", line 5:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
[1m
File "..\..\Users\adela\AppData\Local\Temp\ipykernel_23204\465458224.py", line 5:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m


In [None]:
# 13. View the topics in LDA model

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# 14. Compute Model Perplexity and Coherence Score

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# 15. Visualize the topics-keywords

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word) # gensim_models instead od gensim https://github.com/bmabey/pyLDAvis/issues/131
vis

In [8]:
# Essentials
import base64
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datapane as dp
#dp.login(token='INSERT_TOKEN_HERE')
# Gensim and LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
# NLP stuff
import contractions
import demoji
import string
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
nltk.download('wordnet')
import spacy
# Plotting tools
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
%matplotlib inline
# Miscellaneous
from sklearn.manifold import TSNE
from pprint import pprint

def preprocess(text_col):
    """This function will apply NLP preprocessing lambda functions over a pandas series such as df['text'].
       These functions include converting text to lowercase, removing emojis, expanding contractions, removing punctuation,
       removing numbers, removing stopwords, lemmatization, etc."""
    
    # convert to lowercase
    text_col = text_col.apply(lambda x: ' '.join([w.lower() for w in x.split()]))
    
    # remove emojis
    text_col = text_col.apply(lambda x: demoji.replace(x, ""))
    
    # expand contractions  
    text_col = text_col.apply(lambda x: ' '.join([contractions.fix(word) for word in x.split()]))

    # remove punctuation
    text_col = text_col.apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))
    
    # remove numbers
    text_col = text_col.apply(lambda x: ' '.join(re.sub("[^a-zA-Z]+", " ", x).split()))

    # remove stopwords
    stopwords = [sw for sw in nltk.corpus.stopwords.words('english') if sw not in ['not', 'no']]
    text_col = text_col.apply(lambda x: ' '.join([w for w in x.split() if w not in stopwords]))

    # lemmatization
    text_col = text_col.apply(lambda x: ' '.join([WordNetLemmatizer().lemmatize(w) for w in x.split()]))

    # remove short words
    text_col = text_col.apply(lambda x: ' '.join([w.strip() for w in x.split() if len(w.strip()) >= 3]))

    return text_col

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adela\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
print(parler_df_1['body'])

preprocess(parler_df_1['body'])

print(parler_df_1['body'])

0          ‪ CHILLS! \n\nI loved hearing Airforce Technic...
2          Justice Department Zeroes In On Cuomo's COVID ...
3          Interesting how ultra-liberal, hysterical, Hol...
4          Petraeus Says Trump May Have Restored U.S. 'De...
5          Hillary Clinton Calls Bernie Sanders a Sore Lo...
                                 ...                        
1095275    SHE WANTS ANOTHER BENGHAZI and TO MURDER MORE ...
1095281                       🇺🇸❤️🇺🇸❤️🙏🏻 #trump2020landslide
1095282    I just shared this to "Chrissy" Wallace's FB m...
1095283                           What have you got to lose?
1095285    So... are we gonna talk about the fact that on...
Name: body, Length: 636482, dtype: object


KeyboardInterrupt: 