# Topic Modelling using Biterm Model

In [1]:
import eland as ed
from eland.conftest import *
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models
from wordcloud import WordCloud
from biterm.utility import vec_to_biterms
from biterm.btm import oBTM
import gensim
import matplotlib.pyplot as plt
import pyLDAvis.gensim
import pickle 
import pyLDAvis
from pprint import pprint
import re
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

## Importing data from Elasticsearch

In [2]:
# Eland frame over the 'twitter' index on 'localhost', restricted to the fields used below.
ed_df = ed.DataFrame(
    'localhost',
    'twitter',
    columns=[
        'full_text_processed',
        'user_id',
        'verified',
        'name',
        'location',
        'entities.hashtags.text',
        'entities.user_mentions.name',
    ],
)

# Bool query keeping only original tweets:
#   must   -> is_retweet == "false"
#   filter -> is_quote_status == "false"
query_unique = {
    "bool": {
        "must": {"term": {"is_retweet": "false"}},
        "filter": {"term": {"is_quote_status": "false"}},
    }
}

# Apply the query through Eland, then pull the matching records into a pandas DataFrame.
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

In [3]:
df_tweets.head()

Unnamed: 0,full_text_processed,user_id,verified,name,location,entities.hashtags.text,entities.user_mentions.name
1264160647002103808,praying everyone affected condolence family vi...,1256622599364214786,False,The Meraaki,"Ahmadabad City, India",AmphanSuperCyclone,
1264160609668599808,cyclone ampan people satkhira upset due lack w...,1251934220345208832,False,Newspapers,Dhaka,,
1264121161589415936,cyclone amphan ha completely destroyed agricul...,1251934220345208832,False,Newspapers,Dhaka,,
1264160569315209216,amphan cyclone ​​cm mamta demand ban labor spe...,1113075640499036160,False,netvani,,,
1264114187346874368,amfan storm caused devastation bengal mp nusra...,1113075640499036160,False,netvani,,,


### Tokenising and removing short tweets (4 words or fewer)

In [4]:
# Compiled once when the cell runs instead of being rebuilt on every call.
#
# NOTE: the ranges below overlap heavily. U+24C2-U+1F251 together with
# U+10000-U+10FFFF already cover every code point from U+24C2 upwards, so this
# pattern also strips non-emoji text in that span (e.g. CJK ideographs and
# Hangul syllables), not only emoji. Narrow those two ranges if such text must
# be preserved.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very broad: enclosed alphanumerics up to enclosed ideographs
    "\U0001f926-\U0001f937"  # gesture / person emoji
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # miscellaneous symbols
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of emoji/pictograph characters deleted.

    Matched characters are removed outright (not replaced by a space), so the
    whitespace that surrounded them is left as-is.

    Args:
        text: The string to clean.

    Returns:
        A new string without the characters matched by ``_EMOJI_PATTERN``.
        Besides emoji this includes every code point from U+24C2 upwards.
    """
    return _EMOJI_PATTERN.sub('', text)

In [5]:
# Strip emoji/pictographs from the processed text before tokenising.
cleaned_text = df_tweets['full_text_processed'].apply(remove_emoji)
df_tweets['full_text_processed'] = cleaned_text

# Whitespace tokenisation; the per-tweet token count feeds the filter below.
token_lists = cleaned_text.apply(str.split)
df_tweets['full_text_tokens'] = token_lists
df_tweets['length'] = token_lists.apply(len)

# Keep only tweets with more than 4 tokens.
has_enough_tokens = df_tweets['length'] > 4
df_tweets = df_tweets[has_enough_tokens]

In [6]:
# NLTK's English stop-word list, extended with extra filler terms.
stop_words = [
    *stopwords.words('english'),
    'from', 'not', 'would', 'say', 'could', '_', 'be', 'go', 'do',
    'rather', 'seem', 'due', 'via', 'done', 'said',
]

# Plain Python list of the cleaned tweet texts, used as input to the vectorizer.
tweets = df_tweets['full_text_processed'].to_list()


## Building the Vectorizer

In [7]:
# TF-IDF vectorizer over word 2-grams to 4-grams, with the custom stop-word list
# and min_df=5. The token pattern matches runs of characters other than
# whitespace, '|' and '.', delimited by word boundaries.
vectorizer = TfidfVectorizer(
    ngram_range=(2, 4),
    stop_words=stop_words,
    min_df=5,
    token_pattern='(?u)\\b[^\\s|\\.]+\\b',
)

In [8]:
# Create the TF-IDF matrix
# fit_transform both fits the vectorizer on `tweets` and returns the
# vectorised tweets in a single step.
tweets_tfidf = vectorizer.fit_transform(tweets)
# Last expression of the cell: display the matrix dimensions.
tweets_tfidf.shape

(95337, 79597)

In [9]:
# Tokenize the documents according to the parameters in the vectorizer.
# build_analyzer() does not depend on the tweet, so the callable is built once
# instead of once per document.
# NOTE: this replaces the raw strings in `tweets` with the analyzer's output.
# Re-run the cell that builds `tweets` before running this cell (or the
# fit_transform cell) again.
analyze = vectorizer.build_analyzer()
tweets = [analyze(doc) for doc in tweets]

In [10]:
# Gensim inputs for computing coherence metrics: a token dictionary built from
# the tokenized tweets, and each tweet converted to bag-of-words form.
dictionary = corpora.Dictionary(tweets)
corpus = list(map(dictionary.doc2bow, tweets))

# TF-IDF model fitted on the bag-of-words corpus, then applied to that corpus.
tfidf = gensim.models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

## Get the vocabulary and the biterms from the tweets

In [10]:
# Vocabulary for the BTM: the vectorizer's feature names as a NumPy array.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# list(...) keeps the resulting array a plain string array on both code paths.
vocab = np.array(list(feature_names))

# NOTE(review): `tweets_tfidf` comes straight from `fit_transform`, which
# presumably returns a SciPy sparse matrix. Confirm that `vec_to_biterms`
# extracts column indices correctly from sparse rows; the biterm README
# appears to pass a dense `.toarray()` result instead.
biterms = vec_to_biterms(tweets_tfidf)

In [12]:
# Instantiate the oBTM model: 10 topics over the vectorizer-derived vocabulary.
btm = oBTM(
    V=vocab,
    num_topics=10,
)

In [None]:
# Fit the BTM incrementally, feeding it slices of 10000 `biterms` entries with
# one iteration per slice.
print("\n\n Train BTM ..")
for i in range(0, len(biterms), 10000): # process chunkwise
    biterms_chunk = biterms[i:i + 10000]
    btm.fit(biterms_chunk, iterations=1)

# Transform the full set of biterms once, after every chunk has been fitted.
# This call used to sit inside the loop, so the entire corpus was re-transformed
# after each chunk and all but the last result were thrown away.
topics = btm.transform(biterms)