In [98]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re, random, os
import string, pprint
import spacy

In [99]:
# Importing gensim module for Text analysis
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [100]:
# This is to visualise LDA (latent dirichlet allocation) which will divide into topics i.e Topic Modelling
import pyLDAvis
import pyLDAvis.gensim

In [101]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [103]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [104]:
#Reading the tweets file which we have downloaded
tweet=pd.read_csv('BurganBankGroup_tweets.csv',encoding='ASCII')

In [106]:
tweet.head()

Unnamed: 0,id,created_at,text
0,1294358298498564096,2020-08-14 19:40:41,b'#BurganBank\n#BurganTips https://t.co/2ELGNX...
1,1294203395507658752,2020-08-14 09:25:09,b'#BurganBank\n#BurganTips\n#CoronaTips https:...
2,1293977600713515015,2020-08-13 18:27:55,b'\xd9\x88\xd8\xb6\xd8\xb9 \xd9\x8a\xd9\x88\xd...
3,1293908033492226054,2020-08-13 13:51:29,b'\xd8\xa8\xd8\xae\xd8\xa7\xd8\xb7\xd8\xb1\xd9...
4,1293846145853263872,2020-08-13 09:45:34,b'#BurganBank\n#BurganTips\n#CoronaTips https:...


In [107]:
len(tweet.index)

3243

In [108]:
tweet.text[1]

"b'#BurganBank\\n#BurganTips\\n#CoronaTips https://t.co/YnHGRbWZG5'"

In [109]:
# removing the first two letters (i.e ",b) and ending letter(i.e ")
tweet.text=tweet.text.apply(lambda x : x[2:])
tweet.text=tweet.text.apply(lambda x : x[:-1])

In [110]:
tweet.text[1]

'#BurganBank\\n#BurganTips\\n#CoronaTips https://t.co/YnHGRbWZG5'

In [111]:
def replace_str(i,j):
    tweet.text=tweet.text.apply(lambda x : x.replace(i,j))

In [112]:
def replaced_nc(p):
    for k in p:
        replace_str(k,' ')
                
def replaced(l):
    for k in l:
        replace_str(k," ")


In [113]:
lst=['\n',
'\x80',
'\x81',
'\x82',
'\x83',
'\x84',
'\x85',
'\x86',
'\x87',
'\x88',
'\x89',
'\xa0',
'\xa1',
'\xa2',
'\xa3',
'\xa4',
'\xa5',
'\xa6',
'\xa7',
'\xa8',
'\xa9',
'\xab',
'\xaa',
'\xac',
'\xad',
'\xae',
'\xaf',
'\xb2',
'\xb1',
'\xb3',
'\xb4',
'\xb6',
'\xb5',
'\xb7',
'\xb9',
'\xd8',
'\xe2',
'\\n',
'\\x80',
'\\x81',
'\\x82',
'\\x83',
'\\x84',
'\\x85',
'\\x86',
'\\x87',
'\\x88',
'\\x89',
'\\xa0',
'\\xa1',
'\\xa2',
'\\xa3',
'\\xa4',
'\\xa5',
'\\xa6',
'\\xa7',
'\\xa8',
'\\xa9',
'\\xab',
'\\xaa',
'\\xac',
'\\xad',
'\\xae',
'\\xaf',
'\\xb2',
'\\xb1',
'\\xb3',
'\\xb4',
'\\xb6',
'\\xb5',
'\\xb7',
'\\xb9',
'\\xd8',
'\\xe2',
'\\xd9',
'\\x8a']

In [114]:
nc=['-','_','@','#','\n','\t','*','!','$','%','(',')','[',']','{','}',',','|','http','https','co']

In [115]:
replaced(lst)
replaced_nc(nc)

In [116]:
tweet.text.head()

0           BurganBank  BurganTips  s://t. /2ELGNXmpoN
1     BurganBank  BurganTips  CoronaTips  s://t. /Y...
2                                                     
3                                \x9f              ...
4     BurganBank  BurganTips  CoronaTips  s://t. /2...
Name: text, dtype: object

In [117]:
data = tweet.text.tolist()

In [118]:
data[:5]

[' BurganBank  BurganTips  s://t. /2ELGNXmpoN',
 ' BurganBank  BurganTips  CoronaTips  s://t. /YnHGRbWZG5',
 '                              ',
 '                            \\x9f                                                  \\xb0                  \\x8b                                                                     .   BurganBank     s://t. /NjUHdWHArz',
 ' BurganBank  BurganTips  CoronaTips  s://t. /28tSgXdKzK']

**This function will do the following**
- Convert a document into a list of tokens.
- lowercases, tokenizes, de-accents (optional)

In [119]:
def sent_to_words(sentences, deacc=True): # deacc=True removes punctuations
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))

In [120]:
data_words = list(sent_to_words(data))
print(data_words[3])

['xb', 'burganbank', 'njuhdwharz']


In [121]:
stop_words = stopwords.words('english') + list(string.punctuation)

In [122]:
#This will remove the stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
data_words_nostops = remove_stopwords(data_words)

In [123]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [124]:
nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_nostops)

In [125]:
print(data_lemmatized[0])

['burganbank', 'burgantip', 'elgnxmpon']


***Converting to Dictionary***

In [126]:
id2word = corpora.Dictionary(data_lemmatized)

In [130]:
print(id2word)

Dictionary(5157 unique tokens: ['burganbank', 'burgantip', 'elgnxmpon', 'coronatip', 'ynhgrbwzg']...)


In [138]:
corpus = [id2word.doc2bow(text) for text in data_lemmatized]
print(corpus[0:8])

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (1, 1), (3, 1), (4, 1)], [], [(0, 1), (5, 1), (6, 1)], [(0, 1), (1, 1), (3, 1), (7, 1)], [(0, 1), (8, 1), (9, 1), (10, 1)], [(6, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]]


In [139]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:8]]

[[('burganbank', 1), ('burgantip', 1), ('elgnxmpon', 1)],
 [('burganbank', 1), ('burgantip', 1), ('coronatip', 1), ('ynhgrbwzg', 1)],
 [],
 [('burganbank', 1), ('njuhdwharz', 1), ('xb', 1)],
 [('burganbank', 1), ('burgantip', 1), ('coronatip', 1), ('tsgxdkzk', 1)],
 [('burganbank', 1), ('burganservice', 1), ('vmlmp', 1), ('wvr', 1)],
 [('xb', 1),
  ('benefit', 1),
  ('offer', 1),
  ('pwrcd', 1),
  ('qiqduxqm', 1),
  ('register', 1)],
 [('burganoffer', 1),
  ('dis', 1),
  ('enjoy', 1),
  ('jx', 1),
  ('unt', 1),
  ('vdsov', 1)]]

***Implementing LDA model***

In [92]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [93]:
pprint.pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.069*"xc" + 0.015*"change" + 0.014*"burganbanktip" + 0.008*"day" + '
  '0.006*"csmurfs" + 0.006*"lose" + 0.006*"amp" + 0.005*"recruitment" + '
  '0.005*"world" + 0.005*"wt"'),
 (1,
  '0.196*"xdb" + 0.044*"xb" + 0.032*"youth" + 0.028*"burganoffer" + '
  '0.021*"step" + 0.018*"regard" + 0.018*"cbk" + 0.017*"file" + '
  '0.017*"mplaint" + 0.014*"customer"'),
 (2,
  '0.065*"bank" + 0.061*"burgan" + 0.041*"unt" + 0.037*"yawmi" + '
  '0.037*"winner" + 0.037*"congratulation" + 0.022*"kd" + 0.021*"draw" + '
  '0.021*"kuwait" + 0.017*"dis"'),
 (3,
  '0.022*"xf" + 0.021*"burganbank" + 0.015*"banking" + 0.014*"signature" + '
  '0.014*"xba" + 0.014*"travel" + 0.012*"card" + 0.012*"visa" + '
  '0.011*"service" + 0.006*"centralbank"'),
 (4,
  '0.063*"burganbank" + 0.019*"burganbankdraw" + 0.014*"tip" + '
  '0.014*"burganoffer" + 0.010*"know" + 0.009*"zayonal" + 0.008*"mention" + '
  '0.007*"aljohy" + 0.006*"summer" + 0.005*"burganbankcard"')]


In [94]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5512620760737328


In [None]:
#From the above we have got good Coherence Score (Anything more than 0.35)

In [95]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis