In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import json

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('words')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

pd.set_option('display.max_colwidth', -1)

[nltk_data] Downloading package stopwords to /home/aamir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/aamir/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
df = pd.read_csv('/home/aamir/datasets/brexit.csv', header=0, encoding='utf-8')

In [3]:
# drop duplicate tweets
df.drop_duplicates(inplace=True)

In [4]:
df.shape

(24820, 1)

In [5]:
# there are some parts of the tweets that in fact does not help us to analyze its sentiment, 
# like URLs, some other user_ids, numbers, etc

def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)        
    return input_txt

def clean_tweets(lst):
    # remove twitter Return handles (RT @xxx:)
    lst = np.vectorize(remove_pattern)(lst, "RT @[\w]*:")
    # remove twitter handles (@xxx)
    lst = np.vectorize(remove_pattern)(lst, "@[\w]*")
    # remove URL links (httpxxx)
    lst = np.vectorize(remove_pattern)(lst, "https?://[A-Za-z0-9./]*")
    # remove special characters, numbers, punctuations (except for #)
    lst = np.core.defchararray.replace(lst, "[^a-zA-Z#]", " ")

    return lst

In [6]:
df['tweet'] =  clean_tweets(df['tweet'])

In [7]:
# Convert to list
data = df.tweet.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove newline character \n or \\n
data = [re.sub(r"(?<!\\)\\n|\n", ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

# Removing string of length 1
data = [re.sub(r'(?:\b\w{,1}\s|\s\w{,1}\b|\b\w{,1}\b)', ' ', sent) for sent in data]

# Remove white spaces characters
data = [re.sub('\s+', ' ', sent) for sent in data]

pprint(data[:3])

[' just published "# TheStorm : How to prepare for global corruption purge ?" '
 '# FutureOfComms # MAGA …',
 ' If you want to repost this elsewhere , feel free . Hopefully it annoys the '
 'right people . ',
 ' So , is the # Johnson bluster about # NoDealBrexit simply to get MPs to '
 'accept instead bad - deal # Brexit ? He wants to bo …']


In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:2])

[['just', 'published', 'thestorm', 'how', 'to', 'prepare', 'for', 'global', 'corruption', 'purge', 'futureofcomms', 'maga'], ['if', 'you', 'want', 'to', 'repost', 'this', 'elsewhere', 'feel', 'free', 'hopefully', 'it', 'annoys', 'the', 'right', 'people']]


In [9]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['just', 'published', 'thestorm', 'how', 'to', 'prepare', 'for', 'global', 'corruption', 'purge', 'futureofcomms', 'maga']


In [10]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'nan', 'amp', 'xa', 'xb', 'xc', 'xf', 'xe', 'co'])

In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

[['publish', 'thestorm', 'prepare', 'global', 'corruption', 'purge', 'futureofcomms', 'maga'], ['want', 'repost', 'elsewhere', 'feel', 'free', 'hopefully', 'annoy', 'right', 'people']]


In [13]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


In [14]:
# Topic modelling-NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition

In [15]:
outlst = [' '.join([str(c) for c in lst]) for lst in texts]

In [16]:
makeitastring = ''.join(map(str, outlst))

In [17]:
vectorizer = TfidfVectorizer()

In [18]:
word_text = makeitastring.split()

In [19]:
dtm = vectorizer.fit_transform(word_text)

In [20]:
vocab = vectorizer.get_feature_names()

In [21]:
# change n_components to desired number of topics
clf = decomposition.NMF(n_components=10,random_state=1)

In [22]:
topic = clf.fit_transform(dtm)

In [23]:
topic_words = []
for topic in clf.components_:
    word = np.argsort(topic)[::-1][0:20]
    topic_words.append([vocab[i] for i in word])

In [24]:
for i in range (len(topic_words)):
    print("Topic {}: {} \n".format(i,' '.join(topic_words[i])))

Topic 0: brexit ｅｕ離脱を選択 explainlate explanationstill explanationbe explanation explainunited_kingdom explaintake explainproud explainocff explainleicester explainintoday explica explaininlisten explaininbrexit explaingerman explainfront explainfeednavigator explainer explainbrexit 

Topic 1: uk make would eu be el brexitbrexit remain party good country government stop year may trade could back business call 

Topic 2: vote go do make would want referendum think brexitbrexit take party see country government stop mean year news trade work 

Topic 3: not would britain be think el know take new time come stop mean news day europe back happen world tell 

Topic 4: deal make britain eu referendum be el know brexitbrexit british remain party good time need stop come mean year may 

Topic 5: leave make eu be el british know remain party good new government come stop year may trade day business happen 

Topic 6: say make britain eu referendum el british remain party see good time need country 