# LDA topic modelling

[guide](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)

In [10]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# Logging (optional): records are timestamped, but the root level is ERROR,
# so anything below ERROR is hidden from the notebook output.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Silence DeprecationWarning globally to keep cell output readable.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NLTK English stop words, read by remove_stopwords() further down.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus downloaded
# beforehand (nltk.download('stopwords')) — confirm on a fresh environment.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [11]:
# Directory holding the input files, relative to the notebook's working directory.
# The trailing slash matters: file paths are built by plain string concatenation.
DATA_PATH = '../data/'

In [12]:
!ls {DATA_PATH}

realDonaldTrump.xlsx text.csv


In [13]:
# Load the tweets. The file is read without a header row, so pandas labels the
# columns 0 and 1; column 0 is discarded and column 1 (the tweet text) is kept.
# `drop(columns=...)` replaces the positional `axis` argument, which recent
# pandas releases no longer accept, and avoids mutating the frame in place.
text = pd.read_csv(DATA_PATH + 'text.csv', header=None).drop(columns=[0])
text.head()

Unnamed: 0,1
0,Gina Haspel is one step closer to leading our ...
1,...and voted against the massive Tax Cut Bill....
2,Lou Barletta will be a great Senator for Penns...
3,"Today, it was my great honor to welcome Presid..."
4,House votes today on Choice/MISSION Act. Who w...


In [14]:
# Convert the tweet column to a plain Python list
data = text[1].values.tolist()

# Optional: strip single quotes (currently disabled)
# data = [re.sub("\'", "", sent) for sent in data]

# Remove URLs.
# The first three substitutions re-join links that contain a stray space
# ("https:// t.co/x", "www. site.com") and drop the "…" character, so the two
# removal patterns below can match each link as a single whitespace-free token.
data = [re.sub("// ", "//", sent) for sent in data]
data = [re.sub("…", "", sent) for sent in data]
data = [re.sub(r"www\. ", "www.", sent) for sent in data]
# The dot is escaped so only a literal "pic." prefix (e.g. pic.twitter.com/...)
# is removed; with an unescaped "." the pattern also swallowed ordinary words
# such as "picture" or the tail of "epic battle".
data = [re.sub(r"pic\.\S+", "", sent) for sent in data]
data = [re.sub(r"http\S+", "", sent) for sent in data]

# Optional: remove @mentions (currently disabled)
# data = [re.sub(r"@\S+", "", sent) for sent in data]

# Spot-check one cleaned tweet
pprint(data[1000])

('The Chinese Envoy, who just returned from North Korea, seems to have had no '
 'impact on Little Rocket Man. Hard to believe his people, and the military, '
 'put up with living in such horrible conditions. Russia and China condemned '
 'the launch.')


In [15]:
# Tokenize words and Clean-up text
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Every item is coerced to ``str`` and then tokenized with ``deacc=True``,
    which strips accent marks from the tokens.

    Yields one list of tokens per input document, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_doc in sentences:
        yield tokenize(str(raw_doc), deacc=True)

# Materialise the generator so the tokenized corpus can be indexed and reused
data_words = list(sent_to_words(data))

# Spot-check one tokenized tweet
print(data_words[1000])

['the', 'chinese', 'envoy', 'who', 'just', 'returned', 'from', 'north', 'korea', 'seems', 'to', 'have', 'had', 'no', 'impact', 'on', 'little', 'rocket', 'man', 'hard', 'to', 'believe', 'his', 'people', 'and', 'the', 'military', 'put', 'up', 'with', 'living', 'in', 'such', 'horrible', 'conditions', 'russia', 'and', 'china', 'condemned', 'the', 'launch']


In [16]:
# Build the bigram model from the tokenized tweets.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10)

# Wrap the trained model in a Phraser: a faster way to apply it to a document.
# Only a bigram model is built in the cells shown here (no trigram model).
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [17]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is run through ``simple_preprocess(str(doc))`` and any
    resulting token found in the module-level ``stop_words`` is discarded.

    Returns a list with one token list per input document, in input order.
    """
    # Snapshot the stop words into a set once per call: membership tests become
    # O(1) instead of scanning the whole list for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to each document.

    Returns a list holding ``bigram_mod[doc]`` for every ``doc`` in ``texts``,
    in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of token lists. Each list is joined with single spaces
            and parsed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between calls.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.

    Tag reference: https://spacy.io/api/annotation
    """
    texts_out = []
    for sent in texts:
        # Re-join the tokens so spaCy can tag them in sentence context.
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [18]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the spaCy 'en' model into the global `nlp` that lemmatization() reads.
# NOTE(review): the previous comment said only the tagger component is kept,
# but this call passes no option that disables other components — confirm intent.
# The model must be installed first; the recorded run of this cell failed with
# "Model 'en' not installed" (install hint from the error: python -m spacy.en.download).
nlp = spacy.load('en')

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the first lemmatized document as a sanity check
print(data_lemmatized[:1])

RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.

In [20]:
# Download the spaCy English model that spacy.load('en') needs.
# The recorded run failed while opening the HTTP connection (see the traceback
# in this cell's output), so the model was still missing afterwards.
!python -m spacy.en.download

Traceback (most recent call last):
  File "/Users/beiming/anaconda3/lib/python3.6/urllib/request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/Users/beiming/anaconda3/lib/python3.6/http/client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/Users/beiming/anaconda3/lib/python3.6/http/client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/Users/beiming/anaconda3/lib/python3.6/http/client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/Users/beiming/anaconda3/lib/python3.6/http/client.py", line 1026, in _send_output
    self.send(msg)
  File "/Users/beiming/anaconda3/lib/python3.6/http/client.py", line 964, in send
    self.connect()
  File "/Users/beiming/anaconda3/lib/python3.6/http/client.py", line 1392, in connect
    super().connect()
  File "/Users/beiming/anaco

In [85]:
# Retry loading the 'en' model, this time requesting the vectors named
# 'en_glove_cc_300_1m_vectors'. The recorded run failed with the same
# "Model 'en' not installed" error as the earlier spacy.load('en') call.
en_nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')

RuntimeError: Model 'en' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.

In [86]:
from spacy.en import English

In [87]:
# Build the English pipeline directly instead of going through spacy.load().
# On success this would rebind the global `nlp` that lemmatization() reads;
# the recorded run still failed because the model data is not installed.
nlp = English()

RuntimeError: Model 'en>=1.1.0,<1.2.0' not installed. Please run 'python -m spacy.en.download' to install latest compatible model.