In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy 

import pyLDAvis
import pyLDAvis.gensim

# Load the merged, cleaned news dataset and preview its first rows
DATA_PATH = "master_dataset/merged_cleaned.csv"
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,title,text,subject,date,class,text_without_stopwords,title_without_stopwords
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,Donald Trump wish Americans Happy New Year lea...,Donald Trump Sends Out Embarrassing New Year’s...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,House Intelligence Committee Chairman Devin Nu...,Drunk Bragging Trump Staffer Started Russian C...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,"On Friday, revealed former Milwaukee Sheriff D...",Sheriff David Clarke Becomes An Internet Joke ...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,"On Christmas day, Donald Trump announced would...",Trump Is So Obsessed He Even Has Obama’s Name ...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,Pope Francis used annual Christmas Day message...,Pope Francis Just Called Out Donald Trump Duri...


In [2]:
#text-based features
#word count

import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict, stopwords
from textblob import TextBlob
import syllables

# Fetch the NLTK resources requested by this notebook
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    The value is converted to ``str`` and lower-cased first. The placeholder
    "no title" counts as zero words.

    Words are split on any run of whitespace, so repeated, leading or trailing
    spaces (and newlines/tabs) do not inflate the count, and an empty string
    has zero words. This matches how ``average_word_length`` splits words.
    """
    text = str(text).lower().replace("\r\n", ' ')
    if text == "no title":
        return 0
    return len(text.split())

def sentence_count(text):
    """Return how many sentences ``nltk.sent_tokenize`` finds in ``text``."""
    return len(nltk.sent_tokenize(text))

def average_word_length(text):
    """Return the mean character length of the whitespace-separated words.

    Returns 0 when ``text`` contains no words.
    """
    tokens = text.split()
    if not tokens:
        return 0
    total_characters = sum(map(len, tokens))
    return total_characters / len(tokens)
        
def punctuation_count(text):
    """Return the number of characters of ``text`` found in ``string.punctuation``."""
    return sum(1 for character in text if character in string.punctuation)

def stopword_count(text):
    """Count the whitespace-separated tokens of ``text`` that are English stopwords.

    Tokens are compared to the NLTK English stopword list by exact,
    case-sensitive match (no lower-casing or punctuation stripping is done).
    NOTE(review): the NLTK list appears to be lower-case, so capitalised
    stopwords such as "The" are presumably not counted - confirm this is intended.
    """
    # Load the stopword list once and cache it on the function as a frozenset.
    # Re-reading the corpus and scanning a list for every token is needlessly
    # slow when this function is applied to every row of a DataFrame.
    english_stopwords = getattr(stopword_count, "_english_stopwords", None)
    if english_stopwords is None:
        english_stopwords = frozenset(stopwords.words('english'))
        stopword_count._english_stopwords = english_stopwords
    return sum(1 for word in text.split() if word in english_stopwords)

#change data type to string
for source_column in ('text', 'title'):
    data[source_column] = data[source_column].astype(str)

# (new column, source column, feature function), listed in creation order
feature_specs = [
    ('text_word_count', 'text', word_count),
    ('title_word_count', 'title', word_count),
    ('text_sentence_count', 'text', sentence_count),
    ('title_sentence_count', 'title', sentence_count),
    ('text_average_word_length', 'text', average_word_length),
    ('title_average_word_length', 'title', average_word_length),
    ('text_punctuation_count', 'text', punctuation_count),
    ('title_punctuation_count', 'title', punctuation_count),
    ('text_stopwords_count', 'text', stopword_count),
    ('title_stopwords_count', 'title', stopword_count),
]
for feature_column, source_column, feature_function in feature_specs:
    data[feature_column] = data[source_column].apply(feature_function)

ModuleNotFoundError: No module named 'textblob'

In [None]:
# Display the DataFrame to inspect the feature columns added above
data

### Flesch Readability

In [None]:
#Calculating number of syllables in a word
def nsyl(word):
    """Return the syllable count that ``syllables.estimate`` gives for ``word``."""
    estimated_syllables = syllables.estimate(word)
    return estimated_syllables

#Calculating number of syllables in a text
def syllables_text(text):
    """Return the sum of ``nsyl`` over every token ``word_tokenize`` yields for ``text``."""
    return sum(nsyl(token) for token in word_tokenize(text))

# Store the syllable total of each article's text as a new column
data['syllables'] = data['text'].apply(syllables_text)

In [None]:
# Flesch Reading Ease metric
def flesch_formula(word_count, sent_count, syllable_count):
    """Return the Flesch Reading Ease score for the given counts.

    score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)

    Returns 0 when either ``sent_count`` or ``word_count`` is 0, because the
    score is undefined without sentences or words. This avoids dividing by zero.
    """
    if sent_count == 0 or word_count == 0:
        return 0
    words_per_sentence = word_count / sent_count
    syllables_per_word = syllable_count / word_count
    return 206.835 - 1.015*words_per_sentence - 84.6*syllables_per_word
    
# Get flesch readability
raw_flesch = data.apply(
    lambda row: flesch_formula(row['text_word_count'], row['text_sentence_count'], row['syllables']),
    axis=1,
)
# Standardise the scores (subtract the mean, divide by the standard deviation)
data['flesch_readability'] = (raw_flesch - raw_flesch.mean()) / raw_flesch.std()

In [None]:
# Calculate subjectivity
def getSubjectivity(text):
    """Return the ``subjectivity`` field of the TextBlob sentiment of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
  
# Calculate polarity
def getPolarity(text):
    """Return the ``polarity`` field of the TextBlob sentiment of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
  
#Get subjectivity and polarity
for sentiment_column, sentiment_function in (('subjectivity', getSubjectivity), ('polarity', getPolarity)):
    data[sentiment_column] = data['text'].apply(sentiment_function)

### Topic Modelling

In [None]:
import re

# Strip commas, periods, exclamation marks and question marks, then lower-case.
# The pattern is a raw string: '\.' inside a normal string literal is an invalid
# escape sequence, which Python warns about and will eventually reject.
punctuation_pattern = re.compile(r'[,\.!?]')
for column in ('text_without_stopwords', 'title_without_stopwords'):
    data[column] = data[column].map(lambda x: punctuation_pattern.sub('', x).lower())

# Title and body are joined so that both contribute to the topic model
data['overall_content'] = data['title_without_stopwords'] + ' ' + data['text_without_stopwords']

#### Next, we create a Python function to lemmatize the content in our news.

In [None]:
def lemmatize(text, allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize each document, keeping only tokens whose POS tag is allowed.

    By default only nouns, adjectives, verbs and adverbs are kept, as these are
    the POS tags which give the text most of its contextual meaning.

    Parameters
    ----------
    text : iterable of str
        The documents to lemmatize.
    allowed_postags : list of str
        Coarse POS tags (``token.pos_``) to keep. The default list is never
        mutated here.

    Returns
    -------
    list of str
        One string per input document, made of the kept lemmas joined by
        single spaces, in the same order as the input.
    """
    nlp = spacy.load("en_core_web_sm", exclude = ["parser", "ner"])
    output = []
    # nlp.pipe processes the documents as a batched stream and yields them in
    # input order; this is much faster than calling nlp() once per document.
    for contents in nlp.pipe(text):
        kept_lemmas = [word.lemma_ for word in contents if word.pos_ in allowed_postags]
        output.append(" ".join(kept_lemmas))
    return output

# Lemmatize the combined title + body text of every article
texts = data['overall_content'].tolist()
output = lemmatize(texts)

Next, we implement a preprocessing function to tokenize our text. 
We will do this by implementing an iterative function which uses the simple_preprocess function from the gensim library.

In [None]:
def preprocess(text):
    """Return a list holding ``gensim.utils.simple_preprocess(article)`` for each article in ``text``."""
    return [gensim.utils.simple_preprocess(article) for article in text]
# Tokenize the lemmatized articles: one token list per article
all_words = preprocess(output)

To improve the results of our topic modelling later on, we also identify bigrams and trigrams, i.e. phrases of two and three words respectively that commonly appear together in our text. 

In [None]:
# min_count and threshold are hyperparameters we can tune later
bigram_phrases = gensim.models.Phrases(all_words, min_count = 5, threshold = 50)
# The trigram model is trained on the bigram-merged corpus, so that a detected
# bigram can be joined with a neighbouring word.
# (The variable keeps its original spelling, "trigram_phases".)
trigram_phases = gensim.models.Phrases(bigram_phrases[all_words], min_count = 5, threshold = 50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
# Wrap the trigram model here. Wrapping bigram_phrases again would make
# `trigram` a second copy of `bigram`, so no trigrams would ever be produced.
trigram = gensim.models.phrases.Phraser(trigram_phases)

In [None]:
# Apply the phrase models to every tokenized article
new_corpus_bigrams = [bigram[tokens] for tokens in all_words]
new_corpus_trigrams = [trigram[bigram[tokens]] for tokens in all_words]

# Print the index of every article whose trigram tokens differ from its bigram tokens
for article_index, (bigram_tokens, trigram_tokens) in enumerate(zip(new_corpus_bigrams, new_corpus_trigrams)):
    if set(bigram_tokens) != set(trigram_tokens):
        print(article_index)

From the check above, it currently appears that only one article among all our texts contains trigrams. However, since the trigram corpus also retains every bigram found by the bigram model, we will use the trigram corpus to generate our corpus for topic modelling.

Following this, we generate the corpus that we will use for our topic modelling. 

In [None]:
# Map every token in the trigram corpus to an integer id
id2word = corpora.Dictionary(new_corpus_trigrams)

# corpus is a list of lists: for each document, doc2bow counts the occurrence of each word
corpus = [id2word.doc2bow(document_tokens) for document_tokens in new_corpus_trigrams]

In addition, we also filter out low-value words in our corpus, which add little meaning to the text, using the TF-IDF statistical measure.

In [None]:
# TfidfModel is referenced through the gensim package because the bare name is
# never imported in this notebook.
tfidf = gensim.models.TfidfModel(corpus, id2word = id2word)

low_value = 0.05 #set a TF-IDF score of 0.05 as a threshold
# NOTE: this rewrites `corpus` in place, so re-running the cell filters the
# already-filtered corpus again.
for i, bow in enumerate(corpus):
    # One lookup table per document: term id -> TF-IDF score
    tfidf_scores = dict(tfidf[bow])
    # Keep a term only if it has a TF-IDF score and that score reaches the
    # threshold. Terms missing from the TF-IDF output (score 0) are dropped.
    corpus[i] = [entry for entry in bow if tfidf_scores.get(entry[0], 0.0) >= low_value]

In [None]:
%store corpus 
%store id2word

We are almost ready to generate our LDA model. However, we first need to determine the number of topics to train our model on. To determine this, we will use the coherence score metric. Therefore, we first implement a Python function that will allow us to determine a coherence score for a given LDA model. 

In [None]:
def calculate_coherence_score(n):
    """Train an LDA model with ``n`` topics and return its 'u_mass' coherence.

    The model is trained on the notebook-level ``corpus`` and ``id2word``.
    """
    lda_settings = dict(
        corpus = corpus,
        id2word = id2word,
        num_topics = n,
        random_state = 4222,
        update_every = 1,
        chunksize = 2000,
        passes = 10,
        alpha = "auto",
    )
    topic_model = gensim.models.ldamodel.LdaModel(**lda_settings)
    evaluator = CoherenceModel(model = topic_model, corpus = corpus, dictionary = id2word, coherence = 'u_mass')
    return evaluator.get_coherence()

A list of possible number of topics is given below under topics_list. In addition, we will also store the coherence scores in a list for data visualisation purposes later. 

In [None]:
# Candidate numbers of topics: 3 to 10 inclusive
topics_list = list(range(3, 11))
scores = []

# Train one model per candidate and record its coherence score
for n in topics_list:
    coherence_score = calculate_coherence_score(n)
    print(f"n : {n} ; Coherence Score : {coherence_score}")
    scores.append(coherence_score)

In [None]:
# Coherence score (y) against number of topics (x), one tick per candidate
x, y = np.array(topics_list), np.array(scores)

plt.plot(x, y)
plt.xticks(x)
plt.show()

From the plot above, the coherence score is the highest when there are 5 topics. Hence, we will train our LDA model by setting num_topics = 5.

In [None]:
# Final LDA model with 5 topics
lda_params = dict(
    corpus = corpus,
    num_topics = 5,
    id2word = id2word,
    chunksize = 2000,
    passes = 10,
    update_every = 1,
    alpha = 'auto',
    random_state = 4222,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

%store lda_model

In [None]:
import numpy as np

# Matrix with one row per document and one column per topic. Each
# (topic_id, probability) pair is written by topic id, so a document whose
# distribution comes back with fewer entries than topics (gensim may omit
# near-zero topics - TODO confirm) simply keeps 0 for the missing ones.
num_topics = lda_model.num_topics
topic_probabilities = np.zeros((len(corpus), num_topics))
for doc_index, bow in enumerate(corpus):
    for topic_id, probability in lda_model.get_document_topics(bow, minimum_probability = 0.0):
        topic_probabilities[doc_index, topic_id] = probability

# One column per topic, numbered from 1: 'Topic 1 Probability', 'Topic 2 Probability', ...
# (This also fixes the former misspelled column name 'Topic 3 Probbility'.)
for topic_id in range(num_topics):
    data[f'Topic {topic_id + 1} Probability'] = topic_probabilities[:, topic_id]

In [None]:
# Write the newly cleaned dataframe in new file
# NOTE(review): the polarity one-hot columns are created in a later cell, so
# they are not part of this file - confirm that this is intended.
data.to_csv("processed_data.csv", index=False) # Dataset with text feature

In [None]:
# Categorise polarity and do one-hot encoding
# Bins: [-1, -0.05] Negative, (-0.05, 0.05] Neutral, (0.05, 1] Positive.
# include_lowest=True closes the first bin on the left; without it a polarity
# of exactly -1 falls outside every bin and becomes NaN.
data['polarity_category'] = pd.cut(
    x=data['polarity'],
    bins=[-1,-0.05,0.05,1],
    labels=['Negative', 'Neutral', 'Positive'],
    include_lowest=True,
)
# drop_first=True drops the 'Negative' dummy, which becomes the baseline category
dummy = pd.get_dummies(data['polarity_category'], prefix='polarity_category', drop_first=True)
# Drop any dummy columns left by a previous run, so that re-executing this cell
# does not duplicate them.
data = pd.concat([data.drop(columns=dummy.columns, errors='ignore'), dummy], axis=1)