In [None]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.cluster import KMeansClusterer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import json
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pyLDAvis
import pyLDAvis.sklearn
import math
import copy
from wordcloud import WordCloud
import scipy
%matplotlib inline

# 1. Read articles from text file

In [None]:
# Location of the JSON dump of articles, relative to the notebook's working directory
filename = 'brexit_new.txt'
folder_name = 'Master_dataset'

# Decode explicitly as UTF-8 (the encoding JSON is specified in) instead of relying on the
# platform default, which would garble or reject non-ASCII characters such as '€' on some systems
with open(folder_name + '/' + filename, encoding='utf-8') as json_file:
    articles = json.load(json_file)

# Build the working DataFrame from the parsed JSON
df = pd.DataFrame(articles)
df

# 2. Text processing

In [None]:
# Shared WordNet lemmatizer instance; process_text uses it to lemmatize every token
lemmatizer = WordNetLemmatizer()

# A numeric token: optional '$' or '€', digits with optional comma-separated groups,
# an optional decimal point followed by optional digits, and at most one trailing ',' or '.'.
# The decimal point is escaped so that only a literal '.' is accepted there
# (an unescaped '.' would let tokens such as '5a' or '10%' pass as numbers).
pattern = re.compile(r'^[$€]?[0-9]+(,[0-9]+)*\.?[0-9]*[,.]?$')

# tests if a given term matches the reg ex above, which represents a number
def matchesNum(term):
    """Return True when `term` is a number according to `pattern`, otherwise False."""
    return pattern.match(term) is not None

# removes punctuation from a term
def removePunctuation(term):
    """Return `term` with every character found in string.punctuation deleted."""
    kept_chars = (ch for ch in term if ch not in string.punctuation)
    return ''.join(kept_chars)

# updates all numbers in a text file
def updateNumbers(text):
    """Rewrite every numeric token of `text` into a single alphanumeric word.

    The text is split on single spaces. A token recognised by matchesNum loses its
    commas, absorbs a following 'million'/'billion' (compared after punctuation
    removal), loses trailing periods, and has '.', '$' and '€' spelled out as
    'dot', 'dol' and 'eur'. All other tokens are passed through untouched.
    Example: "The company made $5.1 billion this year" becomes
    "The company made dol5dot1billion this year".
    """
    tokens = text.split(' ')
    total = len(tokens)
    drop_commas = str.maketrans('', '', ',')
    rewritten = []

    pos = 0
    while pos < total:
        token = tokens[pos]

        # non-numeric tokens are copied as they are
        if not matchesNum(token):
            rewritten.append(token)
            pos += 1
            continue

        number = token.translate(drop_commas)

        # glue a following magnitude word onto the number and skip over it
        if pos + 1 < total:
            magnitude = removePunctuation(tokens[pos + 1])
            if magnitude in ('million', 'billion'):
                number += magnitude
                pos += 1

        number = number.rstrip('.')
        for symbol, spelled in (('.', 'dot'), ('$', 'dol'), ('€', 'eur')):
            number = number.replace(symbol, spelled)
        rewritten.append(number)
        pos += 1

    return ' '.join(rewritten)

# process text using lemmatization
def process_text(text):
    """Normalise an article body into a space-separated string of lemmatized tokens.

    Steps, applied to every token in this order: rewrite numbers (updateNumbers),
    tokenize with nltk, lowercase, delete punctuation characters, drop English
    stopwords, lemmatize with the shared `lemmatizer`, and finally keep only
    purely alphanumeric tokens.
    """
    updated_text = updateNumbers(text)

    # Load the stopword list once per call and use a set for O(1) lookups; evaluating
    # stopwords.words('english') inside the filter would rebuild the list for every token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    words = []
    for token in nltk.word_tokenize(updated_text):
        token = token.lower().translate(strip_punctuation)
        if token in stop_words:
            continue
        token = lemmatizer.lemmatize(token)
        # drops tokens that became empty or still contain non-alphanumeric characters
        if token.isalnum():
            words.append(token)

    return " ".join(words)

# 2.a Process "body" of the article

In [None]:
# Clean every article body with process_text and store the result next to the raw text
df['body_processed'] = list(map(process_text, df['body']))
df

# 3. Sort articles in ascending order (oldest to newest)

In [None]:
# Parse the publication dates, index the frame by them (keeping the column) and order oldest-first
parsed_dates = pd.to_datetime(df['date'])
df = (
    df.assign(date=parsed_dates)
      .set_index('date', drop=False)
      .sort_index()
)

# 4. New Content Analyzer (NCA)

In [None]:
class NCA:
    """New Content Analyzer: decides whether the newest article brings a new topic.

    predict() fits up to `depth` nested LDA models. At each level every article is
    assigned its highest-weighted topic; if the last row of the DataFrame (the
    "unread" article) is alone in its topic it is reported as new content,
    otherwise the next level is fitted only on the articles sharing its topic.
    One dict per fitted level is kept in `self.models` for the inspection helpers.
    """

    # upper bound on the number of topics chosen automatically
    MAX_AUTO_TOPICS = 20

    def __init__(self,depth=1,init_topics=-1):
        self.depth = depth #maximum number of nested LDA levels to fit
        self.init_topics = init_topics #n of topics for the first lda. Set this if you have a priori knowledge of the topics present in the articles
        self.models = [] #one dict per fitted level, (re)filled by predict()

    # ------------------------------------------------------------------ helpers

    def _auto_topics(self,n_docs):
        """Number of topics used when none is given: ceil(n/3)+1, capped by n and MAX_AUTO_TOPICS."""
        return min(self.MAX_AUTO_TOPICS, min(math.ceil(n_docs/3) + 1, n_docs))

    def _resolve_depth(self,depth):
        """Translate a `depth` argument into a number of fitted levels that can safely be indexed."""
        fitted = len(self.models)
        if depth == -1:
            return fitted
        return min(depth, fitted)

    @staticmethod
    def _fit_lda(X_train,n_topics):
        """Fit the LDA configuration used throughout the class; return (model, doc-topic weights)."""
        lda = LatentDirichletAllocation(n_components=n_topics,random_state=0,max_iter=400,learning_method='batch')
        lda.fit(X_train)
        return lda, lda.transform(X_train)

    @staticmethod
    def _dominant_topics(transformed):
        """Highest-weighted topic of every document; ties go to the highest topic index."""
        weights = np.asarray(transformed)
        last = weights.shape[1] - 1
        # argmax over the reversed columns returns the LAST maximum of each row
        return last - np.argmax(weights[:, ::-1], axis=1)

    @staticmethod
    def _feature_names(vect):
        """Vocabulary of a fitted vectorizer, for both old and new scikit-learn APIs."""
        if hasattr(vect, 'get_feature_names_out'):
            return list(vect.get_feature_names_out())
        return vect.get_feature_names()

    @staticmethod
    def _top_ngram_indexes(lda_model,topic,n_top_tokens=20):
        """Column indexes of the `n_top_tokens` heaviest n-grams of `topic`, heaviest first."""
        return lda_model.components_[topic].argsort()[:-n_top_tokens - 1:-1]

    def _strip_stop_ngrams(self,current_topics,df,vect):
        """Return (reduced document-term matrix, names of the columns that were kept)."""
        X_train = vect.transform(df['body_processed'])
        lda, transformed = self._fit_lda(X_train, current_topics)

        # total weight of every topic over all documents
        topic_val = np.asarray(transformed).sum(axis=0)
        outlier_indexes = self.outlier_detection(topic_val)

        feature_names = self._feature_names(vect)
        keep = np.ones(len(feature_names), dtype=bool)
        # a boolean mask makes it harmless for two outlier topics to share a top n-gram
        for topic_index in outlier_indexes:
            keep[self._top_ngram_indexes(lda, topic_index)] = False

        kept_columns = np.flatnonzero(keep)
        # slice the sparse matrix directly instead of materialising a dense DataFrame
        X_train_processed = scipy.sparse.csr_matrix(X_train[:, kept_columns])
        kept_names = [feature_names[c] for c in kept_columns]
        return X_train_processed, kept_names

    # ------------------------------------------------------------------ model

    # returns True if unread article contains new content, otherwise returns False
    # "unread" article must be the last row of the dataframe (so if ordered by time, this would be the most recent article)
    def predict(self,df,include_stop_ngrams=True):
        """Fit the nested LDA models and tell whether the last row of `df` is new content.

        Parameters:
            df: DataFrame with a 'body_processed' text column; the last row is the "unread" article.
            include_stop_ngrams: when False, the top n-grams of outlier topics are removed
                from the document-term matrix before fitting (see remove_stop_ngrams).

        Returns:
            True as soon as, at some level, the unread article is the only one assigned
            to its topic; False if that never happens within `self.depth` levels.

        Raises:
            ValueError: if `df` has no rows.

        `self.models` is rebuilt on every call and `self.depth` is left untouched; the
        inspection helpers default to the number of levels actually fitted.
        """
        if df.shape[0] == 0:
            raise ValueError("predict() needs at least one article")

        # start from a clean slate so repeated calls do not mix models of different runs
        self.models = []

        current_topics = self.init_topics
        if current_topics == -1:
            current_topics = self._auto_topics(df.shape[0])

        for level in range(self.depth):
            #perform fitting and transforming
            vect = TfidfVectorizer(sublinear_tf=True,ngram_range=(1,3),max_df=0.5).fit(df['body_processed'])

            #remove/not remove top stop-ngrams
            # NOTE(review): the previous code assigned an unused local `include_stop_tokens = True`
            # here; it may have been meant to disable the removal on deeper levels - confirm intent.
            if include_stop_ngrams:
                X_train = vect.transform(df['body_processed'])
                feature_names = None #full vocabulary, read from `vect` on demand
            else:
                X_train, feature_names = self._strip_stop_ngrams(current_topics,df,vect)

            lda, transformed = self._fit_lda(X_train, current_topics)

            #save model
            self.models.append({
                'lda': lda,
                'transformed': transformed,
                'topics': current_topics,
                'vect': vect,
                'X_train': X_train,
                'df': df,
                'feature_names': feature_names,
            })

            #assign the highest-weighted topic as the main topic
            dominant = self._dominant_topics(transformed)
            subtopic = dominant[-1] #main topic assigned to the "unread" (last) article
            same_topic = np.flatnonzero(dominant == subtopic)

            #return true if the topic assigned to the "unread" article is unique across all "read" articles
            if len(same_topic) == 1:
                return True

            # `level` is the level counter itself (not a document index), so the data is
            # narrowed on every level except the last one
            if level != self.depth-1:
                df = df.iloc[same_topic] #reduce dataset to only articles assigned as the same topic as the "unread" article
                current_topics = self._auto_topics(df.shape[0])

        return False

    #removes the stop ngrams (i.e. most frequent ngrams), function not used in our final model
    def remove_stop_ngrams(self,current_topics,df,vect):
        """Return the document-term matrix of `df` without the top n-grams of outlier topics.

        An LDA with `current_topics` topics is fitted on the full matrix; topics whose
        summed weight is an upper IQR outlier are treated as "stop topics" and the
        columns of their top n-grams (see get_top_ngrams) are dropped.
        The result is a scipy CSR matrix with fewer columns than `vect`'s vocabulary.
        """
        X_train_processed, _ = self._strip_stop_ngrams(current_topics,df,vect)
        return X_train_processed

    #get top ngrams in topic
    def get_top_ngrams(self,lda_model, feature_names, topic,n_top_tokens=20):
        """Return the `n_top_tokens` heaviest n-grams of `topic`, heaviest first."""
        return [feature_names[i] for i in self._top_ngram_indexes(lda_model, topic, n_top_tokens)]

    #detect outlier based on IQR
    def outlier_detection(self,arr):
        """Return the indexes of the values of `arr` lying above Q3 + 1.5*IQR."""
        q1, q3 = np.percentile(arr, [25, 75])
        ceiling = q3 + 1.5*(q3 - q1)
        return [i for i, value in enumerate(arr) if value > ceiling]

    # ------------------------------------------------------------------ inspection

    # plot bar chart of topic distribution
    def stacked_barplot(self,depth=-1):
        """Draw, for each of the first `depth` levels, the topic weights stacked by article.

        depth=-1 (default) plots every fitted level.
        """
        for level in range(self._resolve_depth(depth)):
            current_model = self.models[level]
            transformed = np.asarray(current_model['transformed'])
            topics = current_model['topics']

            plt.figure()
            plt.xlabel('Topics')
            plt.ylabel('Topic Weight')
            plt.xticks(np.arange(topics)) #one tick per topic: 0 .. topics-1
            plt.title('Topic Distribution by Article at Depth ' + str(level+1))

            x = np.arange(topics)
            width = 0.5
            bottom = np.zeros(topics) #running height of the stack
            bars = []
            for weights in transformed:
                bars.append(plt.bar(x, weights, width, bottom=bottom))
                bottom = bottom + weights #new array, the fitted weights are never modified

            #this is to avoid having a big legend that would cover the graph
            if len(transformed) <= 15:
                plt.legend([p[0] for p in bars], list(range(1,len(transformed)+1)))

    # plot pyldavis
    def pyldavis(self,depth=-1):
        """Display a pyLDAvis panel for each of the first `depth` levels (-1: all fitted levels).

        NOTE(review): relies on `pyLDAvis.sklearn` and on the notebook-provided `display`;
        when predict() ran with include_stop_ngrams=False the stored matrix has fewer
        columns than the vectorizer vocabulary, so this call presumably fails - verify.
        """
        depth = self._resolve_depth(depth)

        pyLDAvis.enable_notebook()
        for level in range(depth):
            print("PyLDAvis at depth: " + str(level+1))
            current_model = self.models[level]
            panel = pyLDAvis.sklearn.prepare(current_model['lda'], current_model['X_train'], current_model['vect'], mds='tsne')
            display(panel)

    #get the topic assignment for each article
    def topic_distribution(self,depth=-1):
        """Print the main topic of every article for the first `depth` levels (-1: all fitted levels)."""
        depth = self._resolve_depth(depth)
        print("************************************************************")

        for level in range(depth):
            transformed = self.models[level]['transformed']
            print("Topic distribution at depth: " + str(level+1))
            print()

            for doc_index, topic in enumerate(self._dominant_topics(transformed)):
                print('max topic of doc ' + str(doc_index) + ' is: ' + str(topic))

            print("*********************************************************")

    #helper function from Prof. Vosoughi to print the word distribution
    def print_LDA_results(self,lda_model, feature_names, n_top_words=50):
        """Print, for every topic of `lda_model`, its `n_top_words` heaviest terms wrapped in '*'."""
        for topic_idx, topic in enumerate(lda_model.components_):
            top_indexes = topic.argsort()[:-n_top_words - 1:-1]
            message = "Topic %d: " % topic_idx
            message += " ".join("*" + feature_names[i] + "*" for i in top_indexes)
            print(message)
            print()

    #print the word distribution (in descending weight) for each topic
    def print_topics(self,depth=-1):
        """Print the top terms of every topic for the first `depth` levels (-1: all fitted levels)."""
        depth = self._resolve_depth(depth)
        print("************************************************************")
        for level in range(depth):
            print("Topics at depth: " + str(level+1))
            print()

            current_model = self.models[level]
            # use the reduced vocabulary when stop n-grams were removed, so that names and
            # LDA columns stay aligned
            feature_names = current_model.get('feature_names')
            if feature_names is None:
                feature_names = self._feature_names(current_model['vect'])
            self.print_LDA_results(current_model['lda'],feature_names)
            print("************************************************************")

    @staticmethod
    def _show_word_cloud(text):
        """Render one word cloud in its own figure; blank text is reported instead of crashing."""
        if not text.strip():
            print("(no words to display)")
            return
        plt.figure()
        plt.imshow(WordCloud().generate(text), interpolation='bilinear')
        plt.axis("off")
        plt.show()

    def word_cloud(self,depth=-1):
        """Show, per level, a word cloud of the "read" articles and one of the "unread" article.

        depth=-1 (default) covers every fitted level.
        """
        for level in range(self._resolve_depth(depth)):
            print("Word clouds at depth: " + str(level+1))
            texts = self.models[level]['df']['body_processed']

            #word cloud of all "read" articles
            self._show_word_cloud(" ".join(texts.iloc[:-1]))

            #word cloud of the "unread" article
            self._show_word_cloud(texts.iloc[-1])
            print("************************************************************")

# 5. Predict

In [None]:
# Fit a two-level analyzer on the time-ordered articles; the last row is the "unread" article
nca = NCA(depth=2)
has_new_content = nca.predict(df)
verdict = ('The unread article contains new content'
           if has_new_content
           else 'The unread article does not contain new content')
print(verdict)

In [None]:
# Stacked bar chart of per-article topic weights, one figure per fitted depth
nca.stacked_barplot()

In [None]:
# Print the highest-weighted topic of every article at each depth
nca.topic_distribution()

In [None]:
# Print the top-weighted n-grams of every topic at each depth
nca.print_topics()

In [None]:
# Word clouds of the "read" articles and of the "unread" article at each depth
nca.word_cloud()

In [None]:
# Interactive pyLDAvis panel for the LDA model of each depth
nca.pyldavis()