In [None]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.cluster import KMeansClusterer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import json
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pyLDAvis
import pyLDAvis.sklearn
%matplotlib inline

# 1. Read articles from text file

In [None]:
# Load the article collection (a JSON document stored with a .txt extension)
# and turn it into a DataFrame.
filename = '52_articles(working).txt'
folder_name = 'Local_dataset'
# Be explicit about the encoding: JSON text is UTF-8, while open() otherwise
# falls back to the platform's locale encoding, which can garble non-ASCII
# characters such as the euro sign handled later by the number pattern.
with open(folder_name + '/' + filename, encoding='utf-8') as json_file:
    articles = json.load(json_file)
df = pd.DataFrame(articles)
df

# 2. Text processing

In [None]:
# Module-level WordNet lemmatizer instance; process_text() calls
# lemmatizer.lemmatize() on every token.
lemmatizer = WordNetLemmatizer()
# A "number" token is: an optional leading $ or €, one or more digits with
# optional comma-separated digit groups, an optional decimal part, and at most
# one trailing comma or period (sentence punctuation glued to the token).
# The decimal point is escaped so that it matches a literal '.' only; left
# unescaped it would match any character and accept tokens such as '5m'.
pattern = re.compile(r'^[$€]?[0-9]+(,[0-9]+)*\.?[0-9]*[,.]?$')

def matchesNum(term):
    """Return True when `term` is a numeric token, False otherwise.

    Parameters
    ----------
    term : str
        A single whitespace-delimited token.

    Returns
    -------
    bool
        True if the whole token matches `pattern`.
    """
    return bool(pattern.match(term))

def removePunctuation(term):
    """Return `term` with every character of string.punctuation deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return term.translate(deletion_table)

def updateNumbers(text):
    """Rewrite numeric tokens of `text` into single alphanumeric-friendly words.

    The text is split on single spaces. Every token accepted by matchesNum()
    is rewritten as follows:
      * commas are removed;
      * if the next token, stripped of punctuation, is 'million' or 'billion',
        it is glued onto the number and consumed;
      * trailing periods are removed, remaining '.' become 'dot',
        '$' becomes 'dol' and '€' becomes 'eur'.
    All other tokens are kept unchanged. Tokens are re-joined with one space.
    """
    comma_remover = str.maketrans('', '', ',')
    tokens = text.split(' ')
    total = len(tokens)
    rewritten = []

    position = 0
    while position < total:
        token = tokens[position]
        position += 1  # `position` now points at the token following `token`

        if not matchesNum(token):
            rewritten.append(token)
            continue

        number_word = token.translate(comma_remover)
        if position < total:
            unit = removePunctuation(tokens[position])
            if unit in ('million', 'billion'):
                number_word += unit
                position += 1  # the unit token has been merged; skip it

        number_word = (
            number_word.rstrip('.')
            .replace('.', 'dot')
            .replace('$', 'dol')
            .replace('€', 'eur')
        )
        rewritten.append(number_word)

    return ' '.join(rewritten)

def process_text(text):
    """Normalize raw article text into a space-separated string of tokens.

    Steps, applied in this order:
      1. rewrite numeric expressions with updateNumbers();
      2. tokenize with nltk.word_tokenize();
      3. lowercase each token and delete string.punctuation characters;
      4. drop tokens found in NLTK's English stop-word list;
      5. lemmatize with the module-level `lemmatizer`;
      6. keep only tokens for which str.isalnum() is true.

    Parameters
    ----------
    text : str
        Raw text of one article.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Build the stop-word collection once per call. Evaluating
    # stopwords.words('english') inside the per-token filter would rebuild the
    # list for every token and search it linearly; a set gives O(1) lookups.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    punctuation_remover = str.maketrans('', '', string.punctuation)

    kept = []
    for token in nltk.word_tokenize(updateNumbers(text)):
        token = token.lower().translate(punctuation_remover)
        if token in stop_words:
            continue
        token = lemmatizer.lemmatize(token)
        # Tokens that were pure punctuation are now empty and fail isalnum().
        if token.isalnum():
            kept.append(token)
    return " ".join(kept)

# 2.a Process "body" of the article

In [None]:
# Store the normalized text of each article next to the raw 'body' column.
df['body_processed'] = df['body'].map(process_text)
df

# (Optional) Split articles (i.e. old articles vs new article)

In [None]:
# df_new =df.iloc[:1,:]
# df_old = df.iloc[1:,:]

# LDA

In [None]:
# Parse the publication dates, index the frame by them (keeping the 'date'
# column as well) and order the articles from oldest to newest.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date', drop=False).sort_index()
df

In [None]:
class NCA:
    """Nested LDA topic model that drills down around the last document.

    `fit` trains one TF-IDF + LDA model per level. Between levels the corpus
    is narrowed to the documents whose dominant topic equals the dominant
    topic of the LAST row of the frame, so the caller is expected to order the
    frame with the document of interest last (e.g. sorted by date ascending).

    Attributes
    ----------
    depth : number of nested levels to fit.
    init_topics : number of topics for the first level; -1 means
        ``int(n_documents / 3) + 1``.
    models : list with one dict per fitted level, with keys 'lda', 'transformed',
        'topics', 'vect' and 'X_train'.
    """

    def __init__(self,depth=1,init_topics=-1):
        self.depth = depth
        self.init_topics = init_topics
        self.models = []

    @staticmethod
    def _dominant_topic(weights):
        """Return the index of the largest weight (the last one on ties)."""
        best_index = -99
        best_weight = -99
        for topic, weight in enumerate(weights):
            # '>=' makes the highest index win when weights are tied.
            if weight >= best_weight:
                best_weight = weight
                best_index = topic
        return best_index

    def fit(self,df):
        """Fit `self.depth` nested LDA models on ``df['body_processed']``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain a 'body_processed' text column; its last row is the
            document the drill-down follows.

        Any models from a previous call are discarded, so `self.models` always
        holds exactly the levels of the latest fit.
        """
        # NOTE(review): a very small sub-corpus may be rejected by the
        # vectorizer because of max_df=0.7 - confirm on real data.
        current_topics = self.init_topics
        if current_topics == -1:
            current_topics = int(df.shape[0]/3) + 1

        self.models = []
        for level in range(self.depth):

            # perform fitting and transforming
            documents = df['body_processed']
            vect = TfidfVectorizer(sublinear_tf=True,ngram_range=(1,3),max_df=0.7).fit(documents)
            X_train = vect.transform(documents)
            lda = LatentDirichletAllocation(n_components=current_topics,random_state=0,max_iter=400,learning_method='batch')
            lda.fit(X_train)
            transformed = lda.transform(X_train)

            # save model
            self.models.append({
                'lda': lda,
                'transformed': transformed,
                'topics': current_topics,
                'vect': vect,
                'X_train': X_train,
            })

            # Narrow the corpus for the next level. The check uses the level
            # counter itself; no inner loop may reuse that variable.
            if level != self.depth - 1:
                dominant = [self._dominant_topic(row) for row in transformed]
                subtopic = dominant[-1]  # topic of the last document
                members = [doc for doc, topic in enumerate(dominant) if topic == subtopic]
                df = df.iloc[members]
                current_topics = int(df.shape[0]/3) + 1

    def stacked_barplot(self,depth=-1):
        """Draw, for each of the first `depth` levels, a stacked bar chart.

        The x axis is the topic index; every document contributes one segment
        per topic whose height is that document's weight for the topic.
        A legend (one entry per document) is added for at most 15 documents.
        `depth=-1` means all `self.depth` levels.
        """
        if depth == -1:
            depth = self.depth

        width = 0.5
        for level in range(depth):
            current_model = self.models[level]
            transformed = current_model['transformed']
            topics = current_model['topics']

            plt.figure()
            x = range(0, topics)
            # Running top of the stack. A fresh array is created on every
            # update so the stored 'transformed' rows are never modified.
            bottom = np.zeros(topics)
            bars = []
            for row in transformed:
                heights = np.asarray(row, dtype=float)
                bars.append(plt.bar(x, heights, width, bottom=bottom))
                bottom = bottom + heights

            plt.xlabel('Topic')
            plt.ylabel('Summed document-topic weight')
            plt.title('Topic weights per document (level ' + str(level + 1) + ')')
            if bars and len(bars) <= 15:
                articles_num = range(1, len(bars) + 1)
                plt.legend([p[0] for p in bars], list(articles_num))

    def pyldavis(self,depth=-1):
        """Render a pyLDAvis panel for each of the first `depth` levels.

        `depth=-1` means all `self.depth` levels.
        """
        # Local import: only needed here, and IPython is the runtime of this
        # notebook. A bare `panel` expression inside a method is not echoed by
        # the notebook, so each panel has to be displayed explicitly.
        from IPython.display import display

        if depth == -1:
            depth = self.depth

        pyLDAvis.enable_notebook()
        for level in range(depth):
            current_model = self.models[level]
            panel = pyLDAvis.sklearn.prepare(current_model['lda'], current_model['X_train'], current_model['vect'], mds='tsne')
            display(panel)

    def print_maxtopic(self,depth=-1):
        """Print the dominant topic of every document for the first `depth` levels.

        `depth=-1` means all `self.depth` levels. Document numbers are
        positions within that level's (possibly narrowed) corpus.
        """
        if depth == -1:
            depth = self.depth
        print("************************************************************")
        for level in range(depth):
            transformed = self.models[level]['transformed']
            for doc, weights in enumerate(transformed):
                print('max topic of doc ' + str(doc) + ' is: ' + str(self._dominant_topic(weights)))

            print("*********************************************************")
        
        

In [None]:
# Build a two-level nested topic model and fit it on the articles, which were
# sorted by date above so that the most recent article is the last row.
nca = NCA(depth=2)
nca.fit(df)

In [None]:
# Print, for every fitted level, the highest-weighted topic of each document.
nca.print_maxtopic()

In [None]:
# One figure per fitted level: topics on the x axis, document weights stacked.
nca.stacked_barplot()