In [None]:
#import libraries

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import re
import glob
import os
import string
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt
from models import dynamic_lda

# Shared NLP helpers used by the preprocessing cells below.
# NOTE: both the lemmatizer and the stop-word list rely on NLTK data packages
# ('wordnet' and 'stopwords'); install them with nltk.download(...) on a fresh machine.
lemmatizer = WordNetLemmatizer()
# NOTE(review): `tokenizer` is not referenced by any cell visible in this notebook.
tokenizer=RegexpTokenizer(r'\w+')
# English stop words as a plain list; read by remove_stopwords().
stop_words = stopwords.words('english')


In [None]:
#global variables

# Directory that data_construction() scans for "*.csv" files.
# NOTE(review): absolute, machine-specific path - change it before running elsewhere.
dataset_dir = "D:/Masters/4/Probabilistic Graphical Models/data/" #set the directory of dataset

# Read as a global by data_construction(): the longest articles of each time slice are kept.
num_articles = 10 #Maximum number of news articles kept from each time slice

# True -> one time slice per year file; False -> one time slice per group of ten files.
yearwise = True #Select between yearwise or decadewise analysis (original note: yearwise is faster than decadewise)

# start_year / end_year are read as globals by data_construction() and only
# take effect when yearwise is True; the decadewise branch uses every CSV file found.
start_year = 2016 #choose start year in range of years to be analysed

end_year = 2018 #choose end year (both inclusive range)

# Passed to the dynamic LDA model constructor.
num_topics = 5 #number of topics to be found in the dataset

#increasing the num_articles and year range will increase model training time

In [None]:
#construct dataframe from csv files

def data_construction(dataset_dir, yearwise, first_year=None, last_year=None, articles_per_slice=None):
    """Load the CSV files in ``dataset_dir`` and keep the longest articles per time slice.

    Every ``*.csv`` file is read with its first column as the index. A column
    named ``sentence`` is renamed to ``article``; duplicate rows are dropped and
    the rows with the longest ``article`` text are kept for each slice.

    Parameters
    ----------
    dataset_dir : str
        Directory that is searched (non-recursively) for ``*.csv`` files.
    yearwise : bool
        If True, every file whose *file name* contains one of the requested
        years forms one time slice. If False, the sorted files are grouped in
        chunks of ten and every chunk forms one time slice (the year range is
        not used in this mode).
    first_year, last_year : int, optional
        Inclusive year range for the yearwise mode. Default to the notebook
        globals ``start_year`` and ``end_year``.
    articles_per_slice : int, optional
        Maximum number of articles kept per slice. Defaults to the notebook
        global ``num_articles``.

    Returns
    -------
    (pandas.DataFrame, list of int)
        The concatenated articles with a fresh 0..n-1 index, and the number of
        articles actually contributed by each time slice, in slice order. The
        counts always sum to ``len(df)``.

    Raises
    ------
    ValueError
        If no CSV file contributes a time slice.
    """
    # Fall back to the notebook-level settings so existing two-argument calls keep working.
    if articles_per_slice is None:
        articles_per_slice = num_articles

    def _longest_articles(frame):
        # Normalise the text column name, drop duplicate rows, keep the longest articles.
        frame = frame.rename({'sentence': 'article'}, axis=1).drop_duplicates()
        frame = frame.sort_values(by="article", key=lambda col: col.str.len(),
                                  ignore_index=True, ascending=False)
        return frame[:articles_per_slice]

    # glob returns files in arbitrary order; sort so slices follow file-name order
    # (chronological as long as the file names sort by year).
    year_files = sorted(glob.glob(os.path.join(dataset_dir, "*.csv")))

    slices = []
    if yearwise:
        if first_year is None:
            first_year = start_year
        if last_year is None:
            last_year = end_year
        years = [str(year) for year in range(first_year, last_year + 1)]

        for path in year_files:
            # Match on the file name only: a year appearing in a parent
            # directory name must not select every file.
            file_name = os.path.basename(path)
            if any(year in file_name for year in years):
                slices.append(_longest_articles(pd.read_csv(path, index_col=0)))
    else:
        for first in range(0, len(year_files), 10):
            decade_frames = [pd.read_csv(path, index_col=0)
                             for path in year_files[first:first + 10]]
            slices.append(_longest_articles(pd.concat(decade_frames, ignore_index=True)))

    if not slices:
        raise ValueError("No matching CSV files found in %r" % (dataset_dir,))

    # Record the real size of every slice: a slice may hold fewer rows than
    # requested, and the model needs counts that line up with the corpus.
    articles_per_time = [len(frame) for frame in slices]

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df = pd.concat(slices, ignore_index=True)
    return df, articles_per_time

# Build the article DataFrame and the per-time-slice article counts from the configured dataset.
df,articles_per_time = data_construction(dataset_dir,yearwise)
# Print a summary of the loaded frame (columns, dtypes, non-null counts) as a sanity check.
df.info()

In [None]:
#helper functions for data preprocessing

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` into a list of words.

    Every item is converted with ``str()`` and passed through gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded per item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

def remove_stopwords(texts):
    """Return one token list per document with the stop words removed.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` (so both raw strings and token lists are accepted),
    then every token found in the module-level ``stop_words`` is dropped.
    """
    # Build the lookup set once: testing membership in the stop-word list
    # would rescan the whole list for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def lemmatization(texts):
    """Lemmatize every token of every document with the shared ``lemmatizer``.

    ``texts`` is an iterable of token lists; the result is a list with one
    list of lemmatized tokens per input document, in the same order.
    """
    return [[lemmatizer.lemmatize(token) for token in doc] for doc in texts]

In [None]:
#data preprocessing

def data_preprocessing(df):
    """Turn the ``article`` column of ``df`` into lemmatized token lists.

    Steps: strip e-mail-like tokens, collapse whitespace, remove single
    quotes, tokenize, drop stop words, merge frequent bigrams/trigrams with
    gensim ``Phrases`` models, and lemmatize.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``article`` column holding the document texts.

    Returns
    -------
    list of list of str
        One list of processed tokens per row of ``df``, in row order.
    """
    # Convert to list; missing values become empty documents instead of
    # crashing re.sub, so the document count still matches the input rows.
    data = df.article.fillna("").astype(str).tolist()

    # Raw-string patterns: '\S' and '\s' in normal string literals are invalid
    # escape sequences and raise warnings on current Python versions.

    # Remove Emails
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

    # Remove new line characters (collapses any whitespace run to one space)
    data = [re.sub(r'\s+', ' ', sent) for sent in data]

    # Remove distracting single quotes
    data = [re.sub(r"'", "", sent) for sent in data]

    data_words = list(sent_to_words(data))

    # Phrase models are trained on the full token stream (stop words included).
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    data_words_nostops = remove_stopwords(data_words)

    data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]
    # The documents already went through the bigram model above, so only the
    # trigram model is applied here (the original re-applied bigram_mod first).
    data_words_trigrams = [trigram_mod[doc] for doc in data_words_bigrams]

    data_lemmatized = lemmatization(data_words_trigrams)

    return data_lemmatized

# Run the full preprocessing pipeline on the loaded articles.
preprocessed_data = data_preprocessing(df)
# Display the processed tokens of the first article as a quick visual check.
preprocessed_data[0]

In [None]:
#create dictionary and corpus

def get_corpus_id2word(data):
    """Build the gensim dictionary and bag-of-words corpus for ``data``.

    ``data`` is a collection of token lists. Returns ``(corpus, id2word)``,
    where ``id2word`` is a ``corpora.Dictionary`` built from ``data`` and
    ``corpus`` holds the ``doc2bow`` result of every document, in order.
    """
    dictionary = corpora.Dictionary(data)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in data]
    return bow_corpus, dictionary

# Bag-of-words corpus and token dictionary for the preprocessed articles.
corpus,id2word = get_corpus_id2word(preprocessed_data)

In [None]:
#train the dynamic_lda_model

# Fit the project's dynamic LDA model on the corpus.
# articles_per_time gives the number of documents in each consecutive time slice;
# presumably its entries must sum to len(corpus) - TODO confirm against models.dynamic_lda.
dyn_lda = dynamic_lda.Dynamic_LdaModel(corpus=corpus, id2word=id2word, 
                                       articles_per_time = articles_per_time, 
                                       num_topics=num_topics)

In [None]:
# Print the words of topic 0 for time slice index 1.
# Per the original author's note, get_topic returns the top 20 words for the
# given topic and time slice - TODO confirm against models.dynamic_lda.

print(dyn_lda.get_topic(topic=0,time=1))