# Imports

In [114]:
import glob
import nltk
import sklearn
import pandas
import bokeh

# Data

In [4]:
import glob
import os

base_dir = "BBC News Summary/News Articles"
# read news
business_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "business", "*.txt"))
entertainment_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "entertainment", "*.txt"))
politics_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "politics", "*.txt"))
sport_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "sport", "*.txt"))
tech_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "tech", "*.txt"))

labels = []
corpus = []
for file_list in [business_file_list, entertainment_file_list, politics_file_list, sport_file_list, tech_file_list]:
    for file_path in file_list:
        with open(file_path, encoding="utf8", errors='ignore') as f_input:
            corpus.append((f_input.read()))
            labels.append(file_path.split('/')[-2])

# Preprocessing

## Tokenisation

In [5]:
import nltk
import re


regex = re.compile('[^a-zA-Z]')

def tokenize(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            clean_word = regex.sub('', word)
            tokens.append(clean_word.lower())
    return tokens

In [6]:
tokenized = tokenize(corpus[500])
tokenized[:10]

['fiat',
 'mulls',
 'ferrari',
 'market',
 'listing',
 'ferrari',
 'could',
 'be',
 'listed',
 'on']

## Stopwords

In [7]:
from nltk.corpus import stopwords as sw
stopwords = sw.words('english')

In [8]:
cleaned = [word for word in tokenized if word not in stopwords and word is not '']
cleaned[:10]

['fiat',
 'mulls',
 'ferrari',
 'market',
 'listing',
 'ferrari',
 'could',
 'listed',
 'stock',
 'market']

## Steming

In [9]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem(word):
    return stemmer.stem(word).strip()

In [10]:
stemed = [stem(word) for word in cleaned]
stemed[:10]

['fiat',
 'mull',
 'ferrari',
 'market',
 'list',
 'ferrari',
 'could',
 'list',
 'stock',
 'market']

# Plot word distributions

In [115]:
def prep(corp):  
    # tokenization
    tokenized = tokenize(corp)
    # stopwords
    cleaned = [word for word in tokenized if word not in stopwords and word is not '']
    # steming
    stemed = [stem(word) for word in cleaned]
    return stemed
    
stemed1 = prep(corpus[200])
stemed500 = prep(corpus[500])

In [116]:
def h(stem):
    data = []
    for w in set(stem):
        data.append((stem.count(w), w))
    data.sort(reverse=True)
    return data

hist_data1 = h(stemed1)
hist_data500 = h(stemed500)

def xy(data):   
    l = range(len(data))
    x = [data[i][1] for i in l]
    y = [data[i][0] for i in l]
    return x,y

x1,y1 = xy(hist_data1)
x500,y500 = xy(hist_data500)

In [117]:
from bokeh.io import show, output_file
from bokeh.plotting import figure

output_file("skewed_distributions.html")

def plot(x,y, title):
    p = figure(x_range=x, plot_width=1200, plot_height=400, title=title,
               toolbar_location=None, tools="")

    p.vbar(x=x, top=y, width=0.9, line_color="white", fill_color="navy", alpha=0.5)

    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = 1.2
    p.y_range.start = 0
    return p

In [118]:
from bokeh.layouts import gridplot
p1 = plot(x1,y1,"Document #200 word count")
p2 = plot(x500,y500,"Document #500 word count")
show(gridplot([p1,p2], ncols=1))

# TfidfVectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000,
                                 min_df=0.05, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize,
                                  lowercase=True, preprocessor=stem)

In [26]:
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus);

# LatentDirichletAllocation

In [17]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [18]:
for i,topic in enumerate(lda.components_):
    print(f'Topic #{i}:')
    print([tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Topic #0:
['award', 'awards', 'party', 'brown', 'election', 'show', 'labour', 'best', 'mr', 'film']


Topic #1:
['sales', 'economy', 'bank', 'growth', 'firm', 'market', 'year', 'company', 'us', 'bn']


Topic #2:
['first', 'injury', 'players', 'match', 'cup', 'club', 'england', 'nt', 'win', 'game']


Topic #3:
['court', 'minister', 'secretary', 'scotland', 'people', 'law', 'would', 'government', 'wales', 'mr']


Topic #4:
['net', 'computer', 'phone', 'digital', 'music', 'software', 'users', 'mobile', 'technology', 'people']




In [19]:
topic_values = lda.transform(tfidf_matrix)
doc_num, topic_num = topic_values.shape

In [20]:
import pandas as pd
df = pd.DataFrame({'document': corpus, 'label': labels, 'lda': topic_values.argmax(axis=1)})
df.groupby(['label', 'lda']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,document
label,lda,Unnamed: 2_level_1
business,0,5
business,1,480
business,2,1
business,3,15
business,4,9
entertainment,0,350
entertainment,1,10
entertainment,2,3
entertainment,3,11
entertainment,4,12


# Visualisation

In [67]:
prob_matrix = lda.transform(tfidf_matrix)

In [94]:
import pandas as pd

from bokeh.io import show
from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure
from bokeh.sampledata.unemployment1948 import data

prob_matrix_df = pd.DataFrame(data=prob_matrix[0:,0:], 
              index=list(range(len(prob_matrix))),
              columns=list(range(topic_num)))
prob_matrix_df

Unnamed: 0,0,1,2,3,4
0,0.025118,0.899510,0.025182,0.025091,0.025099
1,0.380475,0.547495,0.023812,0.024134,0.024084
2,0.025669,0.896890,0.026075,0.025574,0.025791
3,0.034627,0.834639,0.034424,0.034590,0.061720
4,0.044206,0.823558,0.043935,0.043844,0.044456
...,...,...,...,...,...
2220,0.030175,0.033565,0.044404,0.030370,0.861486
2221,0.035001,0.035894,0.392093,0.034688,0.502325
2222,0.033886,0.033625,0.033952,0.033222,0.865315
2223,0.036446,0.036475,0.036495,0.036324,0.854260


In [70]:
prob_matrix_df['doc'] = list(range(doc_num))
prob_matrix_df = prob_matrix_df.set_index('doc')
prob_matrix_df.columns.name = 'topic'
df = pd.DataFrame(prob_matrix_df.stack(), columns=['rate']).reset_index()

In [24]:
import random
rand = random.sample(range(1, 2555), 100)
df_rand = df.loc[df['doc'].isin(rand)]

df_rand.to_pickle("./df_rand.pkl")
df = pd.read_pickle("./df_rand.pkl")

df['doc'] = df['doc'].astype(str)
df['topic'] = df['topic'].astype(str)
df['rate'] = df['rate']*100

docs = list(set(df_rand['doc']))
docs.sort()
docs = [str(i) for i in docs]
docs;
topics = ['0','1','2','3','4']

In [25]:
import pandas as pd

from bokeh.io import show, output_file
from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure

output_file("lda_output_visualisation.html")

colors = ['#084594', '#2171b5', '#4292c6', '#6baed6', '#9ecae1', '#c6dbef', '#deebf7', '#f7fbff']

mapper = LinearColorMapper(palette=colors, low=df.rate.min(), high=df.rate.max())
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Topic probability per document",
           x_range=topics, y_range=docs,
           x_axis_location="above", plot_width=300, plot_height=800,
                      tools=TOOLS, toolbar_location='below',
           tooltips=[('rate', '@rate%')]
          )

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3

p.rect(x="topic", y="doc", width=1, height=1,
       source=df,
       fill_color={'field': 'rate', 'transform': mapper},
       line_color=None)

color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="5pt",
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d%%"),
                     label_standoff=6, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')

show(p)

### Distribution plots

In [100]:
import random
rand = random.sample(range(1, 105), 5)
x = [x1[i] for i in rand]
y = [y1[i] for i in rand]
from bokeh.io import show, output_file
from bokeh.plotting import figure

output_file("skewed_distributions.html")

def plot(x,y, title, xlabel, ylabel):
    p = figure(x_range=x, plot_width=200, plot_height=200, title=title,
               toolbar_location=None, tools="")

    p.vbar(x=x, top=y, width=0.9, line_color="white", fill_color="navy", alpha=0.5)

    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = 1.2
    p.y_range.start = 0
    p.xaxis.axis_label = xlabel
    p.yaxis.axis_label = ylabel
    p.yaxis.major_label_text_font_size = '0pt'
    return p

In [97]:
df_sample = prob_matrix_df.sample(1)

In [112]:
from bokeh.layouts import gridplot

output_file("sample_distrib.html")

p1 = plot(x,y,"Topic k for documents", "words", "𝜙_𝑘")
p2 = plot(['0','1','2','3','4'],df_sample.iloc[0].to_list(),"Document d for topics", "topics", "θ_d")
show(gridplot([p1,p2], ncols=2))