# Project Demo

This is a demo notebook for the NLP Group Project.

## Setup Code

### Download Files

In [None]:
# Update packages.
# pip may report dependency-conflict errors for other preinstalled packages;
# these can be ignored for the purposes of this demo.
# NOTE(review): only matplotlib is pinned; every other package is upgraded to
# whatever release is current when this cell runs.
!pip install -q -U matplotlib==3.1.3 nltk numpy pandas scikit-learn scipy torch 

import pathlib

# Models: create the target directory, then download the pretrained model files into it.
pathlib.Path("/content/modelsaves").mkdir(parents=True, exist_ok=True)
# kmeans, lda, nvdm respectively
!gdown --id 1oqYBdnw8NeTx7m2wtOGx4-9jnspXMRci --output /content/modelsaves/kmeans.pickle
!gdown --id 1ytjlO4F3W_MuSOG0QqLr5cjQbN0Z92ri --output /content/modelsaves/lda.pickle
!gdown --id 1yBs969W7BNpekb7nlc9GUu4xsyEtTopr --output /content/modelsaves/nvdm.pt

# Data: the S&P500 archive (unzipped in place), then bds1, then the company-information spreadsheet.
pathlib.Path("/content/data").mkdir(parents=True, exist_ok=True)
!gdown --id 16Uq_b8EtkTv_58upirW5oCTn63JFrMTa --output /content/data/sp500.zip
!unzip -q /content/data/sp500.zip -d /content/data/
!gdown --id 1Hhrv8F0HhP2hrL75nUQvwKuUQBXxxxrT --output /content/data/bds1.txt
!gdown --id 1DkicvKLHe92qvyimB75hb_LvP2MVyYOr --output "/content/data/S AND P.xlsx"

# NVDM definition: fetch the model's source file into /content/models/nvdm.py.
pathlib.Path("/content/models").mkdir(parents=True, exist_ok=True)
!curl https://raw.githubusercontent.com/aaronfng/nlp-group-project/main/nvdm/models/nvdm.py > /content/models/nvdm.py

[K     |████████████████████████████████| 13.1MB 304kB/s 
[K     |████████████████████████████████| 1.5MB 40.9MB/s 
[K     |████████████████████████████████| 15.3MB 232kB/s 
[K     |████████████████████████████████| 9.9MB 38.2MB/s 
[K     |████████████████████████████████| 22.3MB 2.1MB/s 
[K     |████████████████████████████████| 27.4MB 154kB/s 
[31mERROR: tensorflow 2.4.1 has requirement numpy~=1.19.2, but you'll have numpy 1.20.3 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.1.0; python_version >= "3.0", but you'll have pandas 1.2.4 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m
[?25hDownloading...
From: https://drive.google.com/uc?id=1oqYBdnw8NeTx7m2wtOGx4-9jnspXMRci
To: /content/modelsaves/kmeans.pickle
100% 648k/6

### Imports

In [None]:
import os
import string
import pathlib
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans

# For preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")
nltk.download("wordnet")

import torch

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NVDM definition
from models.nvdm import NVDM

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Preprocessing

In [None]:
# Shared preprocessing resources, built once so that each preprocess_text() call stays cheap.
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = stopwords.words('english')

# Set view of STOPWORDS for constant-time membership tests
# (the list above is kept unchanged for anything else that reads it).
_STOPWORD_SET = frozenset(STOPWORDS)
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def preprocess_text(text):
    """ Process a single line of text.

    Steps, in order:
      1. strip leading/trailing whitespace (e.g. a trailing newline),
      2. delete all characters in string.punctuation,
      3. lower-case,
      4. tokenise on whitespace,
      5. drop tokens found in STOPWORDS,
      6. lemmatise each remaining token with LEMMATIZER.

    Returns the surviving tokens joined by single spaces
    (an empty string if nothing survives).

    NOTE(review): punctuation is removed *before* the stop-word filter, so a
    contraction such as "don't" is compared as "dont". Whether that still
    matches an entry in the stop-word list has not been checked here; the
    order is kept as-is so features stay consistent with the trained models.
    """
    # Strip, remove punctuation and lower-case in one pass over the string.
    cleaned = text.strip().translate(_PUNCTUATION_TABLE).lower()

    # Tokenise by whitespace, drop stop words, then lemmatise what is left.
    tokens = [
        LEMMATIZER.lemmatize(word)
        for word in cleaned.split()
        if word not in _STOPWORD_SET
    ]

    return ' '.join(tokens)

### Data-Loading Functions

In [None]:
def load_sp500(path, preprocess=False):
    """ Load S&P500 data from the per-company text files in the supplied directory path.

    Within the directory, each file is named "<ticker>_<sector>.txt".
    Each contains the business description (BD) of the company.

    Entries that do not follow that pattern (sub-directories, files without a
    ".txt" extension, names without an underscore) are skipped instead of
    aborting the whole load. The name is split at the FIRST underscore, so any
    further underscores end up in the sector.

    Files are visited in sorted filename order, because os.listdir() returns
    entries in arbitrary order and the results should be reproducible.

    If preprocess is True, then preprocess the business descriptions at the same time.

    Returns three parallel lists: (tickers, sectors, bds).
    """
    tickers = []
    sectors = []
    bds = []
    for fn in sorted(os.listdir(path)):
        stem, ext = os.path.splitext(fn)
        if ext != '.txt' or '_' not in stem:
            # Not a "<ticker>_<sector>.txt" entry (e.g. archive metadata).
            continue

        filepath = os.path.join(path, fn)
        if not os.path.isfile(filepath):
            continue

        ticker, _, sector = stem.partition('_')

        with open(filepath, 'r', encoding="utf8") as f:
            bd = f.read().strip()

        if preprocess:
            bd = preprocess_text(bd)

        tickers.append(ticker)
        sectors.append(sector)
        bds.append(bd)

    return tickers, sectors, bds

def load_bds1(path, preprocess=False, exclude_tickers=None, min_length=3000):
    """ Load data from the business data, given the file path (e.g. "data/bds_1.txt").

    In the file, each company has two consecutive lines.
    The first line is <company ticker>:<CIK> (we only care about the ticker)
    and the second line is the company business description.

    preprocess: if True, each kept description is passed through preprocess_text().

    exclude_tickers is a list of tickers that we want to ignore in bds_1.txt.
    For example, we can use this to exclude any S&P500 companies to avoid
    overlapping of datasets.

    min_length: some business descriptions are too short (or even empty), so
    only descriptions with at least this many characters (measured on the raw,
    stripped text, before any preprocessing) are kept. Defaults to 3000.

    Returns two parallel lists: (tickers, bds).
    """
    with open(path, "r", encoding="utf8") as f:
        lines = f.readlines()

    exclusion_set = set(exclude_tickers) if exclude_tickers is not None else set()

    tickers = []
    bds = []

    # Even-indexed lines are "<ticker>:<CIK>" headers, odd-indexed lines the descriptions.
    # zip() stops at the shorter sequence, so a trailing header without a description is ignored.
    for id_line, bd_line in zip(lines[0::2], lines[1::2]):
        ticker = id_line.strip().split(':')[0]
        bd = bd_line.strip()

        if ticker in exclusion_set or len(bd) < min_length:
            continue

        if preprocess:
            bd = preprocess_text(bd)

        tickers.append(ticker)
        bds.append(bd)

    return tickers, bds


def load_ticker_name(filepath):
    """ Build a ticker -> security-name lookup from the company spreadsheet.

    e.g. ticker_to_name["MSFT"] = "Microsoft Corp."

    The spreadsheet is read with pandas and must contain the columns
    'Symbol' and 'Security'. If a symbol occurs more than once,
    the last row wins.

    The interactive demo uses this mapping to show a readable company
    name next to each ticker symbol.
    """
    # Company information (ticker, name, industry sector etc.)
    company_df = pd.read_excel(filepath)
    symbol_name_pairs = company_df[['Symbol', 'Security']]

    return dict(zip(symbol_name_pairs['Symbol'], symbol_name_pairs['Security']))

### Model Definitions

Below are wrappers for K-Means, LDA and NVDM. They load the models that we trained locally.

In [None]:
class ModelWrapper:
    """ Base class for clustering models.

    Defines the common interface shared by the model wrappers:
    a transform() method and a topic_vocab_matrix property.
    Subclasses must override both; the base versions raise NotImplementedError.
    """
    def __init__(self):
        pass

    def transform(self, X):
        """ Apply model to new data.

        Should output a topic-document matrix,
        where each element is a score indicating how likely the document
        should be assigned to the topic (higher means more likely).
        For sklearn LDA, transform() does this by default.
        """
        raise NotImplementedError()

    @property
    def topic_vocab_matrix(self):
        """ Each model should be able to return a topic-vocab matrix
        containing a score (e.g. probability) of a word in the vocabulary
        occurring in the k^th topic.

        Raises NotImplementedError here, consistent with transform(), so that
        a subclass which forgets to override it fails loudly instead of
        silently handing None to the caller.
        """
        raise NotImplementedError()

    
class KMeansModel(ModelWrapper):
    """ Wrapper around a pickled scikit-learn KMeans estimator. """
    def __init__(self, path):
        # NOTE: unpickling runs arbitrary code; only load files from a trusted source.
        with open(path, "rb") as model_file:
            self.model = pickle.load(model_file)

    def transform(self, X):
        """ Returns a topic-document matrix of negated cluster distances.

        The underlying model reports the distance of each document to each
        cluster, where a smaller distance means a better match. The sign is
        flipped so that, as with the other two models, a higher score means
        a more likely topic assignment.
        """
        distances = self.model.transform(X)
        return -distances

    @property
    def topic_vocab_matrix(self):
        """ Return the K-Means cluster centres.

        ndarray of shape (num_topics, n_features)
        """
        return self.model.cluster_centers_


class LDAModel(ModelWrapper):
    """ Wrapper around a pickled scikit-learn LDA estimator. """
    def __init__(self, path):
        # NOTE: unpickling runs arbitrary code; only load files from a trusted source.
        with open(path, "rb") as model_file:
            self.model = pickle.load(model_file)

    def transform(self, X):
        """ Returns a topic-document matrix of probabilities. """
        return self.model.transform(X)

    @property
    def topic_vocab_matrix(self):
        """ The LDA components_ attribute, normalised so that each row sums to 1.

        Quoting sklearn docs:
        Variational parameters for topic word distribution.
        Since the complete conditional for topic word distribution is a Dirichlet,
        components_[i, j] can be viewed as pseudocount that represents
        the number of times word j was assigned to topic i.
        It can also be viewed as distribution over the words for each topic after normalization:
        model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
        """
        pseudocounts = self.model.components_
        # keepdims=True keeps the per-topic totals as a column so they broadcast across the words.
        per_topic_totals = pseudocounts.sum(axis=1, keepdims=True)
        return pseudocounts / per_topic_totals

class NVDMModel(ModelWrapper):
    """ PyTorch NVDM model.

    Loads a pretrained model from disk, moves it to the CPU in eval mode and
    keeps a NumPy copy of the first decoder layer's weights as the
    topic-vocabulary matrix.
    """
    def __init__(self, model_path):
        cpu = torch.device('cpu')
        # The checkpoint is a fully pickled model object, not just a state dict.
        # Newer torch releases refuse to unpickle arbitrary objects unless
        # weights_only=False is passed; older releases do not know the keyword
        # and raise TypeError, in which case the plain call is used.
        # NOTE: unpickling runs arbitrary code; only load files from a trusted source.
        try:
            model = torch.load(model_path, map_location=cpu, weights_only=False)
        except TypeError:
            model = torch.load(model_path, map_location=cpu)
        model.device = "cpu"
        model = model.cpu()
        model.eval()

        self.model = model

        # Weight of the first decoder layer, copied out of the graph as a NumPy array.
        # Transposed so that rows are topics and columns are vocabulary words
        # (presumably (n_topics, n_vocab), given that nn.Linear stores weights
        # as (out_features, in_features) — verify against the model definition).
        decoder = self.model.decoder[0]
        weights = decoder.weight.data.detach().clone().cpu().numpy()
        self.topic_vocab = weights.T

    def transform(self, X):
        """ Output a topic-document matrix.

        X is a 2-D bag-of-words count matrix with one row per document.
        The score of a document for a topic is the count-weighted sum of its
        words' topic weights, divided by the document's total word count,
        i.e. the average topic weight of the document's words.

        A document with no in-vocabulary words has a total count of 0; its
        scores are returned as 0 for every topic rather than NaN from 0/0.
        """
        # One row per document, one column per topic.
        topic_doc = X @ self.topic_vocab.T

        # Normalize by document length, guarding against empty documents.
        doc_lengths = X.sum(axis=1, keepdims=True)
        safe_lengths = np.where(doc_lengths == 0, 1, doc_lengths)

        return topic_doc / safe_lengths

    @property
    def topic_vocab_matrix(self):
        """ Returns the learned topic-vocabulary weights (transposed decoder weights). """
        return self.topic_vocab


### Data setup

In [None]:
# LOAD DATA + PREPROCESSING
# (this can take minutes because preprocessing is slow.)
# S&P500 is loaded first so that its tickers can be passed as exclude_tickers
# when loading BDS1, which keeps the two datasets from overlapping.
sp500_tickers, sp500_sectors, sp500_bds = load_sp500("/content/data/SP500", preprocess=True)
bds1_tickers, bds1_bds = load_bds1("/content/data/bds1.txt", preprocess=True, exclude_tickers=sp500_tickers)

In [None]:
# FEATURE EXTRACTION
# Use this to convert text to a Bag-of-Words (term-frequency) representation.
# We used the larger BDS1 dataset as the training set when we trained our models,
# so we fit the CountVectorizer to BDS1 instead of S&P500.
# NOTE(review): the vocabulary size presumably has to match what the saved
# models were trained with — confirm before changing n_features.
n_features = 4000
tf_vectorizer = CountVectorizer(max_features=n_features, max_df=0.95, min_df=2)
tf_vectorizer.fit(bds1_bds)

CountVectorizer(max_df=0.95, max_features=4000, min_df=2)

In [None]:
# FEATURE EXTRACTION 2
# Convert SP500 data to bag-of-words representations.
# .toarray() turns the vectorizer's output into a dense array;
# the trailing expression displays its shape as a sanity check.
X_sp500 = tf_vectorizer.transform(sp500_bds).toarray()
X_sp500.shape

(503, 4000)

In [None]:
# LOAD MODELS
# (Pickle might give a UserWarning due to scikit-learn version differences, but can be safely ignored.)
# NOTE: the K-Means and LDA wrappers unpickle these files, which can execute
# arbitrary code; only load model files from a trusted source.
kmeans = KMeansModel("/content/modelsaves/kmeans.pickle")
lda = LDAModel("/content/modelsaves/lda.pickle")
nvdm = NVDMModel("/content/modelsaves/nvdm.pt")

# Print model details (the estimator types for K-Means and LDA, the module structure for NVDM).
print(type(kmeans.model))
print(type(lda.model))
print(nvdm.model)

<class 'sklearn.cluster._kmeans.KMeans'>
<class 'sklearn.decomposition._lda.LatentDirichletAllocation'>
NVDM(
  (embed_bow): EmbeddingBag(4000, 4000, mode=sum)
  (encoder): Sequential(
    (0): Linear(in_features=4000, out_features=250, bias=True)
    (1): Tanh()
    (2): Linear(in_features=250, out_features=250, bias=True)
    (3): Tanh()
  )
  (mu): Linear(in_features=250, out_features=20, bias=True)
  (log_sigma): Linear(in_features=250, out_features=20, bias=True)
  (decoder): Sequential(
    (0): Linear(in_features=20, out_features=4000, bias=True)
  )
)




## Interactive Demo

In this section, we allow the user to input custom test data.

When prompted, do the following:

1. Choose the model (K-Means, LDA or NVDM) in the dropdown menu.
2. Input an arbitrary business description in English into the textbox. For example, you can choose any company and copy-and-paste
the introductory paragraph(s) from its Wikipedia entry.
3. Click the button to run topic assignment.

The demo will choose the best topic assignment according to the model. It will output a handful of S&P500 companies that were also assigned to the same topic.

As a starting point, you can use the following Wikipedia description of [Novartis](https://en.wikipedia.org/wiki/Novartis), an arbitrarily chosen pharmaceutical company:

> Novartis International AG is a Swiss multinational pharmaceutical company based in Basel, Switzerland. It is one of the largest pharmaceutical companies in the world.
Novartis manufactures the drugs clozapine (Clozaril), diclofenac (Voltaren)(sold to Glaxo Smith Kline in 2015 deal), carbamazepine (Tegretol), valsartan (Diovan), imatinib mesylate (Gleevec/Glivec), cyclosporine (Neoral/Sandimmune), letrozole (Femara), methylphenidate (Ritalin) (production ceased 2020), terbinafine (Lamisil), deferasirox (Exjade), and others.
In March 1996, Ciba-Geigy merged with Sandoz; the pharmaceutical and agrochemical divisions of both companies formed Novartis as an independent entity. Other Ciba-Geigy and Sandoz businesses were sold, or, like Ciba Specialty Chemicals, spun off as independent companies. The Sandoz brand disappeared for three years, but was revived in 2003 when Novartis consolidated its generic drugs businesses into a single subsidiary and named it Sandoz. Novartis divested its agrochemical and genetically modified crops business in 2000 with the spinout of Syngenta in partnership with AstraZeneca, which also divested its agrochemical business.
Novartis is a full member of the European Federation of Pharmaceutical Industries and Associations (EFPIA),[4] the International Federation of Pharmaceutical Manufacturers and Associations (IFPMA),[5] and the Pharmaceutical Research and Manufacturers of America (PhRMA).[6]

In [None]:
import ipywidgets as widgets

In [None]:
# Dictionary that converts symbols to names for better readability.
# Built once at module level; interactive_demo() below reads it as a global.
ticker_to_name = load_ticker_name("/content/data/S AND P.xlsx")

def interactive_demo(model: ModelWrapper, text: str):
    """ Run an interactive demo session with the given model.

    The text is preprocessed and vectorised with the notebook-level
    tf_vectorizer, scored by the model, and assigned to its best-scoring
    topic. Every S&P500 company (rows of X_sp500) is assigned a topic the same
    way, and those sharing the text's topic are returned.

    Returns (test_topic, df):
      test_topic -- 0-based index of the topic chosen for the text.
      df         -- DataFrame with columns "Symbol", "Name", "Sector" listing
                    the S&P500 companies in that topic, highest score first
                    (ties keep their original S&P500 order). "Name" is None
                    for tickers missing from ticker_to_name.
    """
    test_bd = preprocess_text(text)
    X_test = tf_vectorizer.transform([test_bd]).toarray()

    # Only one document is scored, so the flat argmax is the topic index.
    test_topic = model.transform(X_test).argmax()

    # Compute best topic assignments for all companies in S&P500
    all_scores = model.transform(X_sp500)
    topic_per_company = all_scores.argmax(axis=1)

    # Select the companies that share the text's topic directly, rather than
    # building a per-topic dictionary (which was keyed by company count, not
    # topic count, and could therefore miss valid topic ids).
    members = np.flatnonzero(topic_per_company == test_topic)
    member_scores = all_scores[members, test_topic]

    # Sort by relevance score, best first; a stable sort keeps ties in S&P500 order.
    ranked = members[np.argsort(-member_scores, kind='stable')]

    results = [
        (sp500_tickers[i], ticker_to_name.get(sp500_tickers[i]), sp500_sectors[i])
        for i in ranked
    ]

    # Put it in a pandas dataframe just because printing looks better
    df = pd.DataFrame.from_records(results, columns=["Symbol", "Name", "Sector"])
    return test_topic, df

In [None]:
# GUI definitions for the interactive topic-assignment demo:
# a model selector, a free-text description box, a run button and an output area.

dropdown_model1 = widgets.Dropdown(
    options=[('K-Means', kmeans), ('LDA', lda), ('NVDM', nvdm)],
    value=lda,
    description='Model:'
)
text_input1 = widgets.Textarea(
    value='',
    placeholder='Type business description here...',
    description='Description:',
    disabled=False,
    layout={'width': '50%'}
)
button1 = widgets.Button(
    description='Click here to run!',
    layout={'border': 'solid'}
)

text_output1 = widgets.Output()

def button_callback1(x):
    """ Run the demo when button is clicked.

    x is the clicked button (passed by ipywidgets, unused).
    Output from the previous run is cleared first. All work then happens
    inside the Output widget's context so that anything printed, including an
    error raised while running the model, is shown in the output area.
    """
    text_output1.clear_output()
    with text_output1:
        description = text_input1.value
        if not description.strip():
            # Nothing to classify: an empty description has no words to score.
            print("Please enter a business description first.")
            return

        topic, df = interactive_demo(dropdown_model1.value, description)
        # interactive_demo() returns a 0-based index; the topic plots in this
        # notebook are titled "Topic 1", "Topic 2", ..., so report it 1-based.
        print(f"Your company was assigned to topic {topic + 1}.")
        print("Other companies with the same topic assignment:")
        print(df.head(20))

button1.on_click(button_callback1)
widgets.VBox((dropdown_model1, text_input1, button1, text_output1))

### INTERACTIVE MENU WILL SHOW UP BELOW

VBox(children=(Dropdown(description='Model:', index=1, options=(('K-Means', <__main__.KMeansModel object at 0x…

## Visualise Topics

In this section you can visualise the learnt topics for each model.
We plot each topic and its corresponding most important words.

Below, you can choose the model in the dropdown and the top number of words to show per topic. It may take a few seconds to run after clicking the Run button.

In [None]:
def plot_top_words(topic_vocab, feature_names, n_top_words, title, n_cols=5):
    """ Given a topic-vocabulary matrix containing scores
    (e.g. probabilities, higher the better),
    plot the top words as a horizontal bar-graph for each topic.

    topic_vocab   -- 2-D array, one row of word scores per topic
                     (e.g. model.components_ for LDA).
    feature_names -- vocabulary terms, indexable by column position.
    n_top_words   -- number of highest-scoring words to show per topic.
    title         -- accepted for interface compatibility; not currently drawn.
    n_cols        -- number of subplot columns in the grid (default 5).

    Subplots are titled "Topic 1", "Topic 2", ... (1-based). Grid cells left
    over when the number of topics is not a multiple of n_cols are hidden.
    Nothing is drawn when topic_vocab is empty.
    """
    K = len(topic_vocab)
    if K == 0:
        return

    n_x = n_cols
    n_y = int(np.ceil(K / n_x))
    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(n_y, n_x, figsize=(2.5 * n_x, 4 * n_y),
                             sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(topic_vocab):
        # Indices of the n_top_words largest scores, in descending order.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 14})
        # Put the highest-scoring word at the top of each panel.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=12)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)

    # Hide grid cells that have no topic to show.
    for unused_ax in axes[K:]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# GUI definitions for the topic visualisation:
# a model selector, a top-N selector, a run button and an output area.

dropdown_model2 = widgets.Dropdown(
    options=[('K-Means', kmeans), ('LDA', lda), ('NVDM', nvdm)],
    value=lda,
    description='Model:'
)

dropdown_topn = widgets.Dropdown(
    options=[5, 10, 15, 20],
    value=10,
    description='Top N Words:'
)

button2 = widgets.Button(
    description='Click here to run!',
    layout={'border': 'solid'}
)

text_output2 = widgets.Output()

def _vocabulary_terms(vectorizer):
    """ Return the vectorizer's vocabulary terms in column order.

    Newer scikit-learn releases expose get_feature_names_out() and no longer
    provide get_feature_names(); older releases only have the latter.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()

def button_callback2(x):
    """ Plot the top words per topic when button is clicked.

    x is the clicked button (passed by ipywidgets, unused).
    The plot is drawn inside the Output widget after clearing the previous one.
    """
    model = dropdown_model2.value
    n = dropdown_topn.value
    text_output2.clear_output()
    with text_output2:
        # Pass the selected model's display label as the title
        # instead of a fixed "K-Means".
        plot_top_words(model.topic_vocab_matrix, _vocabulary_terms(tf_vectorizer), n, dropdown_model2.label)

button2.on_click(button_callback2)
widgets.VBox((dropdown_model2, dropdown_topn, button2, text_output2))

### INTERACTIVE MENU WILL SHOW UP BELOW
# NOTE: each run could take a few seconds.

VBox(children=(Dropdown(description='Model:', index=1, options=(('K-Means', <__main__.KMeansModel object at 0x…