# nlp_overview.ipynb

# 0. Comments

Date: 01.12.21

Author: Barry Trim

Overview: Overview of NLP 


Comments
- 01.12.21: Initial Script



# 1. Set-up

### 1.1 Import Statements

In [1]:
# Importing Python Libraries to use in the code

####################################################################################################
# General modules

import pandas as pd
import numpy as np
import math
import random
import itertools

####################################################################################################


####################################################################################################
# System Interaction modules

import re
import os
import sys

####################################################################################################


####################################################################################################
# Plotting Libraries

import matplotlib.pyplot as plt
import matplotlib as mpl # new
import seaborn as sns #
sns.set(style="ticks") # Set Seaborn formatting style
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

####################################################################################################


####################################################################################################
# Datetime functions

import time
import datetime
from dateutil.parser import parse

####################################################################################################


####################################################################################################
# ML modules #######################################################################################

# Machine Learning Libraries - Sckit Learn

# Datasets
from sklearn.datasets import make_regression, fetch_20newsgroups


####################################################################################################


####################################################################################################
# NLP Modules

# nltk - NLP
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
stemmer = SnowballStemmer(language='english')

# vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader_sentiment = SentimentIntensityAnalyzer()

# spaCy - NLP
import spacy
import en_core_web_sm
spacy_en = spacy.load('en_core_web_sm')


# SentenceTransformer - NLP
from sentence_transformers import SentenceTransformer # Embeddings
import umap # Dimenstionality Reduction
import hdbscan # Clustering

####################################################################################################




### 1.2 Config

In [2]:
# Set-up project folders

# Set variables
var_sep = os.sep  # Operating System directory separator
var_current_dir = os.getcwd()  # Current working directory of the kernel

# Configure folders
# The trailing "" makes os.path.join append a final separator, so a file name
# can be concatenated directly onto each folder variable.
var_folder_input = os.path.join(var_current_dir, "input", "")
var_folder_output = os.path.join(var_current_dir, "output", "")
var_folder_scripts = os.path.join(var_current_dir, "scripts", "")
var_folder_config = os.path.join(var_current_dir, "config", "")


# 2. Functions

# 3.0 Code

### 3.1 Import Data

In [3]:
# Load the 20 Newsgroups dataset (all documents) and keep the raw texts
news_bunch = fetch_20newsgroups(subset='all')
data_pp = news_bunch['data']


In [4]:
# Corpus size summary (for information only)
var_doc_len = len(data_pp)  # number of documents in the corpus

# Token count: the corpus is concatenated and split on single spaces only,
# so newlines do not separate tokens and runs of spaces produce empty tokens.
var_text = ' '.join(data_pp)
var_text_list = var_text.split(' ')
var_word_cnt = len(var_text_list)

for var_label, var_value in (
    ('Number of documents: ', var_doc_len),
    ('Number of words is:  ', var_word_cnt),
    ('Avg words per doc:   ', round(var_word_cnt/var_doc_len, 2)),
):
    print(var_label, var_value)


Number of documents:  18846
Number of words is:   5937230
Avg words per doc:    315.04


In [5]:
# Build the working DataFrame from the raw texts, keeping the original
# position of every document in 'index_num'
arr_data = pd.DataFrame({'text_orig': data_pp})
arr_data['index_num'] = arr_data.index

# Attach the numeric target of each document
data_target = fetch_20newsgroups(subset='all')['target']
arr_data['target'] = data_target


In [6]:
# Look-up table: numeric target -> target name
data_target_names = fetch_20newsgroups(subset='all')['target_names']
arr_target_names = pd.DataFrame({'target_names': data_target_names})
arr_target_names['target'] = arr_target_names.index

# Join the names onto the documents, then restore the original document order
arr_data = (
    pd.merge(arr_data, arr_target_names, on=['target'])
    .sort_values(by=['index_num'])
    .reset_index(drop=True)
)

In [7]:
# Display Data
# Show the assembled DataFrame: original text, index number, target and target name
arr_data


Unnamed: 0,text_orig,index_num,target,target_names
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,0,10,rec.sport.hockey
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,1,3,comp.sys.ibm.pc.hardware
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,2,17,talk.politics.mideast
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,3,3,comp.sys.ibm.pc.hardware
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,4,4,comp.sys.mac.hardware
...,...,...,...,...
18841,From: jim.zisfein@factory.com (Jim Zisfein) \n...,18841,13,sci.med
18842,From: rdell@cbnewsf.cb.att.com (richard.b.dell...,18842,12,sci.electronics
18843,From: westes@netcom.com (Will Estes)\nSubject:...,18843,3,comp.sys.ibm.pc.hardware
18844,From: steve@hcrlgw (Steven Collins)\nSubject: ...,18844,1,comp.graphics


In [8]:
# Display Data
# Print the full original text of the first document (row label 0)
print(arr_data.at[0, 'text_orig'])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [9]:
# Take the first 1000 records of the data
# .copy() makes this an independent DataFrame rather than a slice of the full
# frame, so adding columns in later cells does not raise pandas'
# SettingWithCopyWarning.
arr_data = arr_data.head(1000).copy()


### 3.3 Remove Stop Characters and email addresses

In [None]:
# Clean the original text into a new 'text_new' column.
#
# Order matters: email addresses and URLs are removed BEFORE the stop
# characters. Stripping ':' first would turn 'http://' into 'http//', after
# which the URL pattern could never match.

# Stop characters to strip. Regex metacharacters are escaped with '\', and raw
# strings keep those backslashes literal (no invalid-escape warnings).
lst_stop_chars = ['<', '>', '-', '_', r'\|', ':', r'\*', r'\^']

arr_data['text_new'] = (
    arr_data['text_orig']
    # Remove all email addresses (any whitespace-free token containing '@')
    .str.replace(r'\S*@\S*\s?', '', regex=True)
    # Remove all URLs - needs to be refined
    .str.replace(r'https?://\S+', '', regex=True)
    # Remove stop chars: a single alternation built from the list above
    .str.replace('|'.join(lst_stop_chars), '', regex=True)
)


In [11]:
# Show one document before and after cleaning
var_num = 777

var_text_before = arr_data.at[var_num, 'text_orig']
var_text_after = arr_data.at[var_num, 'text_new']

# One print call; sep='\n' puts each item on its own line, with '' giving the blank lines
print(
    '******************* BREAK contents_replace ****************************',
    '',
    var_text_before,
    '',
    '******************* BREAK contents_replace_email****************************',
    '',
    var_text_after,
    sep='\n',
)


******************* BREAK contents_replace ****************************

From: gregof@JSP.UMontreal.CA (Grego Filippo)
Subject: Info wanted on Tseng Labs ET4000 VLB
Organization: Universite de Montreal
Lines: 9

Hi fellow netters,

does anybody have any info on Tseng Labs ET4000 VLB card:
price, speed, compatibility with existing and up-comming softwares,
performance compared to others cards ( is it an S3 based card ?)....

Thank you..




******************* BREAK contents_replace_email****************************

From (Grego Filippo)
Subject Info wanted on Tseng Labs ET4000 VLB
Organization Universite de Montreal
Lines 9

Hi fellow netters,

does anybody have any info on Tseng Labs ET4000 VLB card
price, speed, compatibility with existing and upcomming softwares,
performance compared to others cards ( is it an S3 based card ?)....

Thank you..





### 3.4 POS Tagging

In [12]:
# Sample text for the POS-tagging examples.
# Each line ends with a space followed by an explicit newline, so the trailing
# spaces stay visible and are not stripped by an editor.
var_text = (
    "Arup is a British multinational professional services firm \n"
    "headquartered in London which provides design, engineering, architecture, \n"
    "planning, and advisory services across every aspect of the built environment. \n"
    "The firm employs approximately 16,000 staff in over 90 offices across 35 \n"
    "countries around the world."
)

# Alternative: use a cleaned document from the news data instead
# var_text = arr_data.at[30, 'text_new']

print(var_text)

Arup is a British multinational professional services firm 
headquartered in London which provides design, engineering, architecture, 
planning, and advisory services across every aspect of the built environment. 
The firm employs approximately 16,000 staff in over 90 offices across 35 
countries around the world.


In [13]:
# Run the spaCy pipeline on the sample text
doc = spacy_en(var_text)

# List every token with its lemma and part-of-speech tag
for tok in doc:
    # Other attributes worth inspecting: tok.text, tok.tag_, tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop
    print(tok, tok.lemma_, tok.pos_)
    # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

Arup arup NOUN
is be AUX
a a DET
British british ADJ
multinational multinational ADJ
professional professional ADJ
services service NOUN
firm firm NOUN

 
 SPACE
headquartered headquarter VERB
in in ADP
London London PROPN
which which PRON
provides provide VERB
design design NOUN
, , PUNCT
engineering engineering NOUN
, , PUNCT
architecture architecture NOUN
, , PUNCT

 
 SPACE
planning planning NOUN
, , PUNCT
and and CCONJ
advisory advisory ADJ
services service NOUN
across across ADP
every every DET
aspect aspect NOUN
of of ADP
the the DET
built build VERB
environment environment NOUN
. . PUNCT

 
 SPACE
The the DET
firm firm NOUN
employs employ VERB
approximately approximately ADV
16,000 16,000 NUM
staff staff NOUN
in in ADP
over over ADP
90 90 NUM
offices office NOUN
across across ADP
35 35 NUM

 
 SPACE
countries country NOUN
around around ADP
the the DET
world world NOUN
. . PUNCT


In [14]:
# Part-of-speech tags to keep
lst_pos = ['NOUN', 'VERB', 'DET', 'PUNCT']

# Collect the lemma of every token whose POS tag is in the keep-list
var_text_new = []
for token in doc:
    if token.pos_ in lst_pos:
        var_text_new.append(token.lemma_)

# Show the filtered lemmas as one space-separated string
print(' '.join(var_text_new))

arup a service firm headquarter provide design , engineering , architecture , planning , service every aspect the build environment . the firm employ staff office country the world .


In [15]:
# Find noun phrases (noun chunks) in the text
# Prints the text of each chunk from doc.noun_chunks, one per line
for chunk in doc.noun_chunks:
    print(chunk.text)

Arup
a British multinational professional services firm
London
which
design
engineering
architecture
planning
advisory services
every aspect
the built environment
The firm
approximately 16,000 staff
over 90 offices
35 
countries
the world


### 3.5 Sentiment

VADER (used here via the `vaderSentiment` package; it is also available in NLTK) works well on social media comments and reviews, which are short and can contain emojis. Consider breaking up large documents into smaller documents for sentiment scoring.

In [16]:
# Simple sentiment-scoring example: one clearly positive and one clearly
# negative sentence

var_text_positive = 'I love everyone and am overjoyed'
var_text_negative = 'I absolutely hate everyone and am very bitter'

# Score each sentence once and reuse the resulting dictionary
var_scores_positive = vader_sentiment.polarity_scores(var_text_positive)
var_scores_negative = vader_sentiment.polarity_scores(var_text_negative)

print('positive: ', var_scores_positive)
print('positive compound: ', var_scores_positive['compound'])
print()

print('negative: ', var_scores_negative)
print('negative compound: ', var_scores_negative['compound'])



positive:  {'neg': 0.0, 'neu': 0.336, 'pos': 0.664, 'compound': 0.836}
positive compound:  0.836

negative:  {'neg': 0.541, 'neu': 0.459, 'pos': 0.0, 'compound': -0.7956}
negative compound:  -0.7956


### 3.6 Embedding, Doc Clustering and Topic Modelling

Text embedding converts text into numerical vectors that represent its meaning. Embeddings tend to make downstream models work better, because embedding models (such as BERT) take context into account and capture similarity of meaning between different words (e.g. ‘car’ and ‘automobile’ mean the same thing).


In [17]:
# Text Embedding

# Define the model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Generate the embeddings
# Pass a plain list of strings: encode() indexes its input by position, while
# a pandas Series is indexed by label, and the two only coincide for a default
# 0..n-1 index.
lst_embeddings = model.encode(arr_data['text_new'].tolist(), show_progress_bar=True)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=32.0), HTML(value='')))




In [18]:
# Inspect the embedding of the first document: its length, then its values
var_first_embedding = lst_embeddings[0]

print('Embedding Length: ', len(var_first_embedding))
print('')
print(var_first_embedding)


Embedding Length:  768

[ 3.11651736e-01 -4.15521234e-01  4.59602594e-01 -1.21574461e-01
 -7.82123744e-01 -1.30180627e-01 -1.68112531e-01 -6.15227759e-01
  7.42287636e-01  3.71210963e-01  2.18754858e-01  5.41139901e-01
 -5.09341657e-01  1.11398935e+00  4.44403321e-01 -5.20721257e-01
  4.19572711e-01 -1.01445481e-01 -2.66003549e-01  2.55932450e-01
 -7.73899972e-01  4.10386741e-01  5.92998147e-01  9.32648063e-01
 -9.18336630e-01 -5.03186956e-02  4.65151757e-01 -4.28315967e-01
  4.63159472e-01  5.07030964e-01  5.78824937e-01  4.04971987e-01
 -5.70110261e-01 -1.81191087e-01  3.62696826e-01 -1.44147411e-01
  5.43858767e-01  2.34094337e-01  1.07747352e+00 -3.51341546e-01
 -2.24507377e-01 -8.55968595e-01 -1.86308119e-02  7.46259391e-01
  3.64485919e-01 -6.22368336e-01  5.58727562e-01 -1.28041700e-01
 -6.42184615e-01  8.83252770e-02  3.57173562e-01  2.69428760e-01
  2.05855578e-01  4.11779433e-02  1.55159205e-01 -4.73711312e-01
 -5.31442165e-02 -5.13362169e-01  6.85164750e-01  4.30066913e-01
 

In [19]:
# Dimensionality Reduction
# parameters need tuning for optimal results

# Settings shared by both projections; only n_components (the number of
# dimensions in the output vector) differs between them
dict_umap_params = dict(n_neighbors=20, metric='cosine', random_state=1)

# 10 dimensions - used for clustering
umap_embeddings = umap.UMAP(n_components=10, **dict_umap_params).fit_transform(lst_embeddings)

# 2 dimensions - used for plotting
umap_embeddings_plot = umap.UMAP(n_components=2, **dict_umap_params).fit_transform(lst_embeddings)


In [20]:
# Inspect the reduced embedding of the first document: its length, then its values
var_first_umap_embedding = umap_embeddings[0]

print('Embedding Length: ', len(var_first_umap_embedding))
print('')
print(var_first_umap_embedding)


Embedding Length:  10

[4.747404  6.2907195 8.761942  7.7304115 4.0579996 6.548015  5.444745
 4.3544    4.0735064 5.865413 ]


In [21]:
# Cluster the documents with HDBSCAN on the 10-dimensional UMAP embeddings
cluster = hdbscan.HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
).fit(umap_embeddings)


In [22]:
# Show the distinct cluster labels assigned to the documents
# (label -1 is treated as "unmapped" when colours are assigned later in this notebook)
np.unique(cluster.labels_)


array([-1,  0,  1,  2])

In [23]:
# Clustering probabilities (one value per document)
cluster.probabilities_


array([0.8356093 , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.998292  , 1.        , 1.        ,
       0.95309127, 0.7969983 , 1.        , 1.        , 0.76346394,
       0.93221751, 1.        , 0.        , 0.        , 0.        ,
       1.        , 0.94990749, 0.98331318, 0.        , 1.        ,
       1.        , 0.66825979, 0.        , 1.        , 1.        ,
       0.        , 0.985194  , 1.        , 0.        , 0.        ,
       1.        , 1.        , 0.84675549, 1.        , 0.908934  ,
       0.88097526, 1.        , 0.97387638, 0.        , 1.        ,
       1.        , 1.        , 0.        , 0.79040183, 0.93804263,
       1.        , 1.        , 0.        , 1.        , 0.        ,
       0.99802888, 0.        , 1.        , 0.96978327, 1.        ,
       0.88991694, 1.        , 0.9906081 , 0.91149776, 0.86532684,
       0.        , 0.97422937, 0.99976243, 1.        , 1.        ,
       1.        , 0.80980246, 1.        , 1.        , 1.     

In [None]:
# Attach the clustering results to the DataFrame
arr_data['cluster_id'] = list(cluster.labels_)
arr_data['cluster_prob'] = list(cluster.probabilities_)

# Attach the 2-D plotting coordinates: first component -> x, second -> y
arr_data['x'] = [point[0] for point in umap_embeddings_plot]
arr_data['y'] = [point[1] for point in umap_embeddings_plot]


### 3.7 Plotting

In [25]:
# Add plotting colours to the df

# Three qualitative plotly palettes, repeated 15 times
lst_palette = px.colors.qualitative.Dark24 + px.colors.qualitative.Light24 + px.colors.qualitative.Alphabet
lst_colors = ['#D2D2D2'] + lst_palette * 15  # Light colour first, for Topic -1 (unmapped)

# Colour look-up table: the first colour maps to cluster -1, the next to 0, and so on
arr_colours = pd.DataFrame({
    'colour': lst_colors,
    'cluster_id': range(-1, len(lst_colors) - 1),
})

# Join the colour of each cluster onto the documents
arr_data = arr_data.merge(arr_colours, on=['cluster_id'])


In [26]:
# Generate standalone html plot of the topic model using plotly

# Format hover text: wrap at 100 characters, then replace newlines with html
# line breaks so the wrapping shows in the hover box
arr_data['text_new_plot'] = (
    arr_data['text_new']
    .str.wrap(100)
    .str.replace('\n', '<br>', regex=False)
)


# Generate plot: one hollow marker per document, outlined in its cluster colour
fig_scatter = go.Figure(data=go.Scattergl(x = arr_data['x'], y = arr_data['y'],
                                text = arr_data['cluster_id'],   
                                hovertext = arr_data['text_new_plot'],
                                textposition='bottom center',
                                mode = 'markers',
                                name = '',
                                hovertemplate = "<b>Topic:</b> %{text}" + "<br><b>Text:</b> %{hovertext}",
                                marker=dict(size=10, opacity=0.5, color='rgba(0,0,0,0)', line=dict(color=arr_data['colour'], width=2)
                                            ) 
                               )
               )


# Centre the title and hide both axes (the UMAP coordinates have no units)
fig_scatter.update_layout(autosize=True,
                    title={'text':'ScatterPlot',
                           'y':0.95, 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},
                    margin=go.layout.Margin(l=20, r=20, b=20, t=100, pad=10),
                    xaxis = {'showgrid': False, 'zeroline': False, 'visible': False},
                    yaxis = {'showgrid': False, 'zeroline': False, 'visible': False},

    )


# Save the figure as a time-stamped html file in the output folder.
# Create the folder first: nothing else in this notebook creates it, and the
# write fails if it is missing.
os.makedirs(var_folder_output, exist_ok=True)
date_time_now = datetime.datetime.now().strftime('%y%m%d_%H%M%S')
plot(fig_scatter, filename = var_folder_output + date_time_now + '_topic_model.html', auto_open=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'/Users/barry.trim/Documents/03_vs_code/08_python/nlp_overview/src/output/211201_160634_topic_model.html'