<a href="https://colab.research.google.com/github/ajaykrishna2013/NLP/blob/main/w3_Sync_Session.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.core.interactiveshell import InteractiveShell as IS; IS.ast_node_interactivity = "all"
!pip -q install contractions
import sys, matplotlib.pylab as plt, re, platform, matplotlib, pprint, time
import numpy as np, pandas as pd, nltk, sklearn, spacy, unicodedata, contractions 
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import brown
from nltk.corpus import wordnet
tmp = nltk.download(['brown', 'stopwords','punkt','wordnet', 'gutenberg'], quiet=True) # See https://www.nltk.org/book/ch02.html

# Increase viewable area of Pandas tables, numpy arrays, plots
# Pandas option names are spelled out with the 'display.' prefix: pd.set_option treats a
# name as a pattern, and in recent pandas the short forms 'max_rows', 'max_columns' and
# 'precision' also match 'styler.*' options, which makes set_option raise OptionError.
pd.set_option('display.max_rows', 5, 'display.max_columns', 500, 'display.max_colwidth', 1, 'display.precision', 2)
# Very wide lines, 4 decimals, 20 items shown at each edge, and no scientific notation for small values
np.set_printoptions(linewidth=10000, precision=4, edgeitems=20, suppress=True)
plt.rcParams['figure.figsize'] = [16, 4]   # default figure size as [width, height]: wide and short

def LoadNews(cat=['sci.space'], TopN=100):
    '''Load newsgroup posts for the given categories and merge them into one string.

    cat:  list of 20-newsgroups category names to fetch.
    TopN: number of leading posts to keep.
    Returns: the first TopN posts of the 'test' subset, joined with newlines.'''
    lsStripped = ('headers', 'footers', 'quotes')   # parts of every post that the loader is asked to drop
    newsBunch = fetch_20newsgroups(subset='test', categories=cat, remove=lsStripped, shuffle=False)
    lsPosts = newsBunch.data[:TopN]   # loaded without shuffling, then cut to the leading TopN posts
    return '\n'.join(lsPosts)

# See doc: https://scikit-learn.org/stable/datasets/index.html#newsgroups-dataset
# Preload the string variable holding the concatenated news posts, plus lookup lists
sNews = LoadNews(['comp.graphics'])   # posts from the comp.graphics newsgroup (LoadNews keeps the first 100 by default)
LsTgtNames = list(fetch_20newsgroups().target_names)   # names of 20 newsgroups
LsStopwords = nltk.corpus.stopwords.words('english')   # NLTK's English stop-word list

pso = nltk.stem.PorterStemmer()       # instantiates Porter Stemmer object
wlo = nltk.stem.WordNetLemmatizer()   # instantiates WordNet lemmatizer object
SsBrownVcb = set(brown.words())       # Vocabulary of 56057 words in Brown Corpus, kept as a set; lookups are case-sensitive

nlp = spacy.load('en_core_web_sm')
# Run the spaCy pipeline over the corpus once and reuse the parsed document for both the
# sentence list and the token list; every nlp(sNews) call re-processes the whole text.
docNews = nlp(sNews)

# store sentence tokenizers' results as a list of lists
st = [
    sNews.split('. '),                                        # naive split on period + space
    nltk.sent_tokenize(sNews),                                # NLTK's default sentence tokenizer
    nltk.tokenize.PunktSentenceTokenizer().tokenize(sNews),   # freshly constructed Punkt tokenizer
    [s.text for s in docNews.sents],                          # spaCy sentence boundaries
]

# store word tokenizers' results as a list of lists
# NOTE(review): the whitespace RegexpTokenizer appears twice in a row; both entries are kept so
# that positions in wt stay as they were -- confirm whether the duplicate is intentional.
wt = [
    sNews.split(),
    nltk.RegexpTokenizer(pattern=r"\s+", gaps=True ).tokenize(sNews),
    nltk.RegexpTokenizer(pattern=r"\s+", gaps=True ).tokenize(sNews),
    nltk.WhitespaceTokenizer().tokenize(sNews),
    nltk.RegexpTokenizer(pattern=r"\w+", gaps=False).tokenize(sNews),
    nltk.word_tokenize(sNews),
    nltk.TreebankWordTokenizer().tokenize(sNews),
    [t.text for t in docNews],                                # spaCy tokens from the shared parse
    nltk.tokenize.toktok.ToktokTokenizer().tokenize(sNews),
    nltk.WordPunctTokenizer().tokenize(sNews),
]

In [None]:
# Print the description text (DESCR) attached to the 20 newsgroups dataset object
print(fetch_20newsgroups().DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [None]:
sNews # unformatted: a bare expression is echoed as the string's repr, so line breaks appear as \n escapes



In [None]:
print(sNews[:250])  # formatted: print() renders the line breaks; only the first 250 characters are shown

Who has experience with porting a GL-program to an Alpha APX  
workstation with Kubota's Denali 3D-Graphic.
Is there any problems?
Is the real graphic-performance like a SGI R4000 Indigo XS24Z?

The OTIS Project presents...

       SYNERGY: Revolt 
	


# **Word Tokenization**

In [None]:
# Time both tokenizers. time.perf_counter() is the clock meant for measuring elapsed
# intervals: it is monotonic and high-resolution, unlike the wall clock time.time().
t0 = time.perf_counter()
LsWordsWS = sNews.split()   # fast split on whitespace character \s = {' ', \t, \n, \r}
print(f'{time.perf_counter() - t0:.3f}sec', LsWordsWS)
t0 = time.perf_counter()
LsWordsSpacy = [t.text for t in nlp(sNews)]   # slow SpaCy's tokenization
print(f'{time.perf_counter() - t0:.3f}sec', LsWordsSpacy)



In [None]:
# Token count and vocabulary size per tokenizer: whitespace split first, then spaCy
for LsTok in (LsWordsWS, LsWordsSpacy):
    print(f'#tokens: {len(LsTok)}, #types (unique tokens): {len(set(LsTok))}')

#tokens: 17507, #types (unique tokens): 6139
#tokens: 25838, #types (unique tokens): 5005


In [None]:
# 20 most frequent tokens per tokenizer (whitespace split, then spaCy); many stop words appear at the top
for LsTok in (LsWordsWS, LsWordsSpacy):
    print(Counter(LsTok).most_common(20))

[('the', 490), ('and', 474), ('to', 385), ('of', 361), ('a', 358), ('is', 225), ('for', 202), ('I', 178), ('in', 175), ('be', 141), ('with', 130), ('or', 127), ('on', 125), ('that', 123), ('it', 102), ('are', 95), ('image', 93), ('have', 92), ('can', 85), ('The', 79)]
[(',', 1055), ('.', 868), ('\n', 763), ('_', 725), ('\n  ', 560), ('the', 493), ('=', 488), ('and', 478), (' ', 446), ('to', 393), ('-', 368), ('of', 365), ('a', 361), ('\n\n', 265), ('/', 253), (')', 242), ('is', 234), ('(', 226), ('I', 223), (':', 219)]


In [None]:
# 20 least frequent tokens per tokenizer (whitespace split, then spaCy); we start seeing problem with tokenization
for LsTok in (LsWordsWS, LsWordsSpacy):
    print(Counter(LsTok).most_common()[-20:])

[('okay,', 1), ('n-sided', 1), ('efficient)', 1), ('indices', 1), ('comprise', 1), ('once;', 1), ('backface', 1), ('elimination', 1), ('polygon,', 1), ('belong', 1), ('bacfacing', 1), ('polys', 1), ('transformed', 1), ('Whenever', 1), ('vertex,', 1), ('hither', 1), ('plane;', 1), ('result.', 1), ('clip;', 1), ('stage.', 1)]
[('Illusions', 1), ('amazing', 1), ('Go', 1), ('PCMag', 1), ('picked', 1), ('deleted', 1), ('agreement', 1), ('Here', 1), ('Keep', 1), ('sided', 1), ('indices', 1), ('comprise', 1), ('backface', 1), ('elimination', 1), ('belong', 1), ('bacfacing', 1), ('polys', 1), ('transformed', 1), ('Whenever', 1), ('hither', 1)]


In [None]:
# For each probe word, list every whitespace token that contains it (case-insensitive substring match)
for sProbe in ('once', 'plane', 'clip', 'stage'):
    print([s for s in LsWordsWS if sProbe in s.lower()])

['Concepts,', 'Once', 'once', 'once', 'Concept', 'Concept', 'Concept', 'Concept', 'Concept', 'Once', 'concept', 'Concerning', 'once;']
['planet', 'plane.', 'planes.', 'image-plane.', 'rule/plane', 'plane;']
['CLIPS', 'clipart', 'clip', 'clipart?', 'clip', 'clip', 'clip', 'clipart', 'clip', 'clip;']
['stages', 'stage', 'stage', 'Stage', 'third-stage', 'second-stage', 'Stage', 'stage', 'stage', 'stage', 'stage.']


In [None]:
# Whitespace-separated tokens of mixed letters and non-letters
# A token qualifies when it holds at least one non-word character (\W) and at least one word
# character (\w: letters, digits, underscore). Raw strings keep the backslashes as regex
# escapes instead of invalid Python string escapes, and re.search stops at the first hit
# rather than collecting every match just to test for existence.
LsWordsWithPunct = [s for s in LsWordsWS if re.search(r'\W', s) and re.search(r'\w', s)]
print(LsWordsWithPunct)
print(f'Tokens with special symbols: {len(LsWordsWithPunct)}, fraction of all tokens: {len(LsWordsWithPunct)/len(LsWordsWS):.3}, unique: {len(set(LsWordsWithPunct))}')

Tokens with special symbols: 3412, fraction of all tokens: 0.195, unique: 2568


In [None]:
# Spacy's tokens of mixed letters and non-letters
# A token qualifies when it holds at least one non-word character (\W) and at least one word
# character (\w: letters, digits, underscore). Raw strings keep the backslashes as regex
# escapes instead of invalid Python string escapes, and re.search stops at the first hit
# rather than collecting every match just to test for existence.
LsWordsWithPunct = [s for s in LsWordsSpacy if re.search(r'\W', s) and re.search(r'\w', s)]
print(LsWordsWithPunct)
print(f'Tokens with special symbols: {len(LsWordsWithPunct)}, fraction of all tokens: {len(LsWordsWithPunct)/len(LsWordsSpacy):.3}, unique: {len(set(LsWordsWithPunct))}')

["'s", 'partici-', 'scan-', "'ve", '/photocopy', '8bit/16bit/24bit/', 'mad-celt@cwis.unomaha.edu', 'sunsite.unc.edu', '/pub', '141.214.4.135', "n't", "n't", "n't", "'ll", 'swdsrv.edvz.univie.ac.at', '/pc', '-rw', 'r--', '18:00', 'uvesa31.zip', 'plaza.aarnet.edu.au', '/micros', '-r', 'r--', '00:00', 'uvesa31.zip', '/micros', '-r', 'r--', '19:00', 'uvesa31.zip', 'godzilla.cgl.rmit.oz.au', '/kjb', '-rw', 'r--', '15:03', 'uvesa32.zip', 'nic.switch.ch', '/mirror', '-rw', 'r--', '20:00', 'uvesa31.zip', '/software', '-rw', 'r--', '20:00', 'uvesa31.zip', 'ipc1.rvs.uni', 'hannover.de', '/pub', '-rw', 'r--', '17:08', 'uvesa31.zip', 'sun0.urz.uni', 'heidelberg.de', '/pub', '-rw', 'r--', '19:00', 'uvesa31.zip', 'athene.uni-paderborn.de', '/pcsoft', '-rw', 'r--', '18:00', 'uvesa31.zip', 'compute1.cc.ncsu.edu', '/mirrors', '-rw', 'r--', '19:00', 'uvesa31.zip', 'rigel.acs.oakland.edu', '/pub', '-rw', 'r--', '19:00', 'uvesa31.zip', 'pc.usl.edu', '/pub', 'video.and.graphics', '-rw', 'r--', '10:41', 'uv

In [None]:
# Another way to measure the quality of resulting word tokens:
# collect the tokens that are absent from the Brown Corpus vocabulary (exact, case-sensitive match).
# The whitespace-split list LsWordsWS is used here; the name LsWords was never defined in this
# notebook, so the cell failed with a NameError.
LsWordsWithPunct2 = [s for s in LsWordsWS if s not in SsBrownVcb]
print(LsWordsWithPunct2)
print(f'Tokens with special symbols: {len(LsWordsWithPunct2)}, fraction of all tokens: {len(LsWordsWithPunct2)/len(LsWordsWS):.3}, unique: {len(set(LsWordsWithPunct2))}')

NameError: ignored

In [None]:
# Measure the quality of resulting (lower-cased) word tokens:
# collect the tokens whose lower-cased form is absent from the Brown Corpus vocabulary.
# The whitespace-split list LsWordsWS is used here; the name LsWords was never defined in this
# notebook, so the cell would fail with a NameError.
LsWordsWithPunct2 = [s for s in LsWordsWS if s.lower() not in SsBrownVcb]
print(LsWordsWithPunct2)
print(f'Tokens with special symbols: {len(LsWordsWithPunct2)}, fraction of all tokens: {len(LsWordsWithPunct2)/len(LsWordsWS):.3}, unique: {len(set(LsWordsWithPunct2))}')

# **Ways to improve quality of parsed words**

Apply these as needed. The order of the steps affects both computational cost and model performance.

1. Remove structured tags (HTML, XML, CSS, JSON,...)
1. Tokenize text into words or sentences
1. Remove accented characters
1. Expand contractions
1. Remove special characters
1. Lower-case documents
1. Text correction
  1. Repeating characters
  1. Spelling
1. Stemming
1. Lemmatization
1. Stopwords removal
1. Replacing emojis
1. Parts of speech (POS) tagging improves lemmatizers ([e.g.](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#wordnetlemmatizerwithappropriatepostag))

## **Removal of duplicated characters**


In [None]:
def DedupTokens(LsTokens=['NNNo', 'Noooo', 'NoOoOoOo']):
  '''Collapse repeated characters in each token until WordNet recognizes the word.

  Each pass removes one duplicated character per run of word characters. A token is returned
  as soon as it has a WordNet synset, or once no doubled character is left to remove.
  Returns: a new list with one (possibly shortened) entry per input token.'''
  # compiled once and shared by all tokens: group 2 is a character immediately followed by a copy of itself
  rxDoubled = re.compile(r'(\w*)(\w)\2(\w*)')
  def collapse(sToken):
    while True:
      if wordnet.synsets(sToken): return sToken        # found in the WordNet lexicon: done
      sShorter = rxDoubled.sub(r'\1\2\3', sToken)      # rebuild from groups 1,2,3, leaving the duplicate out
      if sShorter == sToken: return sShorter           # nothing was removed: stopping criterion
      sToken = sShorter
  return list(map(collapse, LsTokens))  # fix each word in the list
DedupTokens()

Some drawbacks

* It treats different letter casings as different characters
* It can only fix words found in WordNet. `subbookkeeper` is not recognized

In [None]:
# Tokens that contain doubled letters, including 'subbookkeeper', which WordNet does not recognize
DedupTokens(['bittter', 'bassoonn', 'bookkeeper', 'subbookkeeper'])

In [None]:
# Apply the de-duplication to a whole phrase: tokenize, fix each token, re-join
sPhrase = 'Learning at eCornell and Cornell is realllllyyy amaaazingggg'
sFixedPhrase = DedupTokens(nltk.word_tokenize(sPhrase))   # list of fixed tokens, one per word token
' '.join(sFixedPhrase)   # joined back with single spaces; echoed as the cell result

# **Spell correction**

* The TextBlob library has many useful NLP functions, including spelling correction.

In [None]:
# NOTE(review): the setup cell pip-installs only `contractions`; textblob is presumably preinstalled in Colab -- verify
from textblob import Word
print(Word('fianlly').correct())   # corrected spelling of the misspelled word

* `spellcheck()` returns the identified candidate together with its score

In [None]:
print(Word('fianlly').spellcheck())   # candidate correction(s) with their scores

* Multiple candidates are also scored

In [None]:
print(Word('flaot').spellcheck())   # a misspelling with several scored candidates

* An example of corrected text

In [None]:
# Correct a scrambled sentence one whitespace-separated chunk at a time.
# Because the text is split on whitespace only, punctuation stays attached to the chunk it follows.
sScrambled = "The ordirng of leeters in a wrod is not imporant. As loang as the frist and lsat lteters are in thier place, we can stll reead the txet."
LsCorrected = [Word(s).correct() for s in sScrambled.split()]
print(' '.join(LsCorrected))   # corrected chunks joined back with single spaces

In [None]:
# NOTE(review): this cell repeats the first spell-correction cell (same import, same call) -- confirm it is still needed
from textblob import Word
print(Word('fianlly').correct())