# PGM Exercise - Preprocessing the training data

### Setup
Uncomment and run the below block for the first time, to install the required packages

In [35]:
#!conda install -c conda-forge scikit-learn -y
#!conda install spacy -y
#!conda install tqdm -y
#!conda install pandas -y
#!python -m spacy download en_core_web_sm

In [36]:
from sklearn.datasets import fetch_20newsgroups
import spacy
import tqdm
from collections import Counter
import pandas as pd
import re

In [4]:
data = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
nlp = spacy.load("en_core_web_sm")

print('Number of docs:', len(data.data))

Number of docs: 11314


## Convert docs to list of tokens

In [18]:
unprocessed_docs = data['data']
def pre_process_docs_before_vocab(unprocessed_docs):
    docs = []
    patterns_and_replacements = {
        '<EMAIL>' : re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')
    }
    
    for udoc in tqdm.tqdm_notebook(nlp.pipe(unprocessed_docs, batch_size=64), total=len(unprocessed_docs)):
        doc = []
        for token in udoc:
            if token.is_alpha:
                doc.append(token.text.lower())
            elif token.is_punct:
                # since punctuation would be one of the syntactic classes
                doc.append(token.text[0]) # why just text[0]? to handle cases like '!!!' or '...'
            elif token.is_space:
                # all space char including '\n' provides no meaning 
                continue
            elif token.is_digit:
                doc.append('<NUM>') 
            elif token.is_currency:
                doc.append('<CUR>')
            else:
                for replacement, pattern in patterns_and_replacements.items():
                    if pattern.match(token.text):
                        doc.append(replacement)
                        break
                else:
                    doc.append('<UNK>')
        docs.append(doc)
    return docs

docs = pre_process_docs_before_vocab(unprocessed_docs)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for udoc in tqdm.tqdm_notebook(nlp.pipe(unprocessed_docs, batch_size=64), total=len(unprocessed_docs)):


  0%|          | 0/11314 [00:00<?, ?it/s]

## Build vocabulary 

We will also remove the words that occur only once, since there is a good chance that those are typos

In [54]:
def build_vocab(docs, rare_words_threshold): 
    vocab = Counter()
    for doc in tqdm.tqdm_notebook(docs):
        vocab.update(doc)

    # ignore words that are rare
    vocab = Counter({key: count for key, count in vocab.items() if count > rare_words_threshold})
    return vocab

vocab = build_vocab(docs, rare_words_threshold=1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm.tqdm_notebook(docs):


  0%|          | 0/11314 [00:00<?, ?it/s]

In [57]:
def remove_out_of_vocab_tokens(docs, vocab):
    oov_count = 0
    for doc in docs:
        for ind, token in enumerate(doc):
            if token not in vocab:
                doc[ind] = '<OOV>'
                print("YES")
                raise
                oov_count += 1
    vocab['<OOV>'] = oov_count
    return docs, vocab
                

docs, vocab = remove_out_of_vocab_tokens(docs, vocab)

## View the processed data

In [58]:
def compare_text_after_pre_processing(index, orig_docs, pre_processed_docs):
    print("------- Original -------")
    print(orig_docs[index])
    print("\n\n\n------- After preprocessing -------")
    print(' '.join(pre_processed_docs[index]))

print("Vocab size: ", len(vocab))
print("\n\nExample after preprocessing")
compare_text_after_pre_processing(index=0, orig_docs=data['data'], pre_processed_docs=docs)





most_common = vocab.most_common()[:30]
least_common = vocab.most_common()[-30:]

print("\n\n\n------- MOST COMMON ITEMS IN VOCAB -------")
print(pd.DataFrame(most_common, columns=['Word', 'Count']))


print("\n\n\n------- Least COMMON ITEMS IN VOCAB -------")
print(pd.DataFrame(least_common[::-1], columns=['Word', 'Count']))

Vocab size:  33300


Example after preprocessing
------- Original -------
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.



------- After preprocessing -------
i was wondering if anyone out there could enlighten me on this car i saw the other day . it was a <NUM> - door sports car , looked to be from the late <UNK> early <UNK> . it was called a bricklin . the doors were really small . in addition , the front bumper was separate from the rest of the body . this is all i know . if anyone can <OOV> a model name , engine specs , years of production , where this car is 