# Saving and Loading Preprocessing Pipeline Demo

In [1]:
import sys
import os
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Put the parent of the current working directory first on the module search
# path so that `nlp` can be imported below (presumably the notebook lives in a
# sub-folder of the repository that contains the `nlp` package -- TODO confirm).
sys.path.insert(0, os.path.abspath(os.path.join('..')))
import nlp.preprocessing as pre

## Read Data

In [2]:
# Wrap the raw 20 Newsgroups training documents in a single-column frame
# and keep only the first 100 rows for this demo.
df = (
    pd.DataFrame({'text': fetch_20newsgroups(subset='train').data})
    .head(100)
)
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


## Object Creation and Fit
The `Pipeline` class is designed to be the central object of the preprocessing library: it is created from a configuration dictionary and then fitted on the raw texts.

In [3]:
#############################################################
# The configuration below defines the way we process the data.
# Every line indicates a preprocessing action to perform.
# Feel free to modify individual values from 'True' to 'False'
# Entries marked "mask can be used" presumably also accept a replacement
# string instead of True, as done for 'phone_number' below -- TODO confirm.

config = {
        'appos': True,
        'slang': True,
        'sep_digit_text': True,
        'emoticons': False,
        'emoticons_del': True,
        'url': True, 
        'email': True,  # mask can be used
        'html': True,
        'proper_noun': True, # mask can be used
        'phone_number': 'PHONE_NUMBER', # mask can be used
        'repeated_chars': True,
        'single_char': False,
        'punct': True,
        'number': True, # mask can be used
        'extra_space': True,
        'stop_words': False,
        'lemmas': False
    }

#############################################################

# Create preprocessing pipeline
pp = pre.Pipeline(config, 
                  n_process=2, # Two processes requested (i.e. NOT single-process)
                  progress_bar=True # Display progress bar
                 )
print(pp)
# Run the pipeline over the 'text' column of the frame
pp.fit(df['text'])
print(f'Number of documents processed {len(pp)}')
# Keep the processed text of document 0 as the reference value for the
# save/load comparisons made later in the notebook
text0 = pp.text(0)
print(text0)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x000001E856010720>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x000001E85603E590>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x000001E855FE0880>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x000001E855FE0A00>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x000001E8560595C0>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x000001E8560C3040>), ('custom_extension', <nlp.preprocessing._CustomExtension object at 0x000001E85559F5E0>)]


HBox(children=(HTML(value='Running Spacy'), FloatProgress(value=0.0), HTML(value='')))


Number of documents processed 100
From where 's my thing Subject WHAT car is this Nntp Posting Host Organization University of Maryland College Park Lines I was wondering if anyone out there could enlighten me on this car I saw the other day It was a door sports car looked to be from the late s/ early s. It was called a Bricklin The doors were really small In addition the front bumper was separate from the rest of the body This is all I know If anyone can tellme a model name engine specs years of production where this car is made history or whatever information you have on this funky looking car please e mail Thanks IL brought to you by your neighborhood 


In [5]:
import json

# Serialise the preprocessing configuration and store it as 'config.json'
# in the current working directory.
with open('config.json', 'w') as config_file:
    config_file.write(json.dumps(config))

## Saving object

In [4]:
# Persist the fitted pipeline to 'test.pkl'
# NOTE(review): the '.pkl' extension suggests a pickle file -- confirm against Pipeline.save
pp.save('test.pkl')

In [5]:
# Check that saving did not alter the pipeline: re-read document 0 and
# compare it with the reference captured right after fitting (text0)
text1 = pp.text(0)
text0 == text1

True

In [6]:
# Show a handful of pipeline settings, one per line
for attr_name in ('phone_number', 'n_process', 'lemmas'):
    print(getattr(pp, attr_name))

PHONE_NUMBER
2
False


In [7]:
# Delete the in-memory pipeline object so that the `pp` used from here on
# can only come from the file saved earlier
del pp

## Loading object

In [8]:
# Rebuild the pipeline from the file written by pp.save('test.pkl').
# NOTE(review): if loading is pickle-based (the extension suggests so), only
# load files from trusted sources -- confirm against Pipeline.load.
pp = pre.Pipeline.load('test.pkl')
print(pp)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x000001B453029220>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x000001B452D13180>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x000001B458172340>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x000001B458172940>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x000001B45319BAC0>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x000001B4530B7300>), ('custom_extension', <nlp.preprocessing._CustomExtension object at 0x000001B457E39070>)]


In [9]:
# Show the same pipeline settings again, now read from the reloaded object
for attr_name in ('phone_number', 'n_process', 'lemmas'):
    print(getattr(pp, attr_name))

PHONE_NUMBER
2
False


In [10]:
# Document 0 read from the reloaded pipeline is compared with both earlier
# snapshots: text0 (after fitting) and text1 (after saving)
text2 = pp.text(0)
text0 == text1 == text2

True

In [8]:
import nlp.embeddings as emb

In [11]:
# Look the embedding class up on the module by its name. Despite the
# variable's name, this holds the class object itself, not a string.
class_name = getattr(emb, 'BagOfPOSEmbeddings')

In [12]:
# Instantiate the looked-up class with no arguments and display the instance
obj = class_name()
obj

<nlp.embeddings.BagOfPOSEmbeddings at 0x1e85e1addc0>