In [1]:
from nlp_preprocessor import nlp_preprocessor
import pandas as pd

# Testing the Preprocessor Class

In [13]:
# Fit the default preprocessor on a tiny mixed-case corpus and display the
# resulting document-term matrix with one labelled column per token.
corpus = ['BOB the builder', 'is a strange', 'caRtoon type thing']
nlp = nlp_preprocessor()
nlp.fit(corpus)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back to the old one
# so this cell runs on either side of that change.
vectorizer = nlp.vectorizer
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

pd.DataFrame(nlp.transform(corpus).toarray(), columns=feature_names)

Unnamed: 0,bob,builder,cartoon,is,strange,the,thing,type
0,1,1,0,0,0,1,0,0
1,0,0,0,1,1,0,0,0
2,0,0,1,0,0,0,1,1


In [15]:
# Dense view of the transformed corpus, without the DataFrame column labels.
corpus_counts = nlp.transform(corpus)
corpus_counts.toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 1]])

# Testing the Supervised Learning Class

In [3]:
from supervised_nlp import supervised_nlp

In [4]:
from sklearn import datasets

# The three newsgroups used by the demos that follow.
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball']

# Both splits are fetched with identical options; only `subset` differs.
shared_fetch_options = {
    'categories': categories,
    'remove': ('headers', 'footers', 'quotes'),
}

ng_train = datasets.fetch_20newsgroups(subset='train', **shared_fetch_options)
ng_train_data, ng_train_targets = ng_train.data, ng_train.target

ng_test = datasets.fetch_20newsgroups(subset='test', **shared_fetch_options)
ng_test_data, ng_test_targets = ng_test.data, ng_test.target

In [5]:
from sklearn.naive_bayes import MultinomialNB

# Give the classifier its own preprocessor instead of reusing the `nlp`
# object that an earlier cell fitted on the toy corpus. This keeps the cell
# independent of which cells ran before it and leaves `nlp` untouched.
# NOTE(review): assumes supervised_nlp.fit fits the preprocessor it is
# given -- confirm against supervised_nlp.
newsgroup_preprocessor = nlp_preprocessor()

# Train Multinomial Naive Bayes on the training split and report the score
# on the held-out test split.
nlp_pipe = supervised_nlp(MultinomialNB(), newsgroup_preprocessor)
nlp_pipe.fit(ng_train_data, ng_train_targets)
nlp_pipe.score(ng_test_data, ng_test_targets)

0.9076923076923077

# Testing the Topic Modeling Class

In [6]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

from topic_modeling_nlp import topic_modeling_nlp


# Count vectorizer that drops English stop words and keeps only tokens made
# of two or more letters a-z (raw string, same pattern value as before).
cv = CountVectorizer(stop_words='english', token_pattern=r'\b[a-z][a-z]+\b')
cleaning_pipe = nlp_preprocessor(vectorizer=cv)

# TruncatedSVD uses a randomized solver by default, so pin random_state to
# make the extracted topics reproducible from run to run.
svd = TruncatedSVD(n_components=15, random_state=42)
topic_chain = topic_modeling_nlp(svd, preprocessing_pipeline=cleaning_pipe)

topic_chain.fit(ng_train_data)
topic_chain.print_topics()

Topic #0: jpeg image edu file graphics gif images format color pub
Topic #1: edu graphics pub data mail ray ftp send com objects
Topic #2: jesus god atheists matthew people atheism does religious said religion
Topic #3: image data processing analysis software available display tools tool user
Topic #4: jesus matthew prophecy messiah psalm isaiah david said lord israel
Topic #5: argument fallacy conclusion example true argumentum ad premises false valid
Topic #6: data available ftp sgi grass vertex pci motecc model info
Topic #7: game year don good think hit won runs team home
Topic #8: god posting subject response typical information universe einstein bush evidence
Topic #9: den radius double theta sqrt pi sin rtheta pole pt
Topic #10: program read menu bits display change file pressing want don
Topic #11: lost program won cubs atheism game display menu bits home
Topic #12: game runs second hit run graphics cubs home sunday win
Topic #13: atheism alt faq send edu usenet news files otis

# Testing Saving and Loading a Pipeline

In [7]:
from nltk.stem import PorterStemmer

# Build a preprocessor that carries a Porter stemmer, save it under the name
# 'save_pipeline', and show the stemmer's type for comparison further down.
porter = PorterStemmer()
nlp = nlp_preprocessor(stemmer=porter)
nlp.save_pipe('save_pipeline')
type(nlp.stemmer)

nltk.stem.porter.PorterStemmer

In [9]:
# A preprocessor built with default arguments; show the stemmer type it
# starts out with.
nlp2 = nlp_preprocessor()
default_stemmer = nlp2.stemmer
type(default_stemmer)

NoneType

In [11]:
# Load the pipeline saved as 'save_pipeline' into nlp2 and show the stemmer
# type afterwards.
nlp2.load_pipe('save_pipeline')
restored_stemmer = nlp2.stemmer
type(restored_stemmer)

nltk.stem.porter.PorterStemmer