In [1]:
from nlp_preprocessor import nlp_preprocessor
import pandas as pd

# Testing the Preprocessor Class

In [3]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

# Tiny mixed-case corpus for eyeballing the preprocessor's output.
corpus = ['BOB the builder', 'is a strange', 'caRtoon type thing']
nlp = nlp_preprocessor(stemmer = lemma.lemmatize)
nlp.fit(corpus)

# scikit-learn 1.0 deprecated `get_feature_names()` and 1.2 removed it in favour
# of `get_feature_names_out()`. Prefer the new accessor and fall back to the old
# one only when running against a pre-1.0 release.
# NOTE(review): assumes nlp.vectorizer is a scikit-learn vectorizer (as with the
# CountVectorizer passed in the topic-modeling cell) -- confirm the default.
vectorizer = nlp.vectorizer
feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# One row per document, one column per vocabulary term.
pd.DataFrame(nlp.transform(corpus).toarray(), columns=feature_names())

Unnamed: 0,bob,builder,cartoon,is,strange,the,thing,type
0,1,1,0,0,0,1,0,0
1,0,0,0,1,1,0,0,0
2,0,0,1,0,0,0,1,1


In [4]:
# Transform the toy corpus again and show the result as a plain dense array.
doc_term_matrix = nlp.transform(corpus)
doc_term_matrix.toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 1]])

# Testing the Supervised Learning Class

In [5]:
from supervised_nlp import supervised_nlp

In [6]:
from sklearn import datasets

# Restrict the 20 Newsgroups corpus to three newsgroups.
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball']

# Options shared by the train and test downloads: the same categories, with
# headers, footers and quotes removed from every post.
newsgroup_options = dict(categories=categories,
                         remove=('headers', 'footers', 'quotes'))

ng_train = datasets.fetch_20newsgroups(subset='train', **newsgroup_options)
ng_train_data, ng_train_targets = ng_train.data, ng_train.target

ng_test = datasets.fetch_20newsgroups(subset='test', **newsgroup_options)
ng_test_data, ng_test_targets = ng_test.data, ng_test.target

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Pair a multinomial Naive Bayes classifier with the `nlp` preprocessor from the
# earlier cell, fit on the training split, and display the score obtained on
# the held-out test split.
classifier = MultinomialNB()
nlp_pipe = supervised_nlp(classifier, nlp)
nlp_pipe.fit(ng_train_data, ng_train_targets)
test_score = nlp_pipe.score(ng_test_data, ng_test_targets)
test_score

0.9113122171945701

# Testing the Topic Modeling Class

In [8]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

from topic_modeling_nlp import topic_modeling_nlp


# Count vectorizer that drops English stop words and only keeps tokens made of
# two or more lowercase ASCII letters.
cv = CountVectorizer(stop_words='english', token_pattern='\\b[a-z][a-z]+\\b')
cleaning_pipe = nlp_preprocessor(vectorizer=cv, stemmer=lemma.lemmatize)

# TruncatedSVD uses a randomized solver by default, so without a fixed seed the
# printed topics can differ between runs. Pin `random_state` so a fresh
# Restart & Run All reproduces the same topics.
svd = TruncatedSVD(n_components=15, random_state=42)
topic_chain = topic_modeling_nlp(svd, preprocessing_pipeline=cleaning_pipe)

topic_chain.fit(ng_train_data)
topic_chain.print_topics()

Topic #0: image jpeg file edu gif format color data pub ftp
Topic #1: edu pub data graphics mail ftp ray send graphic com
Topic #2: jesus god wa atheist matthew people ha atheism christian prophecy
Topic #3: image data processing tool analysis software user available using sun
Topic #4: jesus matthew prophecy wa psalm messiah day isaiah david prophet
Topic #5: argument fallacy conclusion premise example true argumentum ad false valid
Topic #6: data available ftp grass sgi vertex package model pci motecc
Topic #7: wa game year team hit run don think good win
Topic #8: posting response god subject typical information universe einstein wa bush
Topic #9: den radius double theta sqrt pi sin rtheta pt pole
Topic #10: program read think menu don bit change file want pressing
Topic #11: program menu file read display game pressing change bit home
Topic #12: won lost idle new york sox year san american chicago
Topic #13: atheism alt faq send edu usenet news article answers newsgroup
Topic #14: 

# Testing Saving and Loading a Pipeline

In [9]:
from nltk.stem import PorterStemmer

# Build a preprocessor around a Porter stemmer's bound `stem` method, pass it
# to save_pipe under the name 'save_pipeline', then display the stemmer's type
# as the reference value for the load check that follows.
porter = PorterStemmer()
nlp = nlp_preprocessor(stemmer=porter.stem)
nlp.save_pipe('save_pipeline')
type(nlp.stemmer)

method

In [10]:
# Control case: a preprocessor built with no arguments. The displayed type shows
# what its `stemmer` attribute holds before anything is loaded into it.
nlp2 = nlp_preprocessor()
type(nlp2.stemmer)

NoneType

In [11]:
# Load the pipeline stored under 'save_pipeline' into the fresh instance, then
# re-check the stemmer's type to see whether the saved stemmer came back.
# NOTE(review): the serialization format behind load_pipe is not visible here;
# if it is pickle-based, only load files from a trusted source -- confirm.
nlp2.load_pipe('save_pipeline')
type(nlp2.stemmer)

method