## NLP Code

### Lemmatizer
Used to reduce words to their base (dictionary) form, giving text a more uniform pattern by eliminating variations such as plurals.

In [5]:
import nltk
from nltk.stem import WordNetLemmatizer

# One reusable WordNet-backed lemmatizer instance.
lemmatizer = WordNetLemmatizer()

# NOTE(review): lemmatize() is given the entire phrase as a single token, which
# is why the recorded output is the unchanged input. To lemmatize a sentence,
# split it into words first and lemmatize each word separately.
# pos="n" (noun) is spelled out here; it is the library default.
lemmatizer.lemmatize("variable to be lematized", pos="n")

'variable to be lematized'

### Stemming

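Similar to lemmatizing, but works by stripping suffixes with heuristic rules rather than looking words up, so the result is not always a real word.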

In [16]:
from nltk.stem.porter import PorterStemmer

# One reusable Porter stemmer instance.
p_stemmer = PorterStemmer()

# NOTE(review): stem() is given the entire phrase as a single token, so only
# the end of the string is affected (see the recorded output, where just the
# trailing "traveler" loses its suffix). To stem every word, split the text
# into words first and stem each one individually.
p_stemmer.stem("going willing traveler")

'going willing travel'

### Tokenizer

Used to break text strings down into a list of tokens: words, numbers, and symbols.

In [10]:
import nltk
from nltk.tokenize import RegexpTokenizer

# Token pattern, tried left to right at each position:
#   \w+        a run of word characters (letters, digits, underscore)
#   \$[\d\.]+  a dollar sign followed by digits and/or dots (e.g. prices)
#   \S+        any other run of non-whitespace characters
# A raw string is used so the backslashes reach the regex engine untouched;
# in a normal string literal "\w", "\d", "\S" etc. are invalid escape
# sequences and trigger a warning on recent Python versions.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

# Lower-case the text first so tokens are case-normalised.
tokenizer.tokenize('vairable. to be tokenized #tokens, $2020'.lower())

['vairable', '.', 'to', 'be', 'tokenized', '#tokens,', '$2020']

In [13]:
# \s+ matches runs of whitespace (spaces, tabs, newlines). With gaps=True the
# pattern describes the separators between tokens rather than the tokens
# themselves, so the text is split on whitespace and punctuation stays
# attached to the neighbouring word.
# A raw string is used because "\s" is an invalid escape sequence in a normal
# string literal and triggers a warning on recent Python versions.
tokenizer_2 = RegexpTokenizer(r'\s+', gaps=True)
tokenizer_2.tokenize('vairable. to be tokenized #tokens, $2020'.lower())

['vairable.', 'to', 'be', 'tokenized', '#tokens,', '$2020']

### RegEx

Used to recognize and extract patterns in text.

In [7]:
import regex as re

# \d+ matches a run of one or more consecutive digits.
# A raw string is used because "\d" is an invalid escape sequence in a normal
# string literal and triggers a warning on recent Python versions.
#
# NOTE: iterating over a str yields one character at a time, so every
# findall() call below only ever sees a single character. A multi-digit
# number such as "123" is therefore reported as three separate one-digit
# matches. To capture whole numbers, call re.findall(r'\d+', text) once on
# the full string instead of looping over it.
for i in '1 sample block of 2 text 123':
    print(re.findall(r'\d+', i), i)

['1'] 1
[]  
[] s
[] a
[] m
[] p
[] l
[] e
[]  
[] b
[] l
[] o
[] c
[] k
[]  
[] o
[] f
[]  
['2'] 2
[]  
[] t
[] e
[] x
[] t
[]  
['1'] 1
['2'] 2
['3'] 3


### Count Vectorizer with Pipeline, GridSearch, Log Reg
CountVectorizer performs most of the cleaning steps above on its own and produces a bag-of-words representation of the text.

In [17]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Create sklearn's bag-of-words tool. The vocabulary is capped at 5,000
# features; the remaining keyword arguments are written out explicitly so the
# available knobs (analyzer, tokenizer, preprocessor, stop words) are visible.
cvec = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Two-step pipeline: vectorize the raw text into token counts, then fit a
# logistic regression on those counts.
pipe = Pipeline(steps=[
    ("cvec", CountVectorizer()),
    ("lr", LogisticRegression(solver="lbfgs")),
])

# Hyperparameter grid for the search. Keys follow the "<step name>__<param>"
# convention, so every entry here tunes the "cvec" step of the pipeline.
pipe_params = {
    "cvec__max_features": [2_000, 3_000, 4_000, 5_000],
    "cvec__min_df": [2, 3],
    "cvec__max_df": [0.90, 0.95],
    "cvec__ngram_range": [(1, 1), (1, 2)],
}

In [21]:
# Grid search: 5-fold cross-validation over every combination in pipe_params.
# NOTE(review): X_train / y_train are not defined anywhere in the visible
# notebook (the recorded output of this cell is a NameError). Presumably they
# are meant to come from train_test_split on the text and label data --
# TODO define them before running this cell.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)

# Best mean cross-validated score, then the parameter combination behind it.
print(gs.best_score_)
gs.best_params_

NameError: name 'X_train' is not defined