# Basic NLP (Tokenization, Lemmatization, etc.)

### Imports

In [None]:
"""
Import Statements
"""

import pandas as pd

# Base
from collections import Counter
import re

# Plotting
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

# NLP Libraries
import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer

## Tokenization

In [None]:
# Load the spaCy "en_core_web_lg" pipeline
nlp = spacy.load("en_core_web_lg")

# spaCy's default stop words plus a few extra tokens to ignore
STOP_WORDS = nlp.Defaults.stop_words | {
    'coffee', ' ', 'i', 'it', "it's", 'it.', 'the', 'this', 'place',
}

# Tokenizer built from the pipeline's vocab only (no extra rules passed in)
tokenizer = Tokenizer(nlp.vocab)


def tokenize(dataframe, column='full_review_text'):
    """Tokenize one text column of a DataFrame, dropping stop words.

    Each text in ``dataframe[column]`` is run through the module-level
    ``tokenizer`` in batches of 500. Every token is lower-cased and kept
    only if its lower-cased form is not in ``STOP_WORDS``.

    Args:
        dataframe: Object indexable by column name (e.g. a pandas DataFrame).
        column: Name of the column holding the raw text. Defaults to
            'full_review_text'; pass your own column name for other data.

    Returns:
        A list with one list of lower-cased token strings per document.
    """
    tokens = []

    for doc in tokenizer.pipe(dataframe[column], batch_size=500):
        # Lower-case once per token, then filter on the lower-cased form
        lowered = (token.text.lower() for token in doc)
        tokens.append([word for word in lowered if word not in STOP_WORDS])

    return tokens

# Store the token lists on the DataFrame (swap `df` for your own DataFrame)
df['tokens'] = tokenize(df)

## Lemmatization

In [None]:
def get_lemmas(text):
    """Return the lemmas of ``text``, skipping stop words, punctuation and pronouns.

    The text is parsed with the module-level spaCy pipeline ``nlp``.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.pos_ == 'PRON')
    ]

In [None]:
# Example of get_lemmas: lemmatize every entry of a text column
# (swap `df` / 'reviews.text' for your own DataFrame and column name)
df['lemmas'] = df['reviews.text'].apply(get_lemmas)

## Vector Representations
### New Imports

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
# Count Vectorizer
vect = CountVectorizer(stop_words='english')

# List of strings from the 'description' column of the DataFrame `data`
# (swap in your own DataFrame / column name)
data0 = data['description'].tolist()

# Learn the vocabulary and build the document-term matrix in one pass
# (requires an iterable of strings)
dtm = vect.fit_transform(data0)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

dtm = pd.DataFrame(dtm.todense(), columns=feature_names)

In [None]:
# TF-IDF Vectorizer
# NOTE: scikit-learn calls `tokenizer` with ONE document string at a time and
# expects a list of tokens back. `get_lemmas` has that shape; `tokenize`
# expects a whole DataFrame and would fail when handed a single string.
tfidf = TfidfVectorizer(stop_words='english',
                        ngram_range=(1,2),
                        max_df=.97,
                        min_df=3,
                        tokenizer=get_lemmas)

# Create a vocabulary and get TF-IDF weights per document
dtm = tfidf.fit_transform(data0)

# Get feature names to use as dataframe column headers.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

dtm = pd.DataFrame(dtm.todense(), columns=feature_names)

# View Feature Matrix as DataFrame
dtm.head()

## Document Classification

In [None]:
# Basic NLP pipeline
from sklearn.ensemble import RandomForestClassifier  # was used below without being imported
from sklearn.pipeline import Pipeline

# TF-IDF features over unigrams and bigrams, English stop words removed
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
clf = RandomForestClassifier()

# Vectorizer feeds the classifier
pipe = Pipeline([('vect', vect), ('clf', clf)])

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

# Singular value decomposition
svd = TruncatedSVD(n_components=100, # Just here for demo.
                   algorithm='randomized',
                   n_iter=10)


# LSI: Latent semantic indexing (vectorize, then reduce with SVD)
lsi = Pipeline([('vect', vect), ('svd', svd)])


# Classifier for the final step. `rfc` was previously referenced here before
# any definition, so it is created in this cell.
rfc = RandomForestClassifier()

# Pipe
pipe = Pipeline([('lsi', lsi), ('clf', rfc)])

### Word Embeddings

In [None]:
# Run a sample sentence through the loaded spaCy pipeline `nlp`
doc = nlp("Two bananas in pyjamas")

In [None]:
def get_word_vectors(docs):
    """Return one spaCy document vector per text in ``docs``.

    Args:
        docs: Iterable of text strings.

    Returns:
        A list holding the ``.vector`` of each parsed document, in input order.
    """
    # nlp.pipe processes the texts as a stream in batches instead of
    # invoking the full pipeline separately for every string.
    return [doc.vector for doc in nlp.pipe(docs)]

In [None]:
from sklearn.ensemble import RandomForestClassifier  # was used below without being imported

# One document vector per text in data.data
X = get_word_vectors(data.data)

rfc = RandomForestClassifier()

# Fit on the document vectors with data.target as the labels
rfc.fit(X, data.target)

## Topic Modeling
### Imports

In [None]:
import gensim

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis.gensim

In [None]:
# `tokens` is a list of token lists (one list of strings per document)
id2word = corpora.Dictionary(tokens)

# Let's remove extreme values from the vocabulary
id2word.filter_extremes(no_below=5, no_above=0.95)

# Bag-of-words representation of every document
corpus = list(map(id2word.doc2bow, tokens))

# Fit the LDA topic model on the bag-of-words corpus
lda = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
    passes=10,
    workers=8,
    random_state=723812,
)

In [None]:
# Turn on pyLDAvis notebook output
pyLDAvis.enable_notebook()

# Prepare the visualization from the fitted model, corpus and dictionary.
# NOTE(review): the result is not assigned; in a notebook it is presumably only
# rendered when this call is the last expression of its cell — confirm.
# NOTE(review): newer pyLDAvis releases appear to ship this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.gensim.prepare(lda, corpus, id2word)

# What the model returns for each bag-of-words document in `corpus`
# (presumably (topic_id, weight) pairs, given how `update` indexes them)
distro = [lda[d] for d in corpus]

def update(doc, num_topics=15):
    """Expand a sparse topic distribution into a dense ``{topic_id: weight}`` dict.

    Args:
        doc: Iterable of ``(topic_id, weight)`` pairs for one document.
        num_topics: Number of topics to pre-fill with weight 0. Defaults to
            15; pass the model's topic count when it differs.

    Returns:
        A dict with a key for every topic id in ``range(num_topics)``, holding
        the weight from ``doc`` where present and 0 otherwise.
    """
    d_dist = dict.fromkeys(range(num_topics), 0)
    for topic_id, weight in doc:
        d_dist[topic_id] = weight
    return d_dist
    
# Dense per-document topic dicts, one per entry in `distro`
new_distro = list(map(update, distro))