# Setup

Run the cell below

In [3]:
import pandas as pd
from utils.nlp_basics import *

# Outline

1. Tokenization
2. Lemmatization
3. Stemming
4. Part-of-speech tagging
5. Stopwords

![Tokenization](./slides/tokenization.png)

# Let's play with the string sequence `cake_wikipedia`

## 1. Simplest tokenizer: split on spaces

Run the cell below. Here we split the sequence by spaces. How would you describe these tokens?

In [8]:
# The first few sentences from the wikipedia page on Cake https://en.wikipedia.org/wiki/Cake
cake_wikipedia = 'Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked. In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies.'

# calling .split() on a string will split the string on spaces
tokens = cake_wikipedia.split()
show_tokens(tokens)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55
Text,Cake,is,a,form,of,sweet,food,made,from,"flour,","sugar,",and,other,"ingredients,",that,is,usually,baked.,In,their,oldest,"forms,",cakes,were,modifications,of,"bread,",but,cakes,now,cover,a,wide,range,of,preparations,that,can,be,simple,or,"elaborate,",and,that,share,features,with,other,desserts,such,as,"pastries,","meringues,","custards,",and,pies.


## 2. Split on spaces and separate punctuation from words.

Run the cell below. How would you describe these tokens?

In [10]:
# nltk is a library that is open for anyone to use. 
# It stands for "natural language tool kit" and has many useful functions
from nltk.tokenize import word_tokenize

# We use nltk's function "word_tokenize"
tokens = word_tokenize(cake_wikipedia)

show_tokens(tokens)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66
Text,Cake,is,a,form,of,sweet,food,made,from,flour,",",sugar,",",and,other,ingredients,",",that,is,usually,baked,.,In,their,oldest,forms,",",cakes,were,modifications,of,bread,",",but,cakes,now,cover,a,wide,range,of,preparations,that,can,be,simple,or,elaborate,",",and,that,share,features,with,other,desserts,such,as,pastries,",",meringues,",",custards,",",and,pies,.


## 3. Split on syllables.

Run the cell below. How would you describe these tokens?

In [12]:
from utils.syllable import *

syllable_tokenize = SyllableTokenizer()
tokens = syllable_tokenize.tokenize(cake_wikipedia)

# Show table
show_tokens(tokens)

  " assigning as vowel: '{}'".format(c)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132
Text,Ca,ke,i,s a,,for,m o,f,swee,t,foo,d,ma,de,fro,m,flour,",",,su,gar,",",an,d ot,he,r in,gre,dients,",",t,ha,t i,s u,sual,ly,ba,ked,.,I,n t,hei,r ol,des,t,forms,",",,ca,ke,s,we,re,mo,di,fi,ca,tion,s o,f,bread,",",,bu,t,ca,ke,s,no,w,co,ve,r a,,wi,de,ran,ge,o,f,pre,pa,ra,tion,s t,ha,t,ca,n,be,sim,ple,o,r e,la,bo,ra,te,",",an,d t,ha,t s,ha,re,fea,tu,re,s,wit,h ot,he,r,des,ser,ts,suc,h a,s,pas,tries,",",,me,rin,gues,",",,cus,tards,",",an,d,pies.


## 4. Challenge: What are some tokenization considerations to make if you're working with tweets?

Try making a tokenizer that keeps hashtags with the # and user handles with the @.

In [13]:
tweet = 'Today, I am learning how to #tokenize with @AI4All!!!!!'

def tokenizer(string):
    
    ## Your code (use as many lines as you like)
    tokens = 
    
    
    return tokens


tokens = tokenizer(string)

SyntaxError: invalid syntax (<ipython-input-13-3a48b57a1af3>, line 6)

## Would tokenization in English look the same as other languages?

In [15]:
french = "C'est en effet tout à fait dans la ligne des positions que notre Parlement a toujours adoptées."


In [19]:
tokens = french.split()
show_tokens(tokens)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Text,C'est,en,effet,tout,à,fait,dans,la,ligne,des,positions,que,notre,Parlement,a,toujours,adoptées.


In [18]:
tokens = word_tokenize(french, language='french')
show_tokens(tokens)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Text,C'est,en,effet,tout,à,fait,dans,la,ligne,des,positions,que,notre,Parlement,a,toujours,adoptées,.


![Lemmatization](./slides/lemmas.png)

# Lemmatization

In [20]:
tokens = word_tokenize(cake_wikipedia)

import spacy

# Uses nlp pipeline from spacy to obtain linguistic features
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

doc = nlp("".join(cake_wikipedia))
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']

# Get lemmas
lemmas = [token.lemma_ for token in doc]

# Here we are making a list of original tokens and a list of stemmed tokens for only the tokens that changed after stemming
lemmas_diff = [lemma for token, lemma in zip(tokens, lemmas) if token.lower() != lemma]
og = [token for token, lemma in zip(tokens, lemmas) if token.lower() != lemma]

# Show table
show_lemmas(og, lemmas_diff)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Text,is,made,ingredients,is,baked,In,their,oldest,forms,cakes,were,modifications,cakes,preparations,features,desserts,pastries,meringues,custards,pies
Lemmas,be,make,ingredient,be,bake,In,-PRON-,old,form,cake,be,modification,cake,preparation,feature,dessert,pastry,meringue,custard,pie


# Stemming

In [None]:
from nltk.stem.snowball import PorterStemmer

# Define a module that will stem the text for us
stemmer = PorterStemmer()

# Use the stemmer on our text
stemmed = [stemmer.stem(token) for token in tokens]


# Here we are making a list of original tokens and a list of stemmed tokens for only the tokens that changed after stemming
og = [token for token, stem in zip(tokens, stemmed) if token.lower() != stem]
stemmed_diff = [stem for token, stem in zip(tokens, stemmed) if token.lower() != stem]

# Put stemmed data and text in a dataframe so we can output a table
data = {'Stems': stemmed_diff, 'Text':og}
df = pd.DataFrame(data, columns = ['Text', 'Stems'])

# Show table
df.T

# Part-of-speech tagging

In [None]:
import pandas as pd

# https://en.wikipedia.org/wiki/Cake
cake_wikipedia = 'Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked. In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies.'

# Uses nlp pipeline from spacy to obtain linguistic features
doc = nlp("".join(cake_wikipedia))

data = {'Text':[token.text for token in doc], 'Lemma':[token.lemma_ for token in doc], 'Part-of-speech':[token.pos_ for token in doc], 'Dependency':[token.dep_ for token in doc], 'Shape':[token.shape_ for token in doc], 'Is Alpha':[token.is_alpha for token in doc], 'Stopword':[token.is_stop for token in doc]}
df = pd.DataFrame (data, columns = ['Text', 'Part-of-speech'])


df.T # show data (T means transpose, excluding the T is fine too)

# Stopwords

In [None]:
df = pd.DataFrame (data, columns = ['Text', 'Stopword'])
df.T

In [None]:
stopwords = df.loc[df['Stopword'] == True]
stopwords.T

# References

1. https://www.nltk.org/api/nltk.tokenize.html
2. https://www.nltk.org/_modules/nltk/tokenize/sonority_sequencing.html#SyllableTokenizer
3. https://spacy.io/api/lemmatizer
4. https://spacy.io/usage/linguistic-features
5. https://universaldependencies.org/docs/u/pos/