In [132]:
# !pip install nltk

import nltk
# nltk.download()

from nltk.corpus import stopwords

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Stopword removal 

Sometimes, some extremely common words which would appear to be of little value in helping select documents matching a user need are excluded from the vocabulary entirely. These words are called stop words . The general strategy for determining a stop list is to sort the terms by collection frequency (the total number of times each term appears in the document collection), and then to take the most frequent terms, often hand-filtered for their semantic content relative to the domain of the documents being indexed, as a stop list , the members of which are then discarded during indexing

In [201]:
sentence = '''Sometimes, some extremely common words which would appear to be of little value in helping select documents matching a user need are excluded from the vocabulary entirely. These words are called stop words . The general strategy for determining a stop list is to sort the terms by collection frequency (the total number of times each term appears in the document collection), and then to take the most frequent terms, often hand-filtered for their semantic content relative to the domain of the documents being indexed, as a stop list , the members of which are then discarded during indexing'''
word_list = sentence.split()

filtered_words = [word for word in word_list if word not in stopwords.words('english')]
for f in filtered_words[:25]:
    print(f,end=' ') #first 25

Sometimes, extremely common words would appear little value helping select documents matching user need excluded vocabulary entirely. These words called stop words . The general 

In [202]:
stopwords_found = [word for word in word_list if word in stopwords.words('english')]
for f in stopwords_found[:25]:
    print(f,end=' ') #first 25

some which to be of in a are from the are for a is to the by of each in the and then to the 

# Syntex - part of speach tagging  

* [good SO post](https://stackoverflow.com/a/30823202/5728614) 


In [130]:
from nltk import word_tokenize, pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def tag_words(text):

    if type(text)== str:
        text = word_tokenize(text)
        
    tags_pos_tag = pos_tag(text)
    
    treebankTagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
    tags_maxent_treebank = treebankTagger.tag(text)
 
    print( '{:<20s} {:<15s} {:<10s}'.format("word", "default tagger", "maxent treebank tagger"))
    for i in range(len(tags_pos_tag)):      
        print( '{:<20s} {:<15s} {:<10s}'.format(tags_pos_tag[i][0], tags_pos_tag[i][1],tags_maxent_treebank[i][1]) )

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kali\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Note that `text.split() == word_tokenize(text)`
> True 

In [79]:
text = "The quick brown fox jumps over the lazy dog"
tag_words(text)# returns void 

word                 default tagger  maxent treebank tagger
The                  DT              DT        
quick                JJ              NN        
brown                NN              NN        
fox                  NN              NN        
jumps                VBZ             NNS       
over                 IN              IN        
the                  DT              DT        
lazy                 JJ              NN        
dog                  NN              NN        


# Morphology - stemming
* [stem](https://pythonprogramming.net/stemming-nltk-tutorial/)
* [affix](https://stackoverflow.com/questions/55425525/can-the-porter-stemmer-return-the-affix-rather-than-the-stem)

In [204]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import sent_tokenize 

ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
print('example words: ', example_words)
for w in example_words:
    print(ps.stem(w))

example words:  ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
python
python
python
python
pythonli


In [116]:
text = "gracefully full of force for the sake of being forcefully full of grace"
words = word_tokenize(text)

print(text,'\nbecomes,')
for w in words:
    print(ps.stem(w), end = ' ')

print("\n------------------")
text = "recharging recharg charging builderhood builder preclassified preclass duckhunter"
words = word_tokenize(text)
print(text,'\nbecomes,')

for w in words:
    print(ps.stem(w), end = ' ')

gracefully full of force for the sake of being forcefully full of grace 
becomes,
grace full of forc for the sake of be forc full of grace 
------------------
recharging recharg charging builderhood builder preclassified preclass duckhunter 
becomes,
recharg recharg charg builderhood builder preclassifi preclass duckhunt 

## Tagging stemmed

In [117]:
tag_words(text.replace('.',''))

word                 default tagger  maxent treebank tagger
recharging           VBG             VBG       
recharg              NN              NN        
charging             VBG             VBG       
builderhood          NN              NN        
builder              NN              NN        
preclassified        VBD             VBD       
preclass             NN              NN        
duckhunter           NN              NN        


## Tagging non-stemmed

In [118]:
text_stemmed = [ps.stem(w) for w in words if w != '.'] # or .replace('.',''); IDK which is faster... 
tag_words(text_stemmed)

word                 default tagger  maxent treebank tagger
recharg              NN              NN        
recharg              NN              NN        
charg                NN              NN        
builderhood          NN              NN        
builder              NN              NN        
preclassifi          NN              NN        
preclass             NN              NN        
duckhunt             NN              NN        


### Gettign affixes (conversely, removing the stem)  
#### Marginal "success"

In [169]:
def recursion(word):
    stem = ps.stem(word)
    print('word: ', word)
    print('Stem: ', stem)
    if(word.replace(stem,'') != word):
        print('Affix:', word.replace(stem,''))
    else: 
        print("*fail or done*")
    print("\n")
    
    if stem != word:
        recursion(stem)

In [170]:
word = 'eating'
recursion(word)

word:  eating
Stem:  eat
Affix: ing


word:  eat
Stem:  eat
Affix: 




In [172]:
word = 'recharging'
recursion(word)

word:  recharging
Stem:  recharg
Affix: ing


word:  recharg
Stem:  recharg
Affix: 




In [173]:
word = 'preclassified'
recursion(word)

word:  preclassified
Stem:  preclassifi
Affix: ed


word:  preclassifi
Stem:  preclassifi
Affix: 




# Phonology - environment finder

In [200]:
targets = ['o','a', 'p', 't', 'ʌ']
word_list = "pot bot mop fought cot on top of octopus ʌ"

word_list = ' ' + word_list + ' '

print('Sentance ',word_list)

word_list = word_list.replace(" ", "#")
for target in targets:
    print('Current target is: ',target,end ='\n\n')
    
    word_list_current = word_list.replace(target, "_")
    
    for i in range(len(word_list)):
        if(word_list_current[i] == '_'):
            print(word_list_current[i-1:i+2])

Sentance   pot bot mop fought cot on top of octopus ʌ 
Current target is:  o

p_t
b_t
m_p
f_u
c_t
#_n
t_p
#_f
#_c
t_p
Current target is:  a

Current target is:  p

#_o
o_#
o_#
o_u
Current target is:  t

o_#
o_#
h_#
o_#
#_o
c_o
Current target is:  ʌ

#_#


In [None]:
# https://stackoverflow.com/a/49269085/5728614
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')