# Natural Language Processing for Beginners

*What is NLP?*
- This is a type of data analysis that helps computers interpret natural language as a human would.
- It processes text from documents, which is usually unstructured.
- It is a subset of artificial intelligence that helps computers understand, interpret and manipulate human language.

NLTK: Natural Language Toolkit
This is one of the most powerful platforms/libraries for analyzing natural language using the Python programming language.

Learn More:
https://www.nltk.org/

In [1]:
import nltk
import pandas as pd

# Counting words

In [None]:
#  We will be downloading text from the free ebooks library gutenberg
nltk.download('gutenberg')

In [None]:
nltk.corpus.gutenberg.fileids()

In [None]:
# Grab words in the shakespeare-hamlet
sh_ham = nltk.corpus.gutenberg.words("shakespeare-hamlet.txt")

# Preview the first 20 words
sh_ham[:20]


In [None]:
# Using .count we can get number of times a word appears
sh_ham.count("Hamlet")

In [2]:
# Let's take a look using the inaugural speeches
nltk.download('inaugural')

[nltk_data] Downloading package inaugural to
[nltk_data]     /Users/princessiria/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


True

In [6]:
nltk.corpus.inaugural.fileids()

In [8]:
# Print, for every file in the inaugural corpus, how many tokens words() returns.
inaugural = nltk.corpus.inaugural
for speech in inaugural.fileids():
    total_words = len(inaugural.words(speech))
    summary = speech + " has a total of " + str(total_words) + " words."
    print(summary)
    

1789-Washington.txt has a total of 1538 words.
1793-Washington.txt has a total of 147 words.
1797-Adams.txt has a total of 2585 words.
1801-Jefferson.txt has a total of 1935 words.
1805-Jefferson.txt has a total of 2384 words.
1809-Madison.txt has a total of 1265 words.
1813-Madison.txt has a total of 1304 words.
1817-Monroe.txt has a total of 3693 words.
1821-Monroe.txt has a total of 4909 words.
1825-Adams.txt has a total of 3150 words.
1829-Jackson.txt has a total of 1208 words.
1833-Jackson.txt has a total of 1267 words.
1837-VanBuren.txt has a total of 4171 words.
1841-Harrison.txt has a total of 9165 words.
1845-Polk.txt has a total of 5196 words.
1849-Taylor.txt has a total of 1182 words.
1853-Pierce.txt has a total of 3657 words.
1857-Buchanan.txt has a total of 3098 words.
1861-Lincoln.txt has a total of 4005 words.
1865-Lincoln.txt has a total of 785 words.
1869-Grant.txt has a total of 1239 words.
1873-Grant.txt has a total of 1478 words.
1877-Hayes.txt has a total of 2724 w

In [21]:
 nltk.download('punkt') # helps divide a text into a list of sentences

# Build a DataFrame with one row per inaugural address holding the president's
# name, the year, and the average number of words per sentence in that speech.
# File ids look like "1789-Washington.txt": the first four characters are the
# year and the name sits between the "-" and the ".txt" extension.
# NOTE: the extension is sliced off rather than removed with rstrip('.txt'),
# because rstrip treats its argument as a set of characters and would also eat
# trailing 't', 'x' or '.' from the name itself (e.g. "Grant" -> "Gran").
extract = pd.DataFrame(
    [
        speech[5:-4] if speech.endswith('.txt') else speech[5:],
        int(speech[:4]),
        # Whole-number average: total tokens divided by number of sentences.
        len(nltk.corpus.inaugural.words(speech)) // len(nltk.corpus.inaugural.sents(speech)),
    ]
    for speech in nltk.corpus.inaugural.fileids()
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/princessiria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
extract.head()

Unnamed: 0,0,1,2
0,Washington,1789,64
1,Washington,1793,36
2,Adams,1797,69
3,Jefferson,1801,46
4,Jefferson,1805,52


In [23]:
extract.columns = ["President", "Year", "AVG_Words_Per_Speech"]

In [24]:
extract.head()

Unnamed: 0,President,Year,AVG_Words_Per_Speech
0,Washington,1789,64
1,Washington,1793,36
2,Adams,1797,69
3,Jefferson,1801,46
4,Jefferson,1805,52


In [27]:
# Visualize the data.
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

In [30]:
# Build App
app = JupyterDash(__name__)
# Layout: a page heading, the graph that the 'graph' callback output fills in,
# and a dropdown offering every named Plotly colorscale. The dropdown cannot be
# cleared and starts on 'plasma'.
app.layout = html.Div([
    html.H1("Inaugural Speech View"),  # heading previously misspelled "Inuagural"
    dcc.Graph(id='graph'),
    html.Label([
        "colorscale",
        dcc.Dropdown(
            id='colorscale-dropdown', clearable=False,
            value='plasma', options=[
                {'label': c, 'value': c}
                for c in px.colors.named_colorscales()
            ])
    ]),
])
# Define callback to update graph
@app.callback(
    Output('graph', 'figure'),
    [Input("colorscale-dropdown", "value")]
)
def update_figure(colorscale):
    """Return the scatter figure for the 'graph' component.

    Plots the ``extract`` DataFrame with "Year" on the x-axis and
    "AVG_Words_Per_Speech" on the y-axis, colouring points by year.

    Args:
        colorscale: Value currently selected in the 'colorscale-dropdown'
            component; passed to Plotly as the continuous colour scale.

    Returns:
        The figure produced by ``px.scatter``.
    """
    figure = px.scatter(
        extract,
        x="Year",
        y="AVG_Words_Per_Speech",
        color="Year",
        color_continuous_scale=colorscale,
        render_mode="webgl",
        title="AVG_Words_Per_Speech",
    )
    return figure

In [31]:
# Run app and display result inline in the notebook
app.run_server(mode='inline')

# Frequency Distribution

In [32]:
# Grab the words of the shakespeare-hamlet text from the Gutenberg corpus
sh_ham = nltk.corpus.gutenberg.words("shakespeare-hamlet.txt")

# Build a frequency distribution from the word list: each distinct token is
# mapped to the number of times it occurs.
sh_ham_freqDist = nltk.FreqDist(sh_ham)

In [33]:
sh_ham_freqDist

FreqDist({',': 2892, '.': 1886, 'the': 860, "'": 729, 'and': 606, 'of': 576, 'to': 576, ':': 565, 'I': 553, 'you': 479, ...})

In [34]:
sh_ham_freqDist["Hamlet"] # Indexing the FreqDist by a token gives that token's count.

99

In [35]:
sh_ham_20 = sh_ham_freqDist.most_common(20) # Get the 20 most common tokens (punctuation is counted too).

In [37]:
# sh_ham_20

# N-Grams

In [38]:
sample_text = "I love Special Agent Gibbs, Abby, Dinozo, McGee and NCIS"

In [39]:
# Tokenize the words from our text
tokens = nltk.word_tokenize(sample_text)

In [43]:
bigrams_words = nltk.bigrams(tokens)
for bigramWord in bigrams_words:
    print(bigramWord) 

In [46]:
from nltk.util import ngrams

In [47]:
# With n=2, ngrams produces tuples of two adjacent tokens (see the printed output).
bi_ngrams = ngrams(tokens, 2)

# Print each pair on its own line.
for bi_ngram in bi_ngrams:
    print(bi_ngram)

('I', 'love')
('love', 'Special')
('Special', 'Agent')
('Agent', 'Gibbs')
('Gibbs', ',')
(',', 'Abby')
('Abby', ',')
(',', 'Dinozo')
('Dinozo', ',')
(',', 'McGee')
('McGee', 'and')
('and', 'NCIS')


In [48]:
def n_grams(text, num):
    """Tokenize ``text`` and return its n-grams of size ``num``.

    Args:
        text: The string to split into tokens with ``nltk.word_tokenize``.
        num: How many consecutive tokens make up each n-gram.

    Returns:
        The result of ``ngrams`` over the tokens, returned unchanged; iterate
        over it to get the individual tuples.
    """
    return ngrams(nltk.word_tokenize(text), num)

In [49]:
two_grams = n_grams(sample_text, 2)

In [50]:
for x in two_grams:
    print(x)

('I', 'love')
('love', 'Special')
('Special', 'Agent')
('Agent', 'Gibbs')
('Gibbs', ',')
(',', 'Abby')
('Abby', ',')
(',', 'Dinozo')
('Dinozo', ',')
(',', 'McGee')
('McGee', 'and')
('and', 'NCIS')


Thank you and look forward to my study notes for next week.