# Noun Phrase Chunking

In [1]:
import re
import pandas as pd

import spacy
from spacy import displacy

import nltk
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read, even if the read raises.
# errors='ignore' silently drops bytes that cannot be decoded.
with open('chatbot.txt', 'r', errors='ignore') as f:
    raw = f.read()
raw

'A chatbot (also known as a talkbot, chatterbot, Bot, IM bot, interactive agent, or Artificial Conversational Entity) is a computer program or an artificial intelligence which conducts a conversation via auditory or textual methods. Such programs are often designed to convincingly simulate how a human would behave as a conversational partner, thereby passing the Turing test. Chatbots are typically used in dialog systems for various practical purposes including customer service or information acquisition. Some chatterbots use sophisticated natural language processing systems, but many simpler systems scan for keywords within the input, then pull a reply with the most matching keywords, or the most similar wording pattern, from a database.\nThe term "ChatterBot" was originally coined by Michael Mauldin (creator of the first Verbot, Julia) in 1994 to describe these conversational programs. Today, most chatbots are either accessed via virtual assistants such as Google Assistant and Amazon 

## Chunking with Spacy

In [3]:
# Load the "en_core_web_sm" spaCy pipeline once; the spaCy cells below all call `nlp`
nlp = spacy.load("en_core_web_sm")

In [4]:
# For every noun chunk in one sentence, show the chunk text, its root token,
# the root's dependency label, and the text of the root's syntactic head.
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [5]:
# Same sentence, but list only the text of each noun chunk, one per line
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
chunk_texts = [chunk.text for chunk in doc.noun_chunks]
for text in chunk_texts:
    print(text)

Autonomous cars
insurance liability
manufacturers


In [6]:
# Run the pipeline over the full chatbot text and print each noun chunk on its own line
doc = nlp(raw)
noun_phrases = (span.text for span in doc.noun_chunks)
for phrase in noun_phrases:
    print(phrase)

A chatbot
a talkbot
chatterbot
Bot
IM bot
interactive agent
Artificial Conversational Entity
a computer program
an artificial intelligence
which
a conversation
auditory or textual methods
Such programs
a human
a conversational partner
the Turing test
Chatbots
dialog systems
various practical purposes
customer service
information acquisition
Some chatterbots
sophisticated natural language processing systems
many simpler systems
keywords
the input
a reply
the most matching keywords
the most similar wording pattern
a database
The term
"ChatterBot
Michael Mauldin
creator
the first Verbot, Julia
these conversational programs
most chatbots
virtual assistants
Google Assistant
Amazon Alexa
apps
Facebook Messenger
WeChat
individual organizations' apps
websites
Chatbots
usage categories
conversational commerce
e
commerce
analytics
communication
customer support
design
developer tools
education
entertainment
finance
food
games
health
HR
marketing
news
productivity
shopping
sports
travel
utilities

## Chunking with NLTK

The method for chunking text with NLTK is slightly different from spaCy's.  It's more complicated, but ultimately more flexible.  It combines POS tagging with `RegEx` to parse groups of words for patterns.  Because the patterns are written as `RegEx`, it's possible to match any sequence of POS tags.

In [7]:
def reg_chunker(sent, expression):
    """Print every ``NP`` chunk that a regex chunk grammar finds in a text.

    Args:
        sent: Raw text. It is tokenized with ``word_tokenize`` and tagged with
            ``pos_tag`` before being parsed.
        expression: Chunk grammar handed to ``nltk.RegexpParser``,
            e.g. ``r'NP: {<JJ>+<NN.?>}'``.

    Returns:
        None. Each subtree labelled ``'NP'`` is printed.
    """
    tagged_tokens = pos_tag(word_tokenize(sent))
    parse_tree = nltk.RegexpParser(expression).parse(tagged_tokens)

    def is_noun_phrase(subtree):
        return subtree.label() == 'NP'

    for noun_phrase in parse_tree.subtrees(filter=is_noun_phrase):
        print(noun_phrase)

In [8]:
sent = "Autonomous cars shift insurance liability toward manufacturers"
# Chunk 0: one or more adjectives (JJ) followed by a noun tag (NN plus at most one more character)
reg_chunker(sent=sent, expression=r'NP: {<JJ>+<NN.?>}')

(NP Autonomous/JJ cars/NNS)


In [9]:
# Chunk 1: two or more consecutive noun tags (each NN plus at most one more character)
reg_chunker(sent=raw, expression=r'NP: {<NN.?>+<NN.?>}')

(NP IM/NNP bot/NN)
(NP Artificial/NNP Conversational/NNP Entity/NNP)
(NP computer/NN program/NN)
(NP Turing/NNP test/NN)
(NP dialog/NN systems/NNS)
(NP customer/NN service/NN)
(NP information/NN acquisition/NN)
(NP language/NN processing/NN systems/NNS)
(NP simpler/NN systems/NNS)
(NP wording/NN pattern/NN)
(NP Michael/NNP Mauldin/NNP)
(NP Google/NNP Assistant/NNP)
(NP Amazon/NNP Alexa/NNP)
(NP Facebook/NNP Messenger/NNP)
(NP customer/NN support/NN)
(NP developer/NN tools/NNS)


In [10]:
# Chunk 2: one or more adjectives (JJ) followed by a singular noun (NN)
reg_chunker(sent=raw, expression=r'NP: {<JJ>+<NN>}')

(NP interactive/JJ agent/NN)
(NP artificial/JJ intelligence/NN)
(NP conversational/JJ partner/NN)
(NP sophisticated/JJ natural/JJ language/NN)
(NP many/JJ simpler/NN)
(NP similar/JJ wording/NN)
(NP conversational/JJ commerce/NN)


In [None]:
# Release the handle `f` that was opened on 'chatbot.txt' when the text was read
f.close()

## Chunking Clothing Reviews with Spacy

In [None]:
# Read the reviews CSV into `df` and preview the first rows
df = pd.read_csv("ClothingReviews.csv")
df.head()

In [None]:
# Drop rows with no review text so the later .str operations never see NaN.
# Reassigning instead of inplace=True avoids mutating the frame in place.
df = df.dropna(subset=['Review Text'])

In [None]:
def np_tag(text):
    """Extract the noun chunks that spaCy finds in ``text``.

    Args:
        text: String passed to the module-level ``nlp`` pipeline.

    Returns:
        A DataFrame with a single ``'CHUNK'`` column holding the text of each
        noun chunk in document order, indexed 0..n-1. When no chunk is found
        the frame is empty but still has the ``'CHUNK'`` column.
    """
    doc = nlp(text)

    # Collect the texts first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row re-copied the whole
    # frame on every iteration.
    chunk_texts = [chunk.text for chunk in doc.noun_chunks]
    return pd.DataFrame(chunk_texts, columns=['CHUNK'], dtype=object)

In [None]:
# Lowercase the reviews so the keyword search below is case-insensitive
df["Review Text"] = df["Review Text"].str.lower()

# Keep only the reviews that mention 'dress'. The mask gets a descriptive name
# rather than `filter`, which would shadow the Python builtin, and regex=False
# makes it explicit that 'dress' is matched as a literal substring.
mentions_dress = df['Review Text'].str.contains('dress', regex=False)
df_dress = df[mentions_dress].copy()
df_dress.shape

In [None]:
# Extract the noun phrases from every review in df_dress.
# Each review is parsed on its own: joining them with Series.to_string() would
# feed index labels and padding into the parser, let a chunk span two reviews,
# and can exceed spaCy's nlp.max_length limit for a single document.
chunk_frames = [np_tag(review) for review in df_dress['Review Text']]
chunk_frames = [frame for frame in chunk_frames if not frame.empty]

# pd.concat raises on an empty list, so fall back to an empty frame with the same column
if chunk_frames:
    df_np = pd.concat(chunk_frames, ignore_index=True)
else:
    df_np = pd.DataFrame(columns=['CHUNK'])
df_np.shape

In [None]:
# The ten most frequent noun phrases; note how many are dominated by filler (stop) words
(
    df_np.groupby('CHUNK')['CHUNK']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
    .head(10)
)

In [None]:
# Rather than stripping stop words from the text before chunking, drop the
# chunks that contain one. This suits noun phrases better because the
# extraction above still saw each phrase in its full context.
#
# The filler words are matched as whole words (\b...\b): a plain substring
# test for 'a' or 'the' would also discard phrases such as "great dress" or
# "other colors".
has_filler_word = df_np['CHUNK'].str.contains(r'\b(?:this|the|that|my|a)\b', regex=True)

# Very short chunks (fewer than 6 characters) are dropped as well
is_too_short = df_np['CHUNK'].str.len() < 6

# `~` is the element-wise NOT for a boolean mask
df_np = df_np[~(has_filler_word | is_too_short)]

In [None]:
# Keep only multi-word phrases, i.e. chunks that contain at least one space.
# The mask is not named `filter`, so the Python builtin stays usable, and
# regex=False makes the literal-space match explicit.
is_multiword = df_np['CHUNK'].str.contains(' ', regex=False)
df_np = df_np[is_multiword]

In [None]:
# The ten most frequent noun phrases left in df_np at this point
(
    df_np.groupby('CHUNK')['CHUNK']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
    .head(10)
)