# Noun Phrase Chunking

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk

import spacy
from spacy import displacy
from spacy.symbols import nsubj, VERB

%matplotlib inline

In [3]:
nlp = spacy.load("en_core_web_sm")

In [12]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Autonomous cars Autonomous cars nsubj shift
insurance liability insurance liability dobj shift
manufacturers manufacturers pobj toward


In [13]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Autonomous cars nsubj shift VERB []
shift ROOT shift VERB [Autonomous cars, insurance liability, toward]
insurance liability dobj shift VERB []
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


In [14]:
displacy.render(doc, style='dep')

In [15]:
# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)

{shift}


In [9]:
# Merge noun phrases and entities for easier analysis
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")

TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]
for doc in nlp.pipe(TEXTS):
    for token in doc:
        if token.ent_type_ == "MONEY":
            # We have an attribute and direct object, so check for subject
            if token.dep_ in ("attr", "dobj"):
                subj = [w for w in token.head.lefts if w.dep_ == "nsubj"]
                if subj:
                    print(subj[0], "-->", token)
            # We have a prepositional object with a preposition
            elif token.dep_ == "pobj" and token.head.dep_ == "prep":
                print(token.head.head, "-->", token)

Net income --> $9.4 million
the prior year --> $2.7 million
Revenue --> twelve billion dollars
a loss --> 1b


In [10]:
df = pd.read_csv("ClothingReviews.csv")
df.head()

Unnamed: 0,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Department Name,Class Name
0,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Intimate,Intimates
1,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,Dresses,Dresses
2,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,Dresses,Dresses
3,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,Bottoms,Pants
4,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,Tops,Blouses


In [11]:
df.dropna(subset=['Review Text'], inplace=True)

In [20]:
def np_tag(text):
    
    df = pd.DataFrame(columns = ['CHUNK'])
    
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        df = df.append({'CHUNK': chunk.text}, ignore_index=True)
        
    return df

In [37]:
# Convert our reviews to lowercase to simplify our search
df["Review Text"] = df["Review Text"].str.lower()

# Find only reviews that have the word 'dress' in them
filter = df['Review Text'].str.contains('dress')
df_dress = df[filter].copy()
df_dress.shape

(7022, 8)

In [38]:
# Create an empty dataframe to store the results
df_np = pd.DataFrame(columns = ['CHUNK'])

# Iterate through the reviews and extra non-phrases for the reivews with "small or little"
df_np = np_tag(df_dress['Review Text'].to_string())
df_np.shape

(20825, 1)

In [39]:
# Show the top 10 noun phrases - notice that there are a lot of filler words (stop words)
df_np.groupby('CHUNK')['CHUNK'].count().\
    reset_index(name='count').sort_values(['count'],ascending=False).head(10)

Unnamed: 0,CHUNK,count
2238,i,5054
4476,this dress,2878
2262,it,1480
4417,this,948
3609,the dress,207
4660,this top,158
2374,love,144
2426,me,140
4959,you,134
3377,that,125


In [40]:
# As opposed to removing stop words, we can filter out rows in the dataframe
# that have the stop words.  This is a better way for noun phrases since we won't lose
# the context of the phrases during our prior extraction. 
filter = (df_np['CHUNK'].str.contains('this')) | \
         (df_np['CHUNK'].str.contains('the')) | \
         (df_np['CHUNK'].str.contains('that')) | \
         (df_np['CHUNK'].str.contains('my')) | \
         (df_np['CHUNK'].str.contains('a')) | \
         (df_np['CHUNK'].str.len() < 6)
df_np = df_np[-filter]

In [41]:
# Filter for words with spaces, so that we get only phrases with more than one word.
filter = (df_np['CHUNK'].str.contains(' '))
df_np = df_np[filter]

In [42]:
df_np.groupby('CHUNK')['CHUNK'].count().\
    reset_index(name='count').sort_values(['count'],ascending=False).head(10)

Unnamed: 0,CHUNK,count
330,full price,12
287,both colors,12
343,high hopes,7
580,very pretty dress,6
123,135 lbs,5
323,first sight,5
533,those dresses,4
427,one size,4
436,our store,3
243,4 dress,3
