### M5 Spacy Exercise

Import the Hep Dataset and, using spaCy, perform the steps listed below on the text column. Add new columns to hold the results of each operation.


        Tokenise
        Identify all phrases in the column that match the pattern adjective + noun (only one instance in the text column)
        Remove all stopwords
        Remove all punctuation
        Remove all numbers
        Identify sentences
        Lemmatize the text
        Apply POS tagging
        Apply shallow parsing
        Apply Named Entity Recognition
        Apply Dependency Parsing
        
        
        
       

In [None]:
import spacy
import stanza
import pandas as pd
from spacy.matcher import Matcher
from spacy import displacy


In [None]:
# Load the language model instance in spaCy from a local directory
# (en_core_web_sm, version 2.3.1 according to the directory name).
# The *_spacy helper functions in this notebook call this module-level `nlp`.
nlp = spacy.load('../pre_course/spacy/small_practice_model/en_core_web_sm-2.3.1')

In [None]:
# Load the pickled Hep dataset (used below as a DataFrame with a 'Text' column).
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
hep = pd.read_pickle("../data/Hep_Dataset.pkl")

In [None]:
# Function to change text column to string
def change_to_string(ptext):
    """Join the pieces of one 'Text' cell into a single space-separated string.

    Args:
        ptext: An iterable of strings, or a string that is already joined.

    Returns:
        The pieces joined by single spaces. A value that is already a ``str``
        is returned unchanged, so applying this function a second time (for
        example by re-running the notebook cell) does not split the text into
        space-separated characters.
    """
    if isinstance(ptext, str):
        return ptext
    return " ".join(ptext)

In [None]:
# Collapse every 'Text' cell into one string, overwriting the column in place.
hep['Text'] = hep['Text'].apply(change_to_string)

In [None]:
# Keep a copy of the DataFrame as it stands at this point, before the
# spaCy result columns are added to `hep`.
hep2 = hep.copy()

### Tokenise

In [None]:
def token_spacy(pdoc):
    """Tokenise *pdoc* with the module-level ``nlp`` pipeline.

    Args:
        pdoc: Text to tokenise.

    Returns:
        A list with the text of every token, in document order.
    """
    parsed = nlp(pdoc)
    words = [word.text for word in parsed]
    return words

In [None]:
# Store the token list of each 'Text' value in a new 'token_spacy' column.
hep['token_spacy'] = hep['Text'].apply(token_spacy)

In [None]:
# Preview the first rows, including the newly added 'token_spacy' column.
hep.head()

In [None]:
# Inspect the token list stored for the row labelled 2.
hep.loc[2, 'token_spacy']

### Identify all phrases in the column that match the pattern adjective + noun

In [None]:

# Reuse the `nlp` pipeline loaded at the top of the notebook instead of reading
# the model from disk a second time. The Matcher has to be built on the vocab
# of the pipeline that produces the Doc it is applied to.
matcher = Matcher(nlp.vocab)

# Only one row of the text column is examined here (row label 5).
doc = nlp(hep.loc[5, 'Text'])

# Pattern: one adjective token immediately followed by one noun token.
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}]

# Add the pattern to the matcher and apply the matcher to the doc.
# spaCy v2 call form: add(key, on_match_callback, *patterns); no callback here.
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Each match is (match_id, start, end); start/end are token offsets into doc.
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

### Remove all Stopwords

In [None]:
# Module-level handle on spaCy's English stop-word collection; stopword_spacy
# itself relies on each token's `is_stop` flag rather than on this name.
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

def stopword_spacy(pdoc):
    """Return *pdoc* with stop-word tokens removed.

    Args:
        pdoc: Text to filter.

    Returns:
        The text of every token whose ``is_stop`` flag is false, joined by
        single spaces.
    """
    parsed = nlp(pdoc)
    kept = (word.text for word in parsed if not word.is_stop)
    return " ".join(kept)

In [None]:
# First preprocessing step: create 'preprocess_spacy' from the raw 'Text'
# column with stop words removed.
hep['preprocess_spacy'] = hep['Text'].apply(stopword_spacy)

In [None]:
# Inspect the stop-word-filtered text for the row labelled 5.
hep.loc[5, 'preprocess_spacy']

### Remove all punctuation
        

In [None]:
def punctuation_spacy(pdoc):
    """Return *pdoc* with punctuation tokens removed.

    The text is parsed with the module-level ``nlp`` pipeline and every token
    whose ``is_punct`` flag is false is kept.

    Args:
        pdoc: Text to clean.

    Returns:
        The kept token texts joined by single spaces. The result has no
        leading space (the same convention as ``stopword_spacy``), so feeding
        it back into ``nlp`` does not produce a spurious leading whitespace
        token. Empty input yields an empty string.
    """
    parsed = nlp(pdoc)
    # str.join builds the result in one pass instead of growing a string
    # token by token, and puts separators only *between* tokens.
    return " ".join(token.text for token in parsed if not token.is_punct)

In [None]:
# Second preprocessing step: strip punctuation from the already
# stop-word-filtered text, overwriting 'preprocess_spacy' in place.
hep['preprocess_spacy'] = hep['preprocess_spacy'].apply(punctuation_spacy)

In [None]:
# Inspect the current 'preprocess_spacy' text for the row labelled 5.
hep.loc[5, 'preprocess_spacy' ]

### Remove all numbers

In [None]:
def nonumbers_spacy(pdoc):
    """Return *pdoc* keeping only purely alphabetic tokens.

    The text is parsed with the module-level ``nlp`` pipeline and a token is
    kept only when its ``is_alpha`` flag is true. This removes numbers, and
    also any other token that is not made up solely of alphabetic characters
    (for example tokens mixing letters and digits, or whitespace tokens).

    Args:
        pdoc: Text to clean.

    Returns:
        The kept token texts joined by single spaces, with no leading space
        (the same convention as ``stopword_spacy``). Empty input yields an
        empty string.
    """
    parsed = nlp(pdoc)
    # Join once instead of concatenating per token; separators go only
    # between tokens, so the result does not start with a stray space.
    return " ".join(token.text for token in parsed if token.is_alpha)

In [None]:
# Third preprocessing step: keep only alphabetic tokens (drops numbers),
# again overwriting 'preprocess_spacy' in place.
hep['preprocess_spacy'] = hep['preprocess_spacy'].apply(nonumbers_spacy)

In [None]:
# Inspect the current 'preprocess_spacy' text for the row labelled 5.
hep.loc[5, 'preprocess_spacy' ]

### Identify sentences

In [None]:
# Small demonstration of sentence segmentation on a fixed two-sentence string.
text = nlp("There is a green hill far away. It is in a land I heard in a lullaby")
# Collect the Doc's sentences into a list so they can be iterated below.
sentences = list(text.sents)

# Print each sentence together with its Python type.
for sentence in sentences:
    print(sentence, type(sentence))

In [None]:
def sentence_spacy(pdoc):
    """Split *pdoc* into sentences with the module-level ``nlp`` pipeline.

    Args:
        pdoc: Text to segment.

    Returns:
        A list holding the items yielded by the parsed document's ``sents``,
        in document order.
    """
    parsed = nlp(pdoc)
    return list(parsed.sents)

In [None]:
# Store the list of sentences found in each raw 'Text' value in a new column.
hep['sentence_spacy'] = hep['Text'].apply(sentence_spacy)

In [None]:
# Inspect the sentence list for the row labelled 5.
hep.loc[5, 'sentence_spacy']

### Lemmatize the text
        

In [None]:
def lemmatization_spacy(pdoc):
    """Return *pdoc* with every token replaced by its lemma.

    The text is parsed with the module-level ``nlp`` pipeline and the
    ``lemma_`` of each token is collected.

    Args:
        pdoc: Text to lemmatise.

    Returns:
        The lemmas joined by single spaces, with no leading space. Empty
        input yields an empty string.
    """
    parsed = nlp(pdoc)
    # Whitespace-only tokens (e.g. from a leading space in the input) are
    # skipped: they would only add stray whitespace to the result.
    lemmas = (token.lemma_ for token in parsed if not token.is_space)
    # Join once instead of concatenating per token; separators go only
    # between lemmas.
    return " ".join(lemmas)

In [None]:
# Final preprocessing step: lemmatise the text left after the stop-word,
# punctuation and number removal, overwriting 'preprocess_spacy' in place.
hep['preprocess_spacy'] = hep['preprocess_spacy'].apply(lemmatization_spacy)

In [None]:
# Inspect the lemmatised 'preprocess_spacy' text for the row labelled 5.
hep.loc[5, 'preprocess_spacy']

### Apply POS tagging   

In [None]:
def pos_spacy(pdoc):
    """Tag every token of *pdoc* with its part of speech.

    Args:
        pdoc: Text to tag.

    Returns:
        A list with one ``[token text, "-->", pos_ tag]`` entry per token, in
        document order.
    """
    parsed = nlp(pdoc)
    return [[word.text, "-->", word.pos_] for word in parsed]

In [None]:
# Store the per-token POS tags of each raw 'Text' value in a new column.
hep['pos_spacy'] = hep['Text'].apply(pos_spacy)

In [None]:
# Inspect the POS tags for the row labelled 5.
hep.loc[5, 'pos_spacy']

### Apply shallow parsing
       

In [None]:
def nounchunk_spacy(pdoc):
    """Shallow-parse *pdoc* by collecting its noun chunks.

    Args:
        pdoc: Text to parse.

    Returns:
        A list holding the items yielded by the parsed document's
        ``noun_chunks``, in document order.
    """
    parsed = nlp(pdoc)
    return list(parsed.noun_chunks)

In [None]:
# Store the noun chunks of each raw 'Text' value in a new column.
hep['noun_chunks_spacy'] = hep['Text'].apply(nounchunk_spacy)

In [None]:
# Inspect the noun chunks for the row labelled 5.
hep.loc[5,'noun_chunks_spacy']

### Apply Named Entity Recognition
       

In [None]:
def ne_spacy(pdoc):
    """Run named entity recognition on *pdoc*.

    Args:
        pdoc: Text to analyse.

    Returns:
        A list with one ``[entity text, "--->", entity label]`` entry per
        entity in the parsed document's ``ents``, in document order.
    """
    parsed = nlp(pdoc)
    return [[found.text, "--->", found.label_] for found in parsed.ents]

In [None]:
# Store the named entities of each raw 'Text' value in a new column.
hep['ne_spacy'] = hep['Text'].apply(ne_spacy)

In [None]:
# Inspect the named entities for the row labelled 5.
hep.loc[5, 'ne_spacy']

### Apply Dependency Parsing

In [None]:
def depend_parse_spacy(pdoc):
    """Label every token of *pdoc* with its dependency relation.

    Args:
        pdoc: Text to parse.

    Returns:
        A list with one ``[token text, "--->", dep_ label]`` entry per token,
        in document order.
    """
    parsed = nlp(pdoc)
    return [[word.text, "--->", word.dep_] for word in parsed]

In [None]:
# Store the per-token dependency labels of each raw 'Text' value in a new column.
hep['de_parse_spacy'] = hep['Text'].apply(depend_parse_spacy)

In [None]:
# Inspect the dependency labels for the row labelled 5.
hep.loc[5, 'de_parse_spacy']