#### Install spaCy

In [None]:
!pip install -U spacy --quiet

In [None]:
# Download the large English model for spaCy

!sudo python3 -m spacy download en_core_web_trf
# !python -m  spacy link --force en_core_web_lg en
!python -m spacy info en_core_web_trf

In [None]:
# Print spaCy's metadata for the en_core_web_trf package (same command as at the end of the download cell)
!python -m spacy info en_core_web_trf

In [None]:
# Download the large English model for spaCy
# https://spacy.io/models/en#en_core_web_lg
!sudo python3 -m spacy download en_core_web_lg
# !python -m  spacy link --force en_core_web_lg en
# !python -m spacy info en
!python -m spacy info en_core_web_lg

#### Removing Personal information

In [None]:
import spacy

In [None]:
# Load the large English NLP model once; `nlp` is the shared pipeline that the
# later cells and the remove_name helpers call to parse text.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Example text: a short sentence containing a two-word personal name
text1 = 'My name is Allu Arjun'

In [None]:
# Parse the example text using spaCy; the following cells read the entity
# annotations from the resulting `doc`.
doc = nlp(text1)

In [None]:
#Import displaCy, spaCy's visualizer, used below to highlight named entities
from spacy import displacy

In [None]:
# Draw the entity spans recognized in `doc` inline in the notebook
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Check named entity recognition: print every token next to its entity label
for token in doc:
    print(token, token.ent_type_)

Function to remove names: every token tagged as a `PERSON` entity is replaced with a placeholder.

In [None]:
def remove_name(text, replacement_token='[NAME]'):
    """Replace every token tagged as a PERSON entity with a placeholder.

    This is the naive, token-by-token version: each token of a multi-word
    name is replaced on its own, so a two-word name yields two placeholders.

    Args:
        text: Input string to redact.
        replacement_token: Placeholder written in place of each PERSON token.

    Returns:
        The redacted string, with the spacing of the input preserved.
    """

    #Parse the text
    doc = nlp(text)

    #Updated document
    updated_doc = []

    #Check Entity type
    for token in doc:
        if token.ent_type_ == 'PERSON':
            # Keep the whitespace that followed the replaced token so the
            # surrounding text is not glued together.
            updated_doc.append(replacement_token + token.whitespace_)
        else:
            # text_with_ws carries the token plus its original trailing
            # whitespace, so punctuation stays attached as in the input.
            updated_doc.append(token.text_with_ws)

    return ''.join(updated_doc)

In [None]:
# Redact the example sentence with the naive helper; the result is a plain string
new_doc = remove_name(text1)

In [None]:
# Re-parse the redacted string so its tokens can be inspected again.
# NOTE(review): this rebinds `new_doc` from the redacted string to the parsed
# document, so re-running this cell on its own passes the already-parsed
# document back into `nlp`.
new_doc = nlp(new_doc)

# Check named entity recognition on the redacted text: token next to its entity label
for token in new_doc:
    print(token, token.ent_type_)

In [None]:
# Try the naive helper on a different name; as the cell's last expression,
# the returned string is shown as the cell output.
remove_name('My name is Mukesh Kumar.')

In [None]:
# Highlight the entities recognized in the same sentence, for comparison with the redacted string
displacy.render(nlp('My name is Mukesh Kumar.'), style="ent", jupyter=True)

##### How do we correct that? A multi-word name should collapse into a single `[NAME]`, not one placeholder per word.

In [None]:
# Let's check how many entities we have
doc = nlp('My name is Mukesh Kumar.')
# enumerate(..., start=1) numbers the entities without a hand-maintained counter
for i, ent in enumerate(doc.ents, start=1):
    print(str(i) + '.', ent)

In [None]:
# How can we tell whether a token (i.e. a word) is the beginning of a multi-word entity?
for token in doc:
    # Print each token next to its IOB tag
    print(token, token.ent_iob_)

### `ent_iob_` indicates the token's position in the named entity:

* 'B': Beginning of the entity

* 'I': Inside the entity

* 'O': Outside the entity

So in the function below, only the first token of a person's name (IOB tag `B`) is replaced by `[NAME]`; the remaining tokens of that name (tag `I`) are dropped.

In [None]:
def remove_name2(text, replacement_token='[NAME]'):
    """Replace each PERSON entity with a single placeholder.

    The first token of a PERSON entity (IOB tag 'B') is swapped for
    ``replacement_token``; the remaining PERSON tokens of that entity are
    dropped, so a multi-word name produces exactly one placeholder.

    Args:
        text: Input string to redact.
        replacement_token: Placeholder written once per PERSON entity.

    Returns:
        The redacted string, with the spacing of the input preserved around
        the placeholders.
    """

    #Parse the text
    doc = nlp(text)

    #Updated document
    updated_doc = []

    # Whitespace that followed the most recent PERSON token. It is held back
    # until we know the name has ended, so the gaps *inside* a name are
    # dropped while the gap *after* the name is kept.
    pending_ws = ''

    #Check Entity type
    for token in doc:
        if token.ent_type_ == 'PERSON':
            if token.ent_iob_ == 'B':
                # A new name starts: flush the gap left by a directly
                # preceding name, then replace the starting entity word.
                updated_doc.append(pending_ws)
                updated_doc.append(replacement_token)
            # For both 'B' and later tokens, remember only the latest gap;
            # the token text itself is ignored.
            pending_ws = token.whitespace_
        else:
            updated_doc.append(pending_ws)
            pending_ws = ''
            # text_with_ws keeps the token's original trailing whitespace
            updated_doc.append(token.text_with_ws)

    # The text may end on a name; keep whatever whitespace followed it
    updated_doc.append(pending_ws)

    return ''.join(updated_doc)

In [None]:
# Same sentence as before, now through the IOB-aware helper
remove_name2('My name is Mukesh Kumar.')

In [None]:
# Longer multi-line passage mentioning several people (one of them by a
# single name), plus a place name and a year, to try the helper on more
# realistic text.
text = """
Allu Arjun still hasn’t found the right words that would explain how
he felt having dismissed Sachin Tendulkar for the batting legend’s first ever duck in first-class cricket.
The moment came in 2009, when representing Uttar Pradesh as a 19-year-old upcoming fast bowler,
Bhuvneshwar bowled a cutter that took an inside edge from the bat of Tendulkar
only to nestle into the safe hands of the fielder.
"""

# Print the redacted passage
print(remove_name2(text))

In [None]:
# Show which spans of the passage are recognized as entities, to compare against the redacted printout
displacy.render(nlp(text), style="ent", jupyter=True)