# How to extract structured information from a text

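In this tutorial, I show how to extract structured information from free text with spaCy, a Python library for natural language processing. Each line of a birth register is turned into separate fields: the father's name and surname, the child's name, the birth date and the gender.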

In [189]:
import pandas as pd

# Read the register, one record per line. The trailing newline is stripped so
# it does not leak into the extracted fields or the exported CSV, and blank
# lines are skipped because they contain no record to parse.
with open('register.txt', 'r') as f:
    text = [line.rstrip('\n') for line in f if line.strip()]

In [190]:
# Single-column frame: every row holds one raw register line under 'text'.
df = pd.DataFrame({'text': text})
df.head()

Unnamed: 0,text
0,On August 21 1826 a son was born to John Bon a...
1,On June 11 1813 a daughter was born to James D...
2,On January 1 1832 a son was born to his father...


In [173]:
from spacy.matcher import Matcher 
import spacy

!python3.8 -m spacy download en_core_web_sm

In [191]:
import en_core_web_sm

# Load the en_core_web_sm pipeline and run it on the first register line,
# which serves as the worked example for the cells below.
nlp = en_core_web_sm.load()

# NOTE: this rebinds `text` from the list of all lines to a single string.
text = df.loc[0, 'text']
doc = nlp(text)

On August 21 1826 a son was born to John Bon and named him Francis.

In [192]:
# One record per token of the sample sentence: its text and its `pos_` tag.
features = [{'token': tok.text, 'pos': tok.pos_} for tok in doc]
fdf = pd.DataFrame(features)
# Show every row rather than the default first five.
fdf.head(len(fdf))


Unnamed: 0,token,pos
0,On,ADP
1,August,PROPN
2,21,NUM
3,1826,NUM
4,a,DET
5,son,NOUN
6,was,AUX
7,born,VERB
8,to,ADP
9,John,PROPN


## Extract Father

In [175]:
# Matcher pattern for the father's name: an introducing word, then one or
# more proper nouns, then a closing word.
first_tokens = ['to', 'father']  # lower-cased words allowed before the name
last_tokens = ['and', 'naming']  # lower-cased words allowed after the name
pattern_father = [
    [
        {'LOWER': {'IN': first_tokens}},
        {'POS': 'PROPN', 'OP': '+'},
        {'LOWER': {'IN': last_tokens}},
    ]
]

In [176]:
def get_father(x):
    """Extract the father's given name and surname from one register line.

    The line is processed with the en_core_web_sm pipeline and searched with
    ``pattern_father``. The tokens between the first and last token of the
    first match are taken as the father's full name.

    Parameters
    ----------
    x : str
        Raw text of a single register entry.

    Returns
    -------
    tuple of (str, str)
        ``(name, surname)``. The last name token is the surname and all
        tokens before it form the given name. A one-token name is returned
        as ``(name, '')``; ``('', '')`` is returned when nothing matches.
    """
    # Loading the model is expensive, so do it once and keep it on the
    # function object instead of reloading it for every row.
    nlp = getattr(get_father, '_nlp', None)
    if nlp is None:
        nlp = get_father._nlp = en_core_web_sm.load()

    doc = nlp(x)
    # The matcher is rebuilt on each call so that edits to pattern_father
    # take effect without redefining this function.
    matcher = Matcher(nlp.vocab)
    matcher.add("matching_father", pattern_father)

    matches = matcher(doc)
    if not matches:
        return '', ''

    _, start, end = matches[0]
    # Drop the first and last matched tokens (the anchor words) and work on
    # token texts directly rather than re-splitting the span text on spaces.
    name_tokens = [token.text for token in doc[start + 1:end - 1]]
    if not name_tokens:
        return '', ''
    if len(name_tokens) == 1:
        return name_tokens[0], ''
    return ' '.join(name_tokens[:-1]), name_tokens[-1]

In [177]:
new_columns = ['father name','surname']
# Run the extraction once per row and fan the resulting (name, surname)
# pairs out into their columns, instead of re-parsing every line once per
# output column.
father_parts = df['text'].apply(get_father)
for n,col in enumerate(new_columns):
    df[col] = father_parts.apply(lambda parts: parts[n])


In [178]:
df

Unnamed: 0,text,father name,surname
0,On August 21 1826 a son was born to John Bon a...,John,Bon
1,On June 11 1813 a daughter was born to James D...,James,Donne
2,On January 1 1832 a son was born to his father...,David,Borne


## Extract Child

In [179]:
# Matcher pattern for the child's name: a pronoun, then one or more proper
# nouns, then the closing full stop.
# NOTE: `first_tokens` and `last_tokens` are rebound here to new lists.
first_tokens = ['him', 'her']
last_tokens = ['.']
pattern_son = [
    [
        {'LOWER': {'IN': first_tokens}},
        {'POS': 'PROPN', 'OP': '+'},
        {'LOWER': {'IN': last_tokens}},
    ]
]

In [180]:
def get_child(x):
    """Extract the child's name from one register line.

    The line is processed with the en_core_web_sm pipeline and searched with
    ``pattern_son``. The tokens between the first and last token of the first
    match are joined with single spaces and returned.

    Parameters
    ----------
    x : str
        Raw text of a single register entry.

    Returns
    -------
    str
        The child's name (possibly several words), or ``''`` when the
        pattern does not match.
    """
    # Loading the model is expensive, so do it once and keep it on the
    # function object instead of reloading it for every row.
    nlp = getattr(get_child, '_nlp', None)
    if nlp is None:
        nlp = get_child._nlp = en_core_web_sm.load()

    doc = nlp(x)
    # The matcher is rebuilt on each call so that edits to pattern_son take
    # effect without redefining this function.
    matcher = Matcher(nlp.vocab)
    matcher.add("matching_son", pattern_son)

    matches = matcher(doc)
    if not matches:
        return ''

    _, start, end = matches[0]
    # Skip the leading pronoun and the trailing punctuation token; using the
    # tokens themselves avoids slicing characters off the span text.
    return ' '.join(token.text for token in doc[start + 1:end - 1])

In [181]:
# Add the child's name extracted from each register line.
df['child'] = df['text'].apply(get_child)

In [182]:
df

Unnamed: 0,text,father name,surname,child
0,On August 21 1826 a son was born to John Bon a...,John,Bon,Francis
1,On June 11 1813 a daughter was born to James D...,James,Donne,Mary Sarah
2,On January 1 1832 a son was born to his father...,David,Borne,John


## Extract Birth Date

In [183]:
# English month name -> two-digit month number; built once, not on every call.
_MONTH_NUMBERS = {
    "January": "01", "February": "02", "March": "03", "April": "04",
    "May": "05", "June": "06", "July": "07", "August": "08",
    "September": "09", "October": "10", "November": "11", "December": "12",
}


def get_date(x):
    """Return the birth date of a register line as ``YYYY-MM-DD``.

    The line is expected to start with ``On <Month> <day> <year>``: the
    second, third and fourth whitespace-separated words are read as month
    name, day and year.

    Parameters
    ----------
    x : str
        Raw text of a single register entry.

    Returns
    -------
    str
        The date with a two-digit month and a zero-padded day.

    Raises
    ------
    IndexError
        If the line has fewer than four words.
    KeyError
        If the second word is not a capitalised English month name.
    """
    # split() without an argument collapses runs of whitespace and ignores a
    # trailing newline, so doubled spaces cannot shift the token positions.
    tokens = x.split()
    month = _MONTH_NUMBERS[tokens[1]]
    # Tolerate punctuation attached to the numbers, e.g. "June 11, 1813".
    day = tokens[2].strip(',.').zfill(2)
    year = tokens[3].strip(',.')
    return "-".join((year, month, day))

In [184]:
# Add the ISO-formatted birth date parsed from each register line.
df['date'] = df['text'].apply(get_date)

In [185]:
df

Unnamed: 0,text,father name,surname,child,date
0,On August 21 1826 a son was born to John Bon a...,John,Bon,Francis,1826-08-21
1,On June 11 1813 a daughter was born to James D...,James,Donne,Mary Sarah,1813-06-11
2,On January 1 1832 a son was born to his father...,David,Borne,John,1832-01-01


## Extract Gender

In [186]:
import re


def get_gender(x):
    """Return the child's gender for one register line.

    The first standalone occurrence of the word "son" or "daughter"
    (case-insensitive) decides the result.

    Parameters
    ----------
    x : str
        Raw text of a single register entry.

    Returns
    -------
    str
        ``'M'`` if the first such word is "son", otherwise ``'F'`` (also
        when neither word is present).
    """
    # Match whole words only: a plain substring test also fires on surnames
    # such as "Johnson" and would label a daughter's record as male.
    match = re.search(r'\b(son|daughter)\b', x, flags=re.IGNORECASE)
    if match is not None and match.group(1).lower() == 'son':
        return 'M'
    return 'F'

In [187]:
# Add the gender inferred from each register line.
df['gender'] = df['text'].apply(get_gender)

In [188]:
df

Unnamed: 0,text,father name,surname,child,date,gender
0,On August 21 1826 a son was born to John Bon a...,John,Bon,Francis,1826-08-21,M
1,On June 11 1813 a daughter was born to James D...,James,Donne,Mary Sarah,1813-06-11,F
2,On January 1 1832 a son was born to his father...,David,Borne,John,1832-01-01,M


In [171]:
# Persist the structured table; index=True (the pandas default) writes the row
# index as the first, unnamed column of the file.
df.to_csv('structured_register.csv', index=True)