In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

In [2]:
#libraries for text preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import codecs
import spacy

Document Type: John Deer

In [3]:
#read in html document
j=codecs.open("data/424B2_a19-18449_4424b2.htm", 'r', 'utf-8')
john_deer= BeautifulSoup(j.read()).get_text()
john_deer = re.sub('[\s]*\n[\s]*','\n', john_deer)
john_deer = re.sub(' +',' ', john_deer) #replace multiple spaces with one space
john_deer = john_deer.strip() #Remove spaces at the beginning and at the end of the string
john_deer = john_deer.lower()
john_deer = str(john_deer)

#SpaCy NER
nlp=spacy.load('en',tagger=False,parser=False,matcher=False)
nlp.entity
#create a spacy doc object by passing John Deer document
doc = nlp(john_deer)
#print all found entities and their labels
for ent in doc.ents:
    print(ent.label_,ent.text)

In [4]:
#sentence tokenization
john_deer_sentences = [w.replace('\n','').replace('\xa0',' ') for w in sent_tokenize(john_deer)]
john_deer_sentences

['prospectus andpricing supplement no.',
 '34prospectus supplement, eachdated september 9, 2019dated april 7, 2017, as supplementedregistration statement no.',
 '333-217193by supplement no.',
 '1 dated june 27, 2018 andfiled pursuant to rule 424(b)(2)supplement no.',
 '2 dated january 4, 2019u.s. $8,800,000,000john deere capital corporationmedium-term notes, series gdue 9 months or more from date of issue$400,000,000 floating rate senior notes due june 13, 2022the medium-term notes offered hereby will be floating rate notes and senior securities as more fully described in the accompanying prospectus and prospectus supplement and will be denominated in u.s. dollars.',
 'cusip / isin:24422euz0 / us24422euz05date of issue*:september 12, 2019maturity date:june 13, 2022principal amount:$400,000,000interest rate basis:usd-libor-reuters(reuters page libor01)index maturity:3-monthspread:libor + 49 bpsday count:actual/360, adjustedinterest reset dates:quarterly on the 13th of march, june, septe

In [5]:
df_john_deer_sentences = pd.DataFrame(john_deer_sentences)

In [6]:
df_john_deer_sentences = df_john_deer_sentences[0].astype(str)

In [49]:
df_john_deer_sentences

0                  prospectus andpricing supplement no.
1     34prospectus supplement, eachdated september 9...
2                           333-217193by supplement no.
3     1 dated june 27, 2018 andfiled pursuant to rul...
4     2 dated january 4, 2019u.s. $8,800,000,000john...
5     cusip / isin:24422euz0 / us24422euz05date of i...
6     * pursuant to rule 15c6-1 under the securities...
7     accordingly, purchasers of the notes who wish ...
8     notice to prospective investors in the united ...
9     accordingly, such documents and/or materials a...
10    the communication of such documents and/or mat...
11    in the united kingdom, the notes offered hereb...
12    any person in the united kingdom that is not a...
13    notice to prospective investors in the europea...
14    this pricing supplement has been prepared on t...
15    accordingly any person making or intending to ...
16    neither the issuer nor the agents have authori...
17    the expression prospectus directive mean

In [47]:
df_john_deer_senteces = df_john_deer_sentences[0:17,].replace(': ',':').replace(' / ','/').replace('     ','-')


In [48]:
df_john_deer_sentences[5]

'cusip / isin:24422euz0 / us24422euz05date of issue*:september 12, 2019maturity date:june 13, 2022principal amount:$400,000,000interest rate basis:usd-libor-reuters(reuters page libor01)index maturity:3-monthspread:libor + 49 bpsday count:actual/360, adjustedinterest reset dates:quarterly on the 13th of march, june, september and december, commencing on september 12, 2019 and ending on the maturity dateinterest determination dates:two london business days preceding such interest reset dateinterest payment dates:quarterly on the 13th of march, june, september and december, commencing on december 13, 2019 and ending on the maturity dateminimum interest rate:0.000%day count convention:modified following, adjustedredemption provision:noneprice to public:100.000% plus accrued interest, if any, from september 12, 2019plan of distribution:nameprincipal amount of notescitigroup global markets inc.$90,000,000deutsche bank securities inc.90,000,000hsbc securities (usa) inc.90,000,000j.p. morgan 

#### Maturity Date keywords: 'maturity date', 'stated maturity', 'stated maturity date'
#### Original face amount: 'face value', 'principal amount', 'aggregate principal amount offered'

In [8]:
maturity_date_chunk = df_john_deer_sentences[5].replace(': ',':').replace(' / ','/').replace('     ','-').split('--')

In [9]:
maturity_date_chunk

['cusip/isin:24422euz0/us24422euz05date of issue*:september 12, 2019maturity date:june 13, 2022principal amount:$400,000,000interest rate basis:usd-libor-reuters(reuters page libor01)index maturity:3-monthspread:libor + 49 bpsday count:actual/360, adjustedinterest reset dates:quarterly on the 13th of march, june, september and december, commencing on september 12, 2019 and ending on the maturity dateinterest determination dates:two london business days preceding such interest reset dateinterest payment dates:quarterly on the 13th of march, june, september and december, commencing on december 13, 2019 and ending on the maturity dateminimum interest rate:0.000%day count convention:modified following, adjustedredemption provision:noneprice to public:100.000% plus accrued interest, if any, from september 12, 2019plan of distribution:nameprincipal amount of notescitigroup global markets inc.$90,000,000deutsche bank securities inc.90,000,000hsbc securities (usa) inc.90,000,000j.p. morgan sec

In [10]:
re.search(r'(maturity ?(\w*):(\w*) (\d*), (\d*))', 
                 df_john_deer_sentences[5])

<re.Match object; span=(70, 97), match='maturity date:june 13, 2022'>

In [11]:
re.search(r'(principal amount([' '])?(\w*)?([' '])?(\w*)?([' '])?(\w*)?(\:)([' '])?(\w*)?([' '])?(\$)(\d*),(\d*)?,?(\d*)?,?(\d*)?([' '])?(per note)?)', df_john_deer_sentences[5])


<re.Match object; span=(97, 115), match='principal amount:$'>

In [12]:
re.search(r'(principal amount(\:)(\$)(\d*),(\d*)?,?(\d*)?,?(\d*))', df_john_deer_sentences[5])


<re.Match object; span=(97, 126), match='principal amount:$400,000,000'>

Document Type: Wells Fargo

In [13]:
#read in html document
j=codecs.open("data/424B2_d833231d424b2.htm", 'r', 'utf-8')
wells= BeautifulSoup(j.read()).get_text()
wells = re.sub('[\s]*\n[\s]*','\n', wells)
wells = re.sub(' +',' ', wells) #replace multiple spaces with one space
wells = wells.strip() #Remove spaces at the beginning and at the end of the string
wells = wells.lower()
wells = str(wells)

In [14]:
#sentence tokenization
wells_sentences = [w.replace('\n','').replace('\xa0',' ') for w in sent_tokenize(wells)]
wells_sentences

['definitive pricing supplement no.',
 '9filed pursuant to rule 424(b)(2)registration no.',
 '333-216234pricing supplement no.',
 '9 dated february 4, 2020(to prospectus supplement dated april 7, 2017and prospectus dated march 14, 2019)wells fargo & companymedium-term notes, series qsenior redeemable fixed-to-floating rate notesyou should read the more detailed description of the notes provided under \x93description of notes\x94 in the accompanyingprospectus supplement and \x93description of debt securities\x94 in the accompanying prospectus, as supplemented by this pricing supplement.',
 'all payments on the notes are subject to the credit risk of wells fargo & company.',
 'if wellsfargo & company defaults on its obligations, you could lose some or all of your investment.',
 'certain defined terms used but not defined herein have the meanings set forth in the accompanying prospectus supplement and prospectus.',
 'aggregate principal amount offered:$3,000,000,000trade date:february 4, 

In [15]:
df_wells_sentences = pd.DataFrame(wells_sentences)

In [18]:
df_wells_sentences = df_wells_sentences[0].astype(str)

In [20]:
wells_chunk = df_wells_sentences[7].replace(': ',':').replace(' / ','/').replace('     ','-').split('--')
wells_chunk

['aggregate principal amount offered:$3,000,000,000trade date:february 4, 2020original issue date:february 11, 2020 (t+5)stated maturity date:february 11, 2026; on the stated maturity date, the holders of the notes will be entitled to receive a cash payment inu.s. dollars equal to 100% of the principal amount of the notes plus any accrued and unpaid interestoptional redemption:at our option, we may redeem the notes, in whole at any time or in part from time to time on any day included in themake-whole redemption period, at a redemption price equal to the sum of:(i) 100% of the principal amount of the notes being redeemed plus accrued and unpaid interest thereon, to, but excluding, the make-whole redemption date and (ii) themake-whole amount, as described under \x93description of debt securities\x97redemption and repayment\x97optional make-whole redemption of debt securities\x94 in the accompanying prospectus.']

In [84]:
re.search(r'((\w*)?([' '])?(principal amount)([' '])?(\w*)?(\:\$)(\d*)(\,)?(\d*)?(\,)?(\d*)?(\,)?(\d*)?)', df_wells_sentences[7])


<re.Match object; span=(20, 49), match='amount offered:$3,000,000,000'>

In [83]:
re.search(r'((principal amount)([' '])(\w*)(\:\$))', df_wells_sentences[7])

error: unterminated character set at position 20

In [73]:
re.search(r'((principal amount)([' '])(\w*)(\:)(\$))', df_wells_sentences[7])

error: unterminated character set at position 20

In [66]:
re.search(r'((\w*)?([' '])?principal(\:)(\$)(\d*)(\,)?(\d*)?(\,)?(\d*)?(\,)?(\d*)?)', df_wells_sentences[7])


error: unterminated character set at position 8