In [173]:
# Pandas
import pandas as pd

# Import dependencies
from PyPDF2 import PdfFileReader

# spaCY for NLP
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

# Import Heapq 
from heapq import nlargest

# Import HuggingFace Transformer
from transformers import pipeline


# Import Google's T5 Model Transformer
from transformers import TFAutoModelWithLMHead, AutoTokenizer


# NLTK for splitting sentences
import nltk.data

#textblob
from textblob import TextBlob


# Incompatabilties with spaCy 3.0
from spacytextblob.spacytextblob import SpacyTextBlob


#Regex
import re

# Pretty Print
import pprint

In [174]:
# Read the sample file and create the PDF Object

pdf_path='data/sample.pdf'
pdf = PdfFileReader(str(pdf_path))



#### Preview PDF

In [175]:
def _embed_pdf_html(pdf, size):
    return '<iframe src={0} width={1[0]} height={1[1]}></iframe>'.format(pdf, size)

In [176]:
print(_embed_pdf_html('./data/sample.pdf',size=(300,250)))

<iframe src=./data/sample.pdf width=300 height=250></iframe>


In [177]:
# total num of pages in PDF
print(pdf.numPages)

19


In [178]:
# Get 5th Page. PDF is 0-indexed, so 5th page is at 4th index
pdf.getPage(4).extractText()

'4Rewriting the rules: Succeeding in the new retail banking landscape \nWithin any specific market, of course, there are \nbanks that have acted swiftly to adopt digital and \nremote as their main channel for interactions, and \n\nthose that have lagged behind (Exhibit 4, page 6). One UK bank makes more than 50 percent of sales \n\nthrough digital channels, well ahead of market peers. \n\nEven in the historically branch-dependent small \nbusiness segment, a range of banks and fintechs are \n\nfinding that remote value propositions delivered \n\ndigitally (e.g., remote relationship managers at Nordic \n\nbanks) are attractive to small businesses. \nBanks that are ahead of the curve in terms of \ncapitalizing on this shift are pulling away from the \n\npack and have taken decisive actions on several \nfronts:  \n  Set a bold aspiration for sales/service channel \nmix. Banks must do more than react to shifts \nin consumer preferences—they need to set \n\naspirational targets for sales and

## Read and Extract PDF

In [179]:
with open('data/extract_2.txt',mode="w") as output_file:
    for page in pdf.pages:
        text = page.extractText()
        
        output_file.write(text)

## Clean Text

In [180]:
def clean_text(filename, isFile = True):
    if isFile:
        fp = open(filename)
        data = fp.read().replace('\n', '')
    else:
        data = filename
        
    return re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", data).strip()

### Sampled Sentences

In [181]:
document1 ="""Succeeding in the new retail banking landscape Retail banks have long competed on distribution, realizing economies of scale through network effects and investments in brand and infrastructure. But even those scale economies had limits above a certain size. As a result, in most retail banking markets, a few large institutions, operating at similar efficiency ratios, dominate market share. Changes to the retail banking business model have mostly come in response to regulatory shifts, as opposed to a purposeful reimagining of what the winning bank of the future will look like. Retail banks have also not kept pace with the improvements in customer experience seen in other consumer industries. Few banks stand out for innovation in customer interaction models or branch formats. Marketing investments have traditionally focused on brand building and increasing loyalty: a reputable brand stood for trust and security and became a moat, providing protection against new entrants to the sector. Finally, most banks offer similar products, with limited innovation in features. Today, the moats that banks have built are more likely to restrict their own progress than protect them from attackers. Four shifts are reshaping the global retail banking landscape to the point where banks need to fundamentally rethink what it takes to compete and win. This should be an urgent priority for banks. The pace of change will likely accelerate, with a select set of large-scale winners emerging in the next three to five years that will gain share in their core markets and begin to compete across borders, leaving many sub-scale institutions scrambling for relevance. Rewriting the rules: Succeeding in the new retail banking landscape 2Rewriting the rules: Succeeding in the new retail banking landscape Four shifts are reshaping retail banking Four secular shifts are changing the way retail banks will compete in the coming years:    The traditional distribution-led growth formula is losing relevance, with a breakdown in the relationship between branch footprint and growth. Banks are now competing on customer experience, with leaders growing faster than their competitors.    Scale economics are back—banks that excel at deploying new technologies (e.g., automation and machine learning) and those with a more digital-centric channel mix will have a structural cost advantage.    The retail banking relationship is getting unbundled along product lines, fueled by digitization and, increasingly, by regulation— and the biggest profit pools are under attack. A few banks will be able to temper and reverse this trend, using superior data and analytics capabilities to build deeper and broader relationships with their customers. In the near term, these shifts will combine to intensify competition as new entrants and incumbents seek to push past traditional boundaries. More than 100 digital attacker banks have launched in the last few years globally— including N26, Monzo and Revolut in Europe; WeBank, Digibank, Jenius in Asia; SoFi, Marcus, Moven in the US; Nubank in South America— while many incumbents have launched or are considering launching digital-first models.  Over the next three to five years, we expect a few players to emerge from this competitive scrum to gain dominant share in their core markets and possibly beyond. These firms will have taken bold and decisive actions to capitalize on the shifts that are reshaping the industry. In some cases, these winners will be incumbents that build on an already significant share; in others, they will be institutions newer to the banking industry, which use their agility, strategic aggressiveness, and sharp execution to attract customers.   1. The traditional distribution-led growth formula no longer applies Until the financial crisis in 2007, a retail bank’s total share of deposits was tightly linked to the size of its branch network. Even as internet use grew rapidly, customers still visited branches for account servicing and to learn about and purchase new products. The physical branch created a sense of security and trust. Banks with higher branch density benefited from a network effect—the more branches, the higher the likelihood of acquiring and retaining customers. Over the past decade, this relationship between deposit growth and branch density has weakened. Deposits at the 25 largest US retail banks have doubled over the past decade, while their combined branch footprint shrank by 15 percent over the same period. This reverse correlation is even sharper for the top five US banks—while reducing branches by 15 percent, they increased deposits by 2.6 times ( Exhibit 1). While there have been previous periods of branch contraction, they were clearly tied to economic downturns; this most recent wave of retrenchment commenced about a decade ago and has persisted through a period of robust economic growth. Retail banking branch networks are contracting across North America, the UK, and Europe (Exhibit 2, page 4), although the pace of change varies considerably between regions. Those that are ahead of the curve have reduced branches by as much as 71 percent (Netherlands). Banks in North America and Southern Europe are 3Rewriting the rules: Succeeding in the new retail banking landscape reducing branches and growing digital sales at a more gradual rate. In many Asian, African, and Latin American countries, branch reduction is not so apparent—only because retail banks in these markets leapfrogged branch distribution to go directly to digital sales. The rate of branch reduction is often tied to customer willingness to purchase banking products online or on mobile devices. Eighty to ninety percent of banking customers in the Nordics, for example, are open to digital product purchases for most financial products, compared to 50 to 60 percent in North America and Southern Europe ( Exhibit 3, page 5). While customer willingness to purchase products via digital channels varies, however, the common thread is that in all markets this readiness is far ahead of actual digital sales and will require banks to catch up to consumer needs and expectations."""

In [182]:
document2 = """Customer experience is beginning to gener- ate meaningful separation in growth Across all retail businesses—including banks— customers now expect interactions to be simple, intuitive, and seamlessly connected across physical and digital touchpoints.  Banks are investing in meeting these expectations but have struggled to keep pace—focusing on the less-than-lofty goal of making the experience “less bad” for customers, rather than “outstanding.” Many banks are hampered by legacy IT infrastructures and siloed data. As a result, few banks are true leaders in terms of customer experience. Even for leading institutions, typically only one-half to two-thirds of customers rate their experience as excellent. This holds true across product categories, including those such as cards and deposits that have higher digital adoption rates.  The impact of this less-than-stellar performance is measurable. For example, McKinsey analysis shows that in the US, top-quartile banks in terms of experience have had meaningfully higher deposit growth over the past three years ( Exhibit 5, page 8). Simply being good does not move the needle; customer impact really shows when banks offer outstanding experiences. The few “experience leaders” emerging in retail banking are generating higher growth than their peers by attracting new customers and deepening relationships with their existing customer base. Highly satisfied customers are two-and-a-half times more likely to open new accounts/products with their existing bank than those who are merely satisfied; they are also less price sensitive and generate positive word of mouth. These experience leaders are adopting tactics pioneered by digital-native companies in other sectors such as ecommerce, travel, and entertainment: setting a “north star” based on proven markers of differentiated experience (e.g., UX design, carrying context across channels), redesigning journeys that matter most for digital- first customers and not just digital-only customers, and establishing integrated real-time measurement that cuts across products, channels, and employees. These banks know that customer experience is not just about the front- end look and feel, but that it requires discipline, focus, and investment in the following actions:    Focus on the journeys and sub-journeys that matter: The relative contribution of sub- journeys (e.g., app downloading; activating account) in determining overall customer experience varies considerably. In fact, ten to fifteen sub-journeys have the biggest customer satisfaction impact for most products and should thus be the first priority. For instance, when opening a new deposit account, the researching options sub-journey has eight times the impact on customer satisfaction than other account-opening sub- journeys, on average ( Exhibit 6, page 9). For banks, the key is to prioritize these high- impact sub-journeys and systematically redesign them from scratch—a process that can take about three to four months and result in at least a 15 to 20 percent lift in customer satisfaction.   Change the way you engage with customers. Experience leaders understand that digitization is not just about creating a cutting-edge online and mobile experience, and that satisfaction is shaped by customer experience across channels.1 The experience should be seamless, especially on journeys that are more likely to take place over multiple channels, such as new account opening, financial 1  Walter Rizzi and Zubin Taraporevala, “The balancing act: Omnichannel excellence in retail banking,” McKinsey.com, January 2019. 8Rewriting the rules: Succeeding in the new retail banking landscape advice, or issue resolution. One wealth manager equipped its front-line relationship managers with robo-advice algorithms that are in sync with what customers see on the self- directed channel—and provided the RMs with daily and weekly next-best-action recommendations to nudge their clients. Banks need to deploy these tools broadly and empower their front-line staff to play a more consultative role that blends human and digital recommendations. They will also need to revisit how these employees are incentivized, shifting to a longer-term view of relationships and profitability rather than just product sales. Translate data into personalized products and real-time offers. The amount of data available on individual customers or prospects has exploded in recent years. The challenge is to convert this data into actionable nudges and highly relevant offers for customers that are delivered at the right moment. Credit card companies have long offered discounts on specific spending categories or with specific retailers. Today, however, they can improve loyalty and share of spending by providing location-specific offers right when a customer enters a coffee shop, movie theater, or car dealership. South Real differences in customer satisfaction 1CSAT (Percent of customers rating 9 or 10) Leaders in customer experience are growing faster Deposit CAGR (2014-17) 65554939-26 5.93.2 46% Top quartile CSAT Bottom quartile CSAT Bottom quartile 3rdquartile 2nd quartile Top quartile Exhibit 5 1 Percentage of respondents that selected a 9 or 10 on a 10-point customer satisfaction scale. Question: “We would like to understand your experience with [product] with [Bank]. Overall, how satised or dissatised are you with [product] with [Bank]? ”Source: McKinsey 2018 Retail Banking Customer Experience Benchmark Survey. US retail banks with better customer experience are growing deposits faster. 9Rewriting the rules: Succeeding in the new retail banking landscape Africa’s Discovery, as an example, is launching a bank with product features that are informed by behavioral science and incentive design research (e.g., dynamic interest rates for savings and credit products that are tied to healthy financial behavior). 3. Productivity gains and returns to scale are back Larger retail banks have historically tended to be more efficient than their smaller competitors, benefiting from distribution network effects and the shared overhead for IT, infrastructure, and other shared services. Our analysis of over 3,000 515 01020253040 3545 5050055606570Importance Relative importance % 3Researching options Deposit sub-journeys 1Downloading the app Activating the account Learning about account features Completing the application Performance Journey SAT (top 2 boxes) 2Exhibit 6 1 Deposit example journeys shown; trend also holds true for credit card and mortgage account-opening sub-journeys. 2 Customer satisfaction with individual sub-journeys. Represents percentage of customers that rated the sub-journey 9 or 10 (on 10-point scale). 3 Relative importance is the amount that any individual sub-journey impacts the total journey score; calculated using a variance decomposition approach. Source: McKinsey 2018 North American Retail Banking Journey Benchmarks Banks should prioritize the sub-journeys the have the most impact on customer experience. 10Rewriting the rules: Succeeding in the new retail banking landscape banks around the globe shows that while there is variation across countries, larger institutions tend to be more efficient both in terms of cost-to-asset (Exhibit 7) and cost-to-income ratios. However, beyond a certain point, even larger institutions struggle to eke out efficiencies or realize benefits from scale. For example, even after several years of cost-cutting after the most recent financial crisis, large US retail banks have not made material improvements in productivity over the past decade (Exhibit 8).   We expect this paradigm to change over the next few years, as structural improvements in efficiency ratios and increasing returns to scale enable some large banks to become even more efficient. The reason is two-fold: first, advances in technologies such as robotic process automation, machine learning, and cognitive artificial intelligence—many of which are now mainstream and commercially viable—are unleashing a new wave of productivity improvements for financial institutions."""

# Using spaCy

In [183]:
def generate_summary(input_text, mode="FAST"):
    
    if mode is "ACCURACY": # Use different models, accurate models are slower 
        nlp = spacy.load('en_core_web_trf')
    else:
        nlp = spacy.load('en_core_web_sm')
        
        
    doc_obj = nlp(input_text)
    stopwords = list(STOP_WORDS)

    word_freq = {}  
    for word in doc_obj:  
        if word.text not in stopwords:
            if word.text not in word_freq.keys():
                word_freq[word.text] = 1
            else:
                word_freq[word.text] += 1


    max_freq = max(word_freq.values())

    for word in word_freq.keys():  
        word_freq[word] = (word_freq[word]/max_freq)

    # Sentence Tokens
    sentence_list = [ sentence for sentence in doc_obj.sents ]

    # Calculate Sentence Score and Ranking
    sentence_scores = {}  
    for sent in sentence_list:  
        for word in sent:
            if word.text.lower() in word_freq.keys():
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_freq[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_freq[word.text.lower()]

    # Find N Largest
    summary_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
    final_sentences = [ w.text for w in summary_sentences ]
    summary = ' '.join(final_sentences)
    
    print("Original Text\n")
    print(input_text)
    print("\n Original Total Length:",len(input_text))
    print('\n\nSummarized Text\n')
    print(summary)
    print("\nSummary Total Length:",len(summary))

## Summarization

In [184]:
generate_summary(document1, "FAST")

Original Text

Succeeding in the new retail banking landscape Retail banks have long competed on distribution, realizing economies of scale through network effects and investments in brand and infrastructure. But even those scale economies had limits above a certain size. As a result, in most retail banking markets, a few large institutions, operating at similar efficiency ratios, dominate market share. Changes to the retail banking business model have mostly come in response to regulatory shifts, as opposed to a purposeful reimagining of what the winning bank of the future will look like. Retail banks have also not kept pace with the improvements in customer experience seen in other consumer industries. Few banks stand out for innovation in customer interaction models or branch formats. Marketing investments have traditionally focused on brand building and increasing loyalty: a reputable brand stood for trust and security and became a moat, providing protection against new entrants to

In [185]:
#generate_summary(document2, "ACCURACY")

## Named Entity Recognition

In [186]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(document1)
_table = []

for ent in doc.ents:
    _row = []
    _row.append(ent.text)
    _row.append(ent.start_char)
    _row.append(ent.end_char)
    _row.append(ent.label_)
    
    _table.append(_row)
    #print(ent.text, ent.start_char, ent.end_char, ent.label_)

pd.DataFrame(_table, columns=["TEXT", "START_CHAR", "END_CHAR", "LABEL"])

Unnamed: 0,TEXT,START_CHAR,END_CHAR,LABEL
0,Today,1079,1084,DATE
1,Four,1199,1203,CARDINAL
2,the next three to five years,1491,1519,DATE
3,2Rewriting,1730,1740,CARDINAL
4,Four,1799,1803,CARDINAL
5,Four,1840,1844,CARDINAL
6,the coming years,1910,1926,DATE
7,Scale,2177,2182,GPE
8,More than 100,2871,2884,CARDINAL
9,the last few years,2925,2943,DATE


# Using HuggingFace Transformers (SoA)

In [187]:
# Initialize the HuggingFace summarization pipeline
summarizer = pipeline("summarization")


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


# Read PDF Line-by-line and SUMMARIZE

In [108]:
# Be careful before running, this - this cell takes time!

page_count = 1

for page in pdf.pages:
    text = page.extractText()
    
    #Config the HF Summarization pipeline
    summarized_text = summarizer(text, min_length=75, max_length=300)
    
    # Print summarized text
    print("\n---- Page No. " + str(page_count) + "----")
    print ("Original Text Length: " + str(len(text)))
    print("Original Text: " + text)
    print ("--")
    print ("Summarized Text Length: " + str(len(summarized_text[0]['summary_text'])))
    print(summarized_text[0]['summary_text'])
    page_count += 1

Your max_length is set to 300, but you input_length is only 44. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)



---- Page No. 1----
Original Text Length: 166
Original Text: Rewriting the rules: Succeeding in the  
new retail banking  

landscape  Authored by: 
Vaibhav Gujral 
Nick Malik 
Zubin Taraporevala  
Global Banking  February 2019
--
Summarized Text Length: 286
rewriting the rules: Succeeding in the new retail banking landscape Authored by: Vaibhav Gujral Nick Malik Zubin Taraporevala Global Banking February 2019 . the rules are based on the rules of the retail banking industry . it is the first time the banking landscape has been rewritten .


Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors



---- Page No. 2----
Original Text Length: 1809
Original Text: 1Rewriting the rules: Succeeding in the new retail banking landscape 
Retail banks have long competed on distribution, realizing economies of scale through network 

effects and investments in brand and 

infrastructure. But even those scale economies 

had limits above a certain size. As a result, in 

most retail banking markets, a few large 

institutions, operating at similar efficiency ratios, 

dominate market share. Changes to the retail 

banking business model have mostly come in 
response to regulatory shifts, as opposed to a 

purposeful reimagining of what the winning bank 

of the future will look like.  
Retail banks have also not kept pace with the improvements in customer experience seen in 

other consumer industries. Few banks stand out 
for innovation in customer interaction models or 
branch formats. Marketing investments have 
traditionally focused on brand building and increasing loyalty: a reputable b

Your max_length is set to 300, but you input_length is only 48. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)



---- Page No. 18----
Original Text Length: 2282
Original Text: 17Rewriting the rules: Succeeding in the new retail banking landscape 
The path forward
In the near term, competition in retail banking is 
likely to intensify, with only a few banks emerg-

ing as winners. These banks will take bold steps 
now to establish a formidable position that fends 
off new entrants and smaller attackers. Retail 

banking leaders should consider and debate the 
following questions as they face the challenges 
of the coming years:  1.What is our strategy to increase customer 
growth over the next three to five years, not 

just protect existing share of wallet?  
2.How bold can we be with our distribution 
plan, and will it be effective two to three years 

from now?  
3.Are we on track to deliver a superior customer 
experience in the next 12 months? What ca-
pabilities will differentiate us from the compe-

tition?  4.How can we deploy new technologies to re-
duce our cost structure? Are we buildi

# Use Google T5 Model

In [188]:
from transformers import TFAutoModelWithLMHead, AutoTokenizer
model = TFAutoModelWithLMHead.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [189]:
# T5 uses a max_length of 512 so we cut the article to 512 tokens.
inputs = tokenizer.encode("summarize: " + document1, return_tensors="tf", max_length=512)
outputs = model.generate(inputs, max_length=250, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [190]:
# Decoding and printing summary

GT5_summary=tokenizer.decode(outputs[0], skip_special_tokens=True)
print(GT5_summary)

four shifts are reshaping the global retail banking landscape. banks need to fundamentally rethink what it takes to compete and win. banks are now competing on customer experience, with leaders growing faster.


In [191]:
summary_text_t5 = ''

for ids in outputs:
    summary_text_t5 += tokenizer.decode(ids, skip_special_tokens=True)

print("Original: " + document1)
print("\n\nSummarized T5:" + summary_text_t5)

Original: Succeeding in the new retail banking landscape Retail banks have long competed on distribution, realizing economies of scale through network effects and investments in brand and infrastructure. But even those scale economies had limits above a certain size. As a result, in most retail banking markets, a few large institutions, operating at similar efficiency ratios, dominate market share. Changes to the retail banking business model have mostly come in response to regulatory shifts, as opposed to a purposeful reimagining of what the winning bank of the future will look like. Retail banks have also not kept pace with the improvements in customer experience seen in other consumer industries. Few banks stand out for innovation in customer interaction models or branch formats. Marketing investments have traditionally focused on brand building and increasing loyalty: a reputable brand stood for trust and security and became a moat, providing protection against new entrants to the 

In [75]:
page_count = 1

'''

for page in pdf.pages:
    text = page.extractText()
    
    #Config the HF Summarization pipeline
    summarized_text = summarizer(text, min_length=75, max_length=300)
  
    # Setting max_length of 512 for t5-so we cut the article to 512 tokens.
    chunked_input = tokenizer.encode("summarize: " + text, return_tensors="tf", max_length=512)
    summarized_output = model.generate(chunked_input, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    
    
    # Print summarized text
    print("\n---- Page No. " + str(page_count) + "----")
    print ("Original Text Length: " + str(len(text)))
    print("Original Text: " + text)
    print ("--")
    print ("Summarized Text Length: " + str(len(summarized_output)))
    print(summarized_output)
    page_count += 1
    
'''

Your max_length is set to 300, but you input_length is only 44. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)



---- Page No. 1----
Original Text Length: 166
Original Text: Rewriting the rules: Succeeding in the  
new retail banking  

landscape  Authored by: 
Vaibhav Gujral 
Nick Malik 
Zubin Taraporevala  
Global Banking  February 2019
--
Summarized Text Length: 1
[{'summary_text': 'rewriting the rules: Succeeding in the new retail banking landscape Authored by: Vaibhav Gujral Nick Malik Zubin Taraporevala Global Banking February 2019 . the rules are based on the rules of the retail banking industry . it is the first time the banking landscape has been rewritten .'}]

---- Page No. 2----
Original Text Length: 1809
Original Text: 1Rewriting the rules: Succeeding in the new retail banking landscape 
Retail banks have long competed on distribution, realizing economies of scale through network 

effects and investments in brand and 

infrastructure. But even those scale economies 

had limits above a certain size. As a result, in 

most retail banking markets, a few large 

institutions, operatin

Your max_length is set to 300, but you input_length is only 48. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)



---- Page No. 18----
Original Text Length: 2282
Original Text: 17Rewriting the rules: Succeeding in the new retail banking landscape 
The path forward
In the near term, competition in retail banking is 
likely to intensify, with only a few banks emerg-

ing as winners. These banks will take bold steps 
now to establish a formidable position that fends 
off new entrants and smaller attackers. Retail 

banking leaders should consider and debate the 
following questions as they face the challenges 
of the coming years:  1.What is our strategy to increase customer 
growth over the next three to five years, not 

just protect existing share of wallet?  
2.How bold can we be with our distribution 
plan, and will it be effective two to three years 

from now?  
3.Are we on track to deliver a superior customer 
experience in the next 12 months? What ca-
pabilities will differentiate us from the compe-

tition?  4.How can we deploy new technologies to re-
duce our cost structure? Are we buildi

# ABSA with spaCy

### Extract Sentences from the Text

In [192]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [193]:
# cleaned_data = clean_text("extract_2.txt") 

# Clean the Text 
cleaned_data = clean_text(document1, False) 

# print('\n-\n'.join(nltk.tokenize.sent_tokenize(cleaned_data)))

sentences = nltk.tokenize.sent_tokenize(cleaned_data)

print(sentences)

['Succeeding in the new retail banking landscape Retail banks have long competed on distribution, realizing economies of scale through network effects and investments in brand and infrastructure.', 'But even those scale economies had limits above a certain size.', 'As a result, in most retail banking markets, a few large institutions, operating at similar efficiency ratios, dominate market share.', 'Changes to the retail banking business model have mostly come in response to regulatory shifts, as opposed to a purposeful reimagining of what the winning bank of the future will look like.', 'Retail banks have also not kept pace with the improvements in customer experience seen in other consumer industries.', 'Few banks stand out for innovation in customer interaction models or branch formats.', 'Marketing investments have traditionally focused on brand building and increasing loyalty: a reputable brand stood for trust and security and became a moat, providing protection against new entran

### Split the sentences to get target aspects

In [194]:
for sentence in sentences:
  doc = nlp(sentence)
  for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
      token.pos_,[child for child in token.children])

Succeeding advcl competed VERB VERB [in]
in prep Succeeding VERB ADP [landscape]
the det landscape NOUN DET []
new amod landscape NOUN ADJ []
retail amod banking NOUN ADJ []
banking compound landscape NOUN NOUN [retail]
landscape pobj in ADP NOUN [the, new, banking]
Retail amod banks NOUN ADJ []
banks nsubj competed VERB NOUN [Retail]
have aux competed VERB AUX []
long advmod competed VERB ADV []
competed ROOT competed VERB VERB [Succeeding, banks, have, long, on, ,, realizing, .]
on prep competed VERB ADP [distribution]
distribution pobj on ADP NOUN []
, punct competed VERB PUNCT []
realizing advcl competed VERB VERB [economies, through]
economies dobj realizing VERB NOUN [of]
of prep economies NOUN ADP [scale]
scale pobj of ADP NOUN []
through prep realizing VERB ADP [effects]
network compound effects NOUN NOUN []
effects pobj through ADP NOUN [network, and, investments, in]
and cc effects NOUN CCONJ []
investments conj effects NOUN NOUN []
in prep effects NOUN ADP [brand]
brand pobj

### get sentence descriptions

In [195]:
for sentence in sentences:
  doc = nlp(sentence)
  descriptive_term = ''
  for token in doc:
    if token.pos_ == 'ADJ':
      prepend = ''
      for child in token.children:
        if child.pos_ != 'ADV':
          continue
        prepend += child.text + ' '
      descriptive_term = prepend + token.text
  print(sentence)
  print(descriptive_term)

Succeeding in the new retail banking landscape Retail banks have long competed on distribution, realizing economies of scale through network effects and investments in brand and infrastructure.
Retail
But even those scale economies had limits above a certain size.
certain
As a result, in most retail banking markets, a few large institutions, operating at similar efficiency ratios, dominate market share.
similar
Changes to the retail banking business model have mostly come in response to regulatory shifts, as opposed to a purposeful reimagining of what the winning bank of the future will look like.
purposeful
Retail banks have also not kept pace with the improvements in customer experience seen in other consumer industries.
other
Few banks stand out for innovation in customer interaction models or branch formats.
Few
Marketing investments have traditionally focused on brand building and increasing loyalty: a reputable brand stood for trust and security and became a moat, providing prote

### identify the targets/aspects

In [196]:
aspects = []
for sentence in sentences:
  doc = nlp(sentence)
  descriptive_term = ''
  target = ''
  for token in doc:
    if token.dep_ == 'nsubj' and token.pos_ == 'NOUN':
      target = token.text
    if token.pos_ == 'ADJ':
      prepend = ''
      for child in token.children:
        if child.pos_ != 'ADV':
          continue
        prepend += child.text + ' '
      descriptive_term = prepend + token.text
  aspects.append({'aspect': target,
    'description': descriptive_term})


#print(aspects)
pprint.pprint(aspects)

[{'aspect': 'banks', 'description': 'Retail'},
 {'aspect': 'economies', 'description': 'certain'},
 {'aspect': 'institutions', 'description': 'similar'},
 {'aspect': 'bank', 'description': 'purposeful'},
 {'aspect': 'banks', 'description': 'other'},
 {'aspect': 'banks', 'description': 'Few'},
 {'aspect': 'brand', 'description': 'new'},
 {'aspect': 'banks', 'description': 'limited'},
 {'aspect': 'banks', 'description': 'own'},
 {'aspect': 'banks', 'description': 'retail'},
 {'aspect': '', 'description': 'urgent'},
 {'aspect': 'pace', 'description': 'scale'},
 {'aspect': 'formula', 'description': 'traditional'},
 {'aspect': 'leaders', 'description': ''},
 {'aspect': 'economics', 'description': 'structural'},
 {'aspect': 'pools', 'description': 'biggest'},
 {'aspect': 'banks', 'description': 'broader'},
 {'aspect': 'shifts', 'description': 'traditional'},
 {'aspect': 'incumbents', 'description': 'first'},
 {'aspect': 'players', 'description': 'dominant'},
 {'aspect': 'firms', 'description

### find sentiments on aspects

In [197]:
for aspect in aspects:
  aspect['sentiment'] = TextBlob(aspect['description']).sentiment
pprint.pprint(aspects)

[{'aspect': 'banks',
  'description': 'Retail',
  'sentiment': Sentiment(polarity=0.0, subjectivity=0.0)},
 {'aspect': 'economies',
  'description': 'certain',
  'sentiment': Sentiment(polarity=0.21428571428571427, subjectivity=0.5714285714285714)},
 {'aspect': 'institutions',
  'description': 'similar',
  'sentiment': Sentiment(polarity=0.0, subjectivity=0.4)},
 {'aspect': 'bank',
  'description': 'purposeful',
  'sentiment': Sentiment(polarity=0.0, subjectivity=0.0)},
 {'aspect': 'banks',
  'description': 'other',
  'sentiment': Sentiment(polarity=-0.125, subjectivity=0.375)},
 {'aspect': 'banks',
  'description': 'Few',
  'sentiment': Sentiment(polarity=-0.2, subjectivity=0.1)},
 {'aspect': 'brand',
  'description': 'new',
  'sentiment': Sentiment(polarity=0.13636363636363635, subjectivity=0.45454545454545453)},
 {'aspect': 'banks',
  'description': 'limited',
  'sentiment': Sentiment(polarity=-0.07142857142857142, subjectivity=0.14285714285714285)},
 {'aspect': 'banks',
  'descript

### Using spaCy for SA

In [198]:
# This is not ABSA!!
from textblob import TextBlob

#nlp = spacy.load('en_core_web_sm')
#print(nlp.pipe_names)
#nlp.add_pipe('spacytextblob')

_table = []

for sentence in sentences:
    _row = []
    blob = TextBlob(sentence)
    _row.append(sentence)
    _row.append(blob.sentiment_assessments.polarity)
    _row.append(blob.sentiment_assessments.subjectivity)
    _row.append(blob.sentiment_assessments.assessments)
    _table.append(_row)

    
pd.DataFrame(_table, columns=["SENTENCE", "POLARITY", "SUBJECTIVITY", "ASSESSMENTS"])

Unnamed: 0,SENTENCE,POLARITY,SUBJECTIVITY,ASSESSMENTS
0,Succeeding in the new retail banking landscape...,0.043182,0.427273,"[([new], 0.13636363636363635, 0.45454545454545..."
1,But even those scale economies had limits abov...,0.107143,0.335714,"[([above], 0.0, 0.1, None), ([certain], 0.2142..."
2,"As a result, in most retail banking markets, a...",0.128571,0.357143,"[([most], 0.5, 0.5, None), ([few], -0.2, 0.1, ..."
3,Changes to the retail banking business model h...,0.333333,0.458333,"[([mostly], 0.5, 0.5, None), ([winning], 0.5, ..."
4,Retail banks have also not kept pace with the ...,-0.125,0.375,"[([other], -0.125, 0.375, None)]"
5,Few banks stand out for innovation in customer...,-0.2,0.1,"[([few], -0.2, 0.1, None)]"
6,Marketing investments have traditionally focus...,0.212121,0.668182,"[([traditionally], 0.0, 0.75, None), ([reputab..."
7,"Finally, most banks offer similar products, wi...",0.142857,0.347619,"[([finally, most], 0.5, 0.5, None), ([similar]..."
8,"Today, the moats that banks have built are mor...",0.366667,0.833333,"[([more], 0.5, 0.5, None), ([likely], 0.0, 1.0..."
9,Four shifts are reshaping the global retail ba...,0.4,0.2,"[([global], 0.0, 0.0, None), ([win], 0.8, 0.4,..."
