In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy as sp
import fitz 

In [2]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""

    for page_number in range(doc.page_count):
        page = doc[page_number]
        text += page.get_text()

    doc.close()
    return text

In [3]:
pdf_path = "C:\桌面\Desktop\Fluence 10k.pdf"
text = extract_text_from_pdf(pdf_path)

  pdf_path = "C:\桌面\Desktop\Fluence 10k.pdf"


In [4]:
npl= sp.load('en_core_web_sm')
doc= npl(text)

In [5]:
entities = [(ent.text, ent.label_) for ent in doc.ents]


df = pd.DataFrame(entities, columns=['Entity', 'Label'])
df=df.drop_duplicates()
org_df=df[df['Label'] == 'ORG']

In [7]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(npl.vocab)
company_suffixes = ["Inc.", "Ltd.", "Corp.", "LLC", "PLC", "GmbH", "S.A.", "Pty Ltd",'Fluence Energy']
patterns = [npl.make_doc(text) for text in company_suffixes]
matcher.add("COMPANY", None, *patterns)

company_entities = []

for entity in org_df['Entity']:
    entity_doc = npl(entity)
    matches = matcher(entity_doc)
    if matches:
        company_entities.append(entity)

# 
company_df = pd.DataFrame(company_entities, columns=['Company'])
company_df

Unnamed: 0,Company
0,"Fluence Energy, Inc."
1,Fluence Energy
2,LLC
3,"Organization\nFluence Energy, Inc."
4,"Siemens Industry, Inc."
5,"Fluence Energy, Inc.’s"
6,Fluence Energy Global\nProduction Operation
7,Restated Limited Liability Company\nAgreement ...
8,LLC Interests
9,Amended and Restated Limited Liability Agreeme...


In [8]:
relations = []
target_company = "Fluence Energy, Inc."

for sent in doc.sents:
    for company in company_df["Company"]:
        if company != target_company:
            if target_company in sent.text and company in sent.text:
                relations.append((target_company, company, sent.text.replace("\n", "")))

relations_df = pd.DataFrame(relations, columns=["Company1", "Company2", "Sentence"]).drop_duplicates(subset=["Company1", "Company2"])


In [9]:
relations_df['Company1'] = relations_df['Company1'].str.replace('\n', ' ')
relations_df['Company2'] = relations_df['Company2'].str.replace('\n', ' ')

In [10]:
relations_df

Unnamed: 0,Company1,Company2,Sentence
0,"Fluence Energy, Inc.",Fluence Energy,For the transition period from toCommission fi...
2,"Fluence Energy, Inc.","Organization Fluence Energy, Inc.",3PART IITEM 1. BUSINESSInception and Organizat...
3,"Fluence Energy, Inc.",Organization Fluence Energy,3PART IITEM 1. BUSINESSInception and Organizat...
5,"Fluence Energy, Inc.",LLC,"As thesole managing member of Fluence Energy, ..."
12,"Fluence Energy, Inc.",LLC Interests,Current Ownership of Continuing Equity OwnersA...
13,"Fluence Energy, Inc.",LLC Interest,Current Ownership of Continuing Equity OwnersA...
15,"Fluence Energy, Inc.","Fluence Energy, Inc.’s","Therefore, the two Siemens entities collective..."
19,"Fluence Energy, Inc.",the Fluence Energy,"Finally, we have reserved 9,500,000 shares of ..."
20,"Fluence Energy, Inc.","the Fluence Energy, Inc.","Finally, we have reserved 9,500,000 shares of ..."
31,"Fluence Energy, Inc.","Siemens Industry, Inc.","On June 30, 2022, Siemens Industry, Inc. exerc..."


In [11]:
import transformers
from transformers import pipeline

In [12]:
qa_pipeline = pipeline("question-answering")


question = "What is the relationship between Company1 and Company2?"


answers = []
for index, row in relations_df.iterrows():
    context = row['Sentence']
    answer = qa_pipeline(question=question, context=context)
    answers.append((row['Company1'], row['Company2'], answer['answer']))


answers_df = pd.DataFrame(answers, columns=['Company1', 'Company2', 'Answer'])

answers_df

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.





Device set to use cpu


Unnamed: 0,Company1,Company2,Answer
0,"Fluence Energy, Inc.",Fluence Energy,State or other jurisdiction ofincorporation or...
1,"Fluence Energy, Inc.","Organization Fluence Energy, Inc.",Delaware corporation
2,"Fluence Energy, Inc.",Organization Fluence Energy,Delaware corporation
3,"Fluence Energy, Inc.",LLC,operates and controls all the business and aff...
4,"Fluence Energy, Inc.",LLC Interests,approximately28.5%
5,"Fluence Energy, Inc.",LLC Interest,approximately28.5%
6,"Fluence Energy, Inc.","Fluence Energy, Inc.’s",unprecedented growth
7,"Fluence Energy, Inc.",the Fluence Energy,2021 Incentive Award Plan
8,"Fluence Energy, Inc.","the Fluence Energy, Inc.",2021 Incentive Award Plan
9,"Fluence Energy, Inc.","Siemens Industry, Inc.",par value $0.00001 per share


In [13]:
answers_df['Company1'] = answers_df['Company1'].str.replace('\n', ' ')
answers_df['Company2'] = answers_df['Company2'].str.replace('\n', ' ')

answers_df=answers_df.drop_duplicates(subset=['Company1', 'Company2'])
answers_df

Unnamed: 0,Company1,Company2,Answer
0,"Fluence Energy, Inc.",Fluence Energy,State or other jurisdiction ofincorporation or...
1,"Fluence Energy, Inc.","Organization Fluence Energy, Inc.",Delaware corporation
2,"Fluence Energy, Inc.",Organization Fluence Energy,Delaware corporation
3,"Fluence Energy, Inc.",LLC,operates and controls all the business and aff...
4,"Fluence Energy, Inc.",LLC Interests,approximately28.5%
5,"Fluence Energy, Inc.",LLC Interest,approximately28.5%
6,"Fluence Energy, Inc.","Fluence Energy, Inc.’s",unprecedented growth
7,"Fluence Energy, Inc.",the Fluence Energy,2021 Incentive Award Plan
8,"Fluence Energy, Inc.","the Fluence Energy, Inc.",2021 Incentive Award Plan
9,"Fluence Energy, Inc.","Siemens Industry, Inc.",par value $0.00001 per share
