In [45]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [46]:
!pip install pdfminer.six

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [47]:
pdfs_path = '/content/drive/MyDrive/needl/documents/'

In [48]:
!ls /content/drive/MyDrive/needl/documents

'BHEL 2QFY18 Outlook Review.pdf'
'Cholamandalam Investment Q1FY16 Result Update.pdf'
'Citi- Petronet LNG (PLNG.BO) - 3Q - EBITDA Expectedly Soft, But Volumes Surprise Positively.pdf'
'Deutsche Bank- Indraprastha Gas Alert -Visibility for growth remains weak beyond FY17, maintain Hold.pdf'
 document-1.pdf
 document-2.pdf
 document-3.pdf
 document-4.pdf
 document.pdf
'Emkay Chambal Fertilisers Q2FY17 Result Update-074044.pdf'
'GIC Housing Finance - Company Update - Centrum 20022014.pdf.qk6z28q.pdf'
'IIFL - Coforge - KYC - Initiating Coverage - 20200924.pdf'
 SBI_Cards_Initiating_Coverage_23092020_202009231023023648711.pdf


In [49]:
import re
import os
from pdfminer.high_level import extract_pages, extract_text

In [50]:
pdf_files = os.listdir(pdfs_path)

In [51]:
# get path to list of institutuions and BSE comapnies
list_of_institutions_file = "/content/drive/MyDrive/needl/comp-list.json"
list_of_BSE_companies_file = "/content/drive/MyDrive/needl/bse_companies.csv"

Read the names of companies and their codes from "bse_companies.csv"

In [52]:
import pandas as pd
df = pd.read_csv(list_of_BSE_companies_file, encoding='latin-1')
# Remove NULL values from the dataframe
Company_names = df['Company Name'].fillna(99999999).tolist()
BSE_codes = df['CD_BSE Code'].fillna(99999999).tolist() 
NSE_symbols = df['CD_NSE Symbol'].fillna(99999999).tolist()
Bloomberg_codes = df['CD_Bloomberg Code'].fillna(99999999).tolist()
Reuters_codes = df['CD_Reuters Code'].fillna(99999999).tolist()

# Remove spaces from lists
Company_names = ["99999999" if company_name == " " else str(company_name) for company_name in Company_names]
BSE_codes =  ["99999999" if bse_code == " " else str(int(bse_code)) for bse_code in BSE_codes]
NSE_symbols = ["99999999" if nse_symbol == " " else str(nse_symbol) for nse_symbol in NSE_symbols]
Bloomberg_codes = ["99999999" if bloomberg_code == " " else str(bloomberg_code) for bloomberg_code in Bloomberg_codes]
Reuters_codes = ["99999999" if reuters_code == " " else str(reuters_code) for reuters_code in Reuters_codes]

In [53]:
import json
institutions = open(list_of_institutions_file)
institutions_list = json.load(institutions)

Get names of institutions authoring the reports

In [54]:
def get_institution(pdf_path):
  text = extract_text(pdf_path)
  matched_institutions = {}
  #print(text)
  for institution in institutions_list:
    location = text.find(institution)
    if location != -1:
      matched_institutions[location] = institution
  
  # print(matched_institutions)
  location = (min((matched_institutions.keys())))
  print("Name of the institution: ", matched_institutions[location])
  return matched_institutions[location]

########################################################
def get_institutions_method2(pdf_path):
  for page_layout in extract_pages(pdf_path):
    for element in page_layout:
      print(element)

Get names of Companies mentioed in the PDF documents

In [55]:
def get_companies(pdf_path):
  text = extract_text(pdf_path)
  #print(text)
  matched_companies = []
  for company_name, bse_code, nse_symbol, bloomberg_code, reuters_code in zip(Company_names, BSE_codes, NSE_symbols, Bloomberg_codes, Reuters_codes):
    comapny_name = str(company_name)
    bse_code = str(int(bse_code))
    nse_symbol =  str(nse_symbol)
    bloomberg_code =  str(bloomberg_code)
    reuters_code = str(reuters_code)
    
    if text.find(company_name)!= -1 or text.find(bse_code)!= -1 or text.find(nse_symbol)!= -1 or text.find(bloomberg_code)!= -1 or text.find(reuters_code)!= -1:
      # print(text.find(company_name), text.find(bse_code), text.find(nse_symbol), text.find(bloomberg_code), text.find(reuters_code))
      # print("comapany_name:",comapny_name, "bse_code:",bse_code, "nse_symbol:",nse_symbol, "bloomberg_code:",bloomberg_code, "reuters_code:", reuters_code)
      matched_companies.append(company_name)

  print("Companies present in PDF: ", len(matched_companies),matched_companies)
  return matched_companies

Get Authors

In [56]:
import re
exclude_list = ['investor', 'compliance', 'research', 'mail']
def get_authors(pdf_path):
  text = extract_text(pdf_path)
  emails = re.findall("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", text)   # search for email like string in the text
  authors = []
  present = False
  for email in emails:
    for item in exclude_list:
      if item in email:
        present = True
        break
    if present == True:
      break;
    authors.append(email)
    #print(email)
  
  unique_authors = set(authors)
  unique_authors = [unique_author.split('@')[0] for unique_author in unique_authors]  # get the first part of email
  unique_authors = [unique_author.replace(".", " " ) for unique_author in unique_authors]
  #strip non-alphabetic symbols
  print("Authors of PDF: ", unique_authors)
  return unique_authors

In [57]:
output = "/content/drive/MyDrive/needl/output.csv"
import csv
with open(output, 'w') as csvfile: 
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['name of the file',  'author names', 'author institution', 'companies mentioned'])
    for pdf in pdf_files:
      print("name of the pdf: ", pdf)
      pdf_path = pdfs_path + pdf
      _institution = get_institution(pdf_path)          
      _companies = get_companies(pdf_path)
      _authors = get_authors(pdf_path)
      fields = [pdf, _authors, _institution, _companies]
      print("\n")
      # writing the fields
      csvwriter.writerow(fields)

name of the pdf:  Deutsche Bank- Indraprastha Gas Alert -Visibility for growth remains weak beyond FY17, maintain Hold.pdf
Name of the institution:  Deutsche Bank
Companies present in PDF:  4 ['Bharat Petroleum Corporation Ltd.', 'BSE Ltd.', 'GAIL (India) Ltd.', 'Indraprastha Gas Ltd.']
Authors of PDF:  ['harshad katkar', 'amit murarka']


name of the pdf:  GIC Housing Finance - Company Update - Centrum 20022014.pdf.qk6z28q.pdf
Name of the institution:  Bloomberg
Companies present in PDF:  10 ['BSE Ltd.', 'Central Depository Services (India) Ltd.', 'CRISIL Ltd.', 'GIC Housing Finance Ltd.', 'Housing Development Finance Corporation Ltd.', 'ICRA Ltd.', 'Multi Commodity Exchange Of India Ltd.', 'Narayana Hrudayalaya Ltd.', 'Standard Industries Ltd.', 'Tilaknagar Industries Ltd.']
Authors of PDF:  ['aalok shah']


name of the pdf:  SBI_Cards_Initiating_Coverage_23092020_202009231023023648711.pdf
Name of the institution:  Cre
Companies present in PDF:  9 ['BSE Ltd.', 'Central Depository Ser