In [1]:
from pypdf import PdfReader
from openai import OpenAI
import json
from pathlib import Path
import csv

API_KEY = 'sk-bedceae2ceba437f944db22706354095'
import os

In [2]:
def make_prompt_dividend_detection():
    return """
You are a financial expert.

Go through this content and check if there is any mention of dividends

Your task is to find the dividend the company pays.

Your output must be in JSON format with the following keys:

{
  "is_dividend_mentioned": true/false,
  "dividend_amount": "..."
}

Avoid making assumptions beyond the text.
"""

def make_prompt_dividend_calculation():
    return """
You are a financial expert.

Go through this content and assimilate all the information about dividends

Your task is to find the dividend the company pays in all forms.

Your output must be in JSON format with the following keys:

{
  "dividend_amount_paid": "..."
}

Avoid making assumptions beyond the text. Dont put anything extra, just one absolute value, not per share
if you cannot find anything put 0
"""

In [3]:
def make_dividend_context(res_dic):
    context_string=''
    for res_dic in results:
        if res_dic['is_dividend_mentioned']==True:
            #print(res_dic)            
            context_string+=res_dic['page_text']+'\n'
    return context_string
            
                

In [4]:
#results

In [5]:
company_codes=['QNBK','QFBQ','QNCD','QNNS','SIIS','ZHCD']

In [8]:
api_client = OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com")

output_csv = Path("dividend_history.csv")

# Load existing data with proper nested structure
existing_data = {}
if output_csv.exists():
    with open(output_csv, mode='r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            company = row['company_code']
            year = int(row['year'])
            amount = float(row['dividend_amount_paid'])
            
            if company not in existing_data:
                existing_data[company] = {}
            existing_data[company][year] = amount
            


for company_code in company_codes:
    
    # read through all annual reports in the folder
    report_files_location=f'../extra/{company_code}_annual_reports/'
    for file_name in os.listdir(report_files_location):
        if '.DS' in file_name:
            continue    
        year = int(file_name.split('.pdf')[0])
        
        if company_code in existing_data and year in existing_data[company_code]:
            print(f'Skipping {company_code}, {year} - already in CSV')
            continue
                
        print(file_name,year)
    
        try:
            reader = PdfReader(f"../extra/{company_code}_annual_reports/{file_name}")
        except:
            print('issues reading')
            continue
        print(len(reader.pages))
        system_prompt=make_prompt_dividend_detection()
        results=[]
        for page_num in range(len(reader.pages)):        
            page = reader.pages[page_num]
            #print(page.extract_text())
            text=page.extract_text()
            if len(text)<50:
                continue
            if 'dividen' not in text.lower():
                continue
            print(f'will analyze page {page_num}')
            response = api_client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={'type': 'json_object'}
            )    
            #print(response.choices[0].message.content)
            dic_res = json.loads(response.choices[0].message.content)
            dic_res['page_num']=page_num
            dic_res['page_text']=text
            results.append(dic_res)
            print('*'*10)
    
        context_string=make_dividend_context(dic_res)
        system_prompt = make_prompt_dividend_calculation()
        response = api_client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": context_string}
            ],
            response_format={'type': 'json_object'}
        )    
        print(response.choices[0].message.content)
        dic_res = json.loads(response.choices[0].message.content)    
        dividend_amount_paid=dic_res['dividend_amount_paid']
        if ',' in dividend_amount_paid:
            dividend_amount_paid=dividend_amount_paid.replace(',','')
        try:
            print(float(dividend_amount_paid))         
            dividend_amount_paid=float(dividend_amount_paid)
        except:
            print(f'something fishy in {dividend_amount_paid}')
            continue

        
        # Append to CSV after each successful year
        with open(output_csv, mode='a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['company_code', 'year', 'dividend_amount_paid'])
            if f.tell() == 0:  # Write header if file is empty
                writer.writeheader()
            writer.writerow({
                'company_code': company_code,
                'year': year,
                'dividend_amount_paid': dividend_amount_paid
            })
            
        print(f"Saved {year}: {dividend_amount_paid}")


Skipping QNBK, 2015 - already in CSV
Skipping QNBK, 2014 - already in CSV
Skipping QNBK, 2016 - already in CSV
Skipping QNBK, 2017 - already in CSV
Skipping QNBK, 2013 - already in CSV
Skipping QNBK, 2012 - already in CSV
Skipping QNBK, 2010 - already in CSV
Skipping QNBK, 2011 - already in CSV
Skipping QNBK, 2008 - already in CSV
Skipping QNBK, 2020 - already in CSV
Skipping QNBK, 2021 - already in CSV
Skipping QNBK, 2009 - already in CSV
Skipping QNBK, 2023 - already in CSV
Skipping QNBK, 2022 - already in CSV
Skipping QNBK, 2019 - already in CSV
Skipping QNBK, 2018 - already in CSV
Skipping QNBK, 2024 - already in CSV
Skipping QFBQ, 2015 - already in CSV
Skipping QFBQ, 2014 - already in CSV
Skipping QFBQ, 2016 - already in CSV
Skipping QFBQ, 2017 - already in CSV
Skipping QFBQ, 2013 - already in CSV
Skipping QFBQ, 2012 - already in CSV
2010.pdf 2010
25
will analyze page 5
**********
will analyze page 19
**********
will analyze page 21
**********
will analyze page 22
**********
{
  "

In [None]:


company_code='QNBK'




In [None]:

        
# api_client = OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com")
# years=[i for i in range(2015,2025)]

# for year in years:
#     # Skip if already processed
#     if year in existing_data:
#         print(f"Skipping {year} - already in CSV")
#         continue    
#     print(year)
#     file_name=f'04-annual-financial-english-{year}.pdf'
#     try:
#         reader = PdfReader(f"ZHCD.QA/{file_name}")
#     except:
#         print('issues reading')
#         continue
#     print(len(reader.pages))

    
#     system_prompt=make_prompt_dividend_detection()
    
#     results=[]
#     for page_num in range(len(reader.pages)):
#         print(page_num)
#         page = reader.pages[page_num]
#         #print(page.extract_text())
#         text=page.extract_text()
#         if len(text)<50:
#             continue
#         if 'divi' not in text.lower():
#             continue
        
#         response = api_client.chat.completions.create(
#             model="deepseek-chat",
#             messages=[
#                 {"role": "system", "content": system_prompt},
#                 {"role": "user", "content": text}
#             ],
#             response_format={'type': 'json_object'}
#         )    
#         #print(response.choices[0].message.content)
#         dic_res = json.loads(response.choices[0].message.content)
#         dic_res['page_num']=page_num
#         dic_res['page_text']=text
#         results.append(dic_res)
#         print('*'*10)

#     context_string=make_dividend_context(dic_res)
#     system_prompt = make_prompt_dividend_calculation()
#     response = api_client.chat.completions.create(
#         model="deepseek-chat",
#         messages=[
#             {"role": "system", "content": system_prompt},
#             {"role": "user", "content": context_string}
#         ],
#         response_format={'type': 'json_object'}
#     )    
#     print(response.choices[0].message.content)
#     dic_res = json.loads(response.choices[0].message.content)    
#     dividend_amount_paid=dic_res['dividend_amount_paid']
#     if ',' in dividend_amount_paid:
#         dividend_amount_paid=dividend_amount_paid.replace(',','')
#     print(float(dividend_amount_paid))      
#     # Append to CSV after each successful year
#     with open(output_csv, mode='a', newline='') as f:
#         writer = csv.DictWriter(f, fieldnames=['year', 'dividend_amount_paid'])
#         if f.tell() == 0:  # Write header if file is empty
#             writer.writeheader()
#         writer.writerow({
#             'year': year,
#             'dividend_amount_paid': dividend_amount_paid
#         })
        
#     print(f"Saved {year}: {dividend_amount_paid}")



In [7]:
results

[{'is_dividend_mentioned': True,
  'dividend_amount': '5% of the nominal value of share capital',
  'page_num': 5,
  'page_text': 'In the name of Allah, the Most Beneficent, The Most Merciful. Prayer and Peace be upon our \nProphet Mohammed, His Comrades and Relatives.\nIt gives me a great pleasure to present you with the annual report of Qatar First Investment \nBank which reflects our major achievements during the second year of operations and the \nfinancial highlights of the period ending 31st December 2010.\nThe global economy started to show sluggish signs of recovery during 2010 as major \nequity markets recovered parts of their losses, and other financial institutions managed \nto rebuild their capital. This economic recovery varied according to regions. In advanced \neconomies, recovery was slow compared to past standards, whilst in emerging economies, \nthe recuperation was more vigorous.\nThe GCC region has weathered the global meltdown exceptionally well. The respective \ng

In [None]:
# context_string

In [None]:
print(dic_res['dividend_amount_paid'])

In [None]:
page = reader.pages[9]
        
text=page.extract_text()

In [None]:
text

In [None]:
len(context_string)

In [None]:
context_string=make_dividend_context(dic_res)

In [None]:
context_string

In [None]:
system_prompt = make_prompt_dividend_calculation()
response = api_client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": context_string}
    ],
    response_format={'type': 'json_object'}
)    
print(response.choices[0].message.content)