In [None]:
import os
from typing import List
import google.generativeai as genai
from PyPDF2 import PdfReader
import csv
from io import StringIO
import re
import pandas as pd
# ────────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ────────────────────────────────────────────────────────────────────────────────

API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyCsD_n_eLu6ZkUPY99tNnvBguTU7gtCI34")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-04-17")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_pdf(path: str) -> str:
    """Extract all text from a PDF file."""
    reader = PdfReader(path)
    full_text = []
    for page in reader.pages:
        txt = page.extract_text()
        if txt:
            full_text.append(txt)
    return "\n".join(full_text)

In [None]:
def build_full_prompt(pdf_text: str, strategies: List[str]) -> str:
    instruction = """
Please score the following fund strategy based on its alignment with each of the six ESG categories described below. 
Assign scores between 0 and 1, where: A score of 1 indicates that the category is a primary focus of the strategy. 
Scores between 0.1 and 0.9 indicate partial alignment. A score of 0 indicates that the category is not relevant to the strategy. 

ESG Categories are:  
A. Apply Exclusions  
B. Limit ESG Risk  
C. Seek ESG Opportunities  
D. Practice Active Ownership  
E. Target Sustainability Themes  
F. Assess Impact  

You can get the description of the strategies from the attached text.  
The text that is provided also contains relevant information about ESG in itself and other definitions  
which helps in reasoning through the task. I want the model to learn the relationships and provide  
an output which is reasoned through all the content that has been given. Make sure to take in the  
understanding of each strategy carefully. I want the model to reason well and understand complex relationships.  

For each of the following texts, output a CSV table with columns  
  Text, Apply Exclusions, Limit ESG Risk, Seek ESG Opportunities,  
  Practice Active Ownership, Target Sustainability Themes, Assess Impact.
""".strip()

    # assemble
    parts = [
        instruction,
        "\n\n=== ATTACHED TEXT (PDF CONTEXT) ===\n",
        pdf_text,
        "\n\n=== STRATEGY TO SCORE ===\n"
    ]
    for s in strategies:
        # wrap strategies in quotes if they contain commas
        parts.append(f"\"{s.strip()}\"\n")
    return "".join(parts)

In [None]:
def call_gemini(prompt: str) -> str:
    response = model.generate_content(
        prompt, 
        # generation_config=genai.types.GenerationConfig(
        #     max_output_tokens=2048 
        # )
    )
    return response.text


In [5]:
def clean_text(text: str) -> str:
    """Remove newlines and extra spaces from text"""
    return re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()

In [None]:
def clean_and_parse_csv(raw_csv: str):
    cleaned = re.sub(r'``````', '', raw_csv).strip()
    input_io = StringIO(cleaned)
    reader = csv.reader(input_io)
    rows = list(reader)
    cleaned_rows = []
    for i, row in enumerate(rows):
        if i == 0:
            # Header row
            cleaned_rows.append(row)
        else:
            # Clean only the first column (Text)
            row[0] = re.sub(r'\s+', ' ', row[0].replace('\n', ' ')).strip()
            cleaned_rows.append(row)
    return cleaned_rows

In [7]:
df = pd.read_csv("esg_dataset_no_duplicates.csv")
df = df[['text']]
df_list = df['text'].tolist()
df_list

['The fund excludes companies in controversial sectors such as tobacco and firearms. It also conducts periodic evaluations and cohesive reporting across all focus areas.',
 'The fund integrates ESG risk assessments into its investment models to mitigate potential sustainability risks. It also conducts periodic evaluations and cohesive reporting across all focus areas.',
 'The fund invests in firms pioneering sustainable technologies and ESG-focused business models. It also conducts periodic evaluations and cohesive reporting across all focus areas.',
 'The fund actively engages with portfolio companies through shareholder voting, dialogues, and proxy proposals to drive ESG improvements. It also conducts periodic evaluations and cohesive reporting across all focus areas.',
 'The fund focuses capital on sustainability themes like renewable energy, clean water, and circular economy initiatives. It also conducts periodic evaluations and cohesive reporting across all focus areas.',
 'The fu

In [8]:
if __name__ == "__main__":
    pdf_context = read_pdf("Morningstar vocabulary ESG.pdf")
    # strategies = [
    #     """The fund avoids companies with
    #         poor human rights records, invests in renewable energy projects, and actively engages with
    #         companies to improve labor practices.""",

    #         """The fund focuses on integrating ESG data into its investment process to manage risks and enhance
    #         long-term returns. It actively engages with companies on environmental practices, avoids
    #         investments in sectors like tobacco and firearms, and invests in companies that lead in clean
    #         energy and social inclusion initiatives""", 

    #         """The fund prioritizes investing in
    #         companies that demonstrate strong commitments to diversity and social equity. It avoids sectors
    #         associated with high carbon emissions, such as fossil fuels and heavy manufacturing. The fund
    #         actively uses its shareholder voting rights to influence corporate governance and engages in
    #         dialogues to improve environmental policies. Additionally, it seeks investments in sectors that
    #         contribute to sustainable urban development and clean water initiatives.""",

    #         """The fund emphasizes investing in technology companies that are pioneering innovative solutions to
    #         reduce their environmental footprint. It also seeks to invest in firms with strong workplace policies
    #         promoting employee well-being and diversity. The fund avoids companies with significant legal or
    #         regulatory issues but does not explicitly exclude any particular sectors.""",

    #         """The fund seeks stable returns by investing in established consumer goods companies that have
    #         demonstrated resilience in their supply chains. It looks for companies that are gradually adopting
    #         more sustainable sourcing practices and have a history of community involvement. However, it does
    #         not explicitly prioritize environmental or social metrics in its selection process.""",
    #     ]
    strategies = df_list

    raw_responses = []
    for strategy in strategies:
        prompt = build_full_prompt(pdf_context, [strategy])
        result = call_gemini(prompt)
        raw_responses.append(result)

    # Process all responses and save
    all_cleaned_rows = []
    for raw_csv in raw_responses:
        cleaned_rows = clean_and_parse_csv(raw_csv)
        if cleaned_rows:
            all_cleaned_rows.append(cleaned_rows)

    # Write to combined CSV
    output_path = "esg_scores_combined.csv"
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = None
        header_written = False
        for cleaned_rows in all_cleaned_rows:
            if not cleaned_rows:
                continue
            header = cleaned_rows[0]
            data_rows = cleaned_rows[1:]
            if writer is None:
                writer = csv.writer(f)
                writer.writerow(header)
            for row in data_rows:
                writer.writerow(row)


    # Print the combined CSV content
    with open(output_path, "r", encoding="utf-8") as f:
        print(f.read())

```csv
Text,Apply Exclusions,Limit ESG Risk,Seek ESG Opportunities,Practice Active Ownership,Target Sustainability Themes,Assess Impact
The fund excludes companies in controversial sectors such as tobacco and firearms. It also conducts periodic evaluations and cohesive reporting across all focus areas.,1,0,0,0,0,0
```
Text,Apply Exclusions,Limit ESG Risk,Seek ESG Opportunities,Practice Active Ownership,Target Sustainability Themes,Assess Impact
The fund integrates ESG risk assessments into its investment models to mitigate potential sustainability risks. It also conducts periodic evaluations and cohesive reporting across all focus areas.,0,1,0,0,0,0.9
```
Text,Apply Exclusions,Limit ESG Risk,Seek ESG Opportunities,Practice Active Ownership,Target Sustainability Themes,Assess Impact
The fund invests in firms pioneering sustainable technologies and ESG-focused business models. It also conducts periodic evaluations and cohesive reporting across all focus areas.,0.1,0.4,1,0,1,1
```
Text,Ap