In [1]:
!pip install pdfminer.six
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple
Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple


In [3]:
import csv
import os
from pdfminer.high_level import extract_text

def extract_text_from_pdfs(pdf_directory):
    data = []
    
    for filename in os.listdir(pdf_directory):
        if filename.endswith('.pdf'):
            filepath = os.path.join(pdf_directory, filename)
            
            # Error handling
            try:
                text = extract_text(filepath)
                
                # Parse the filename to extract ticker and year
                parts = filename.split('_')
                ticker = parts[1]
                year = parts[2].split('.')[0]  # Remove the .pdf extension
                
                data.append({
                    'filename': filename,
                    'ticker': ticker,
                    'year': year,
                    'text': text
                })
            except Exception as e:
                print(f"Error processing {filename}. Error: {e}")
    
    return data

def save_to_csv(data, output_filename):
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['filename', 'ticker', 'year', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for row in data:
            writer.writerow(row)

pdf_directory = './tests'
output_csv = 'abc.csv'

data = extract_text_from_pdfs(pdf_directory)
save_to_csv(data, output_csv)



In [4]:
import os
import csv
import PyPDF2
import re
from collections import Counter


def pdf_to_text(pdf_path):
    """
    Convert a PDF file to a list of lists containing lines of text from each page.

    Args:
    - pdf_path (str): The path to the PDF file.

    Returns:
    - list: A list of lists containing lines of text from each page. Returns None if an exception occurs.
    """
    try:
        with open(pdf_path, "rb") as file:
            pdf = PyPDF2.PdfReader(file)
            page_lines_list = []
            for page in pdf.pages:
                page_text = page.extract_text()
                lines = [
                    re.sub(r"[\d]+|page", "", line.strip().lower())
                    for line in page_text.split("\n")
                    if line.strip()
                ]
                page_lines_list.append(lines)
            return page_lines_list
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None


def find_repeating_elements(all_line_list):
    """
    Find repeating elements across all pages of a PDF.

    Args:
    - all_line_list (list): A list of lists containing lines of text from each page.

    Returns:
    - set: A set containing elements that appear more than once.
    """
    flat_list = [elem for sublist in all_line_list for elem in sublist]
    counts = Counter(flat_list)
    repeating_elements = {key for key, val in counts.items() if val > 1}
    return repeating_elements


def remove_elements_from_list(all_line_list, elements_to_remove):
    """
    Remove specified elements from all pages of a PDF.

    Args:
    - all_line_list (list): A list of lists containing lines of text from each page.
    - elements_to_remove (set): A set containing elements to be removed.

    Returns:
    - list: A list of lists after removing the specified elements.
    """
    return [
        [item for item in sublist if item not in elements_to_remove]
        for sublist in all_line_list
    ]


def pdf_to_text_pipeline(pdf_path):
    """
    Pipeline to convert a PDF to a single text string after cleaning.

    Args:
    - pdf_path (str): The path to the PDF file.

    Returns:
    - str: A string containing all text after cleaning.
    """
    page_lines_list = pdf_to_text(pdf_path)
    repeating_elements = find_repeating_elements(page_lines_list)
    cleaned_all_line_list = remove_elements_from_list(
        page_lines_list, repeating_elements
    )
    text = " ".join([" ".join(sublist) for sublist in cleaned_all_line_list])
    return text


def process_pdfs_in_directory(directory, output_csv):
    """
    Process all PDF files in a directory and save the content to a CSV file.

    Args:
    - directory (str): The path to the directory containing PDF files.
    - output_csv (str): The path to the output CSV file.

    Returns:
    - None: Writes to the CSV file.
    """
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["filename", "ticker", "year", "content"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for file in os.listdir(directory):
            if file.endswith(".pdf"):
                print(f"Processing {file}...")
                filepath = os.path.join(directory, file)
                content = pdf_to_text_pipeline(filepath)
                if content:
                    _, ticker, year = file.rstrip(".pdf").split("_")
                    writer.writerow(
                        {
                            "filename": file,
                            "ticker": ticker,
                            "year": year,
                            "content": content,
                        }
                    )


In [5]:
process_pdfs_in_directory('./tests', 'a.csv')

Processing ABT_archived_1736098442.16996.pdf...
Processing ABT_most_recent.pdf...
Processing ABT_archived_1736098471.2136264.pdf...


In [3]:
import bs4 as bs
import requests
import pickle
import datetime as dt
import pandas as pd
import json
import urllib.request
import yesg

def getting_ESG_scores():
    """
    This function gets the ESG scores for the S&P 500 companies and saves them in a csv file.
    Return: dataframe with the ESG scores per year per company
    """
    # Getting resources from Wikipedia
    resource = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    # Parsing the resources
    soup = bs.BeautifulSoup(resource.text, 'html.parser')
    # Finding the table with the tickers
    table = soup.find('table', {'id': 'constituents'})

    # Creating an empty list for the tickers
    tickers = []
    # Finding all the rows in the table
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        tickers.append(ticker)
    # Removing the \n from the tickers
    tickers = [s.replace('\n', '') for s in tickers]


    # Getting the ESG scores for each ticker
    dataframes = []
    i = 0
    tickers = ['TEMN']
    for ticker in tickers:
        try:
            df = pd.DataFrame(yesg.get_historic_esg(ticker))
            i += 1
            print(ticker, i)
            df['Company_Symbol'] = ticker
            dataframes.append(df)
        except:
            pass
    # Concatenating the dataframes
    df = pd.concat(dataframes)

    df['timestamp'] = df.index
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")

    # Removing the non-values
    df.dropna(inplace=True)

    # Setting dataframe index to timestamp
    df['timestamp'] = df.index

    # Resetting the index
    df.reset_index(drop=True, inplace=True)

    # Setting the timestamp as datetime format
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")

    # Adding the year column to dataframe for calculating the average ESG score per year
    df['year'] = df['timestamp'].dt.year

    # Grouping the dataframe by year and ticker
    cleaned_df = df.groupby(['year', 'Company_Symbol']).mean()

    # Removing the timestamp column
    cleaned_df.drop(columns=['timestamp'], inplace=True)

    # Creating a csv file with the results
    cleaned_df.to_csv('./SP500_EGS_Score_avarage_per_year.csv')

    esg_score = pd.read_csv('./SP500_EGS_Score_avarage_per_year.csv')

    return esg_score

In [None]:
! pip install yesg

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple
Collecting yesg
  Downloading yesg-2.1.1.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: yesg
  Building wheel for yesg (setup.py) ... [?25ldone
[?25h  Created wheel for yesg: filename=yesg-2.1.1-py3-none-any.whl size=6105 sha256=9a0a68fbc1ccf1471ca53c8c733d49ac4f0e1d6308b964c5e649ecfe6a0dfbbc
  Stored in directory: /home/pratyush/.cache/pip/wheels/78/8d/48/f5e8ff0315a46301e15c68371e297b460b33e1c846117725bc
Successfully built yesg
Installing collected packages: yesg
Successfully installed yesg-2.1.1


In [4]:
getting_ESG_scores()

An error has occurred. The ticker symbol might be wrong or you might need to wait to continue.
TEMN 1


Unnamed: 0,year,Company_Symbol


In [5]:
import pandas as pd

# Load the averaged scores per company
averaged_scores_file = "averaged_scores_per_company.csv"
averaged_scores_df = pd.read_csv(averaged_scores_file)

# Load the sector data
sector_file = "sector.csv"
sector_df = pd.read_csv(sector_file)

# Merge the two DataFrames based on the company name (or a common column)
# Assuming the common column is 'Company_Symbol' in both files
merged_df = pd.merge(
    averaged_scores_df,  # Left DataFrame
    sector_df,           # Right DataFrame
    on="Company_Symbol", # Common column to merge on
    how="left"           # Keep all rows from the left DataFrame
)

# Save the merged DataFrame to a new CSV file
output_file_path = "merged_averaged_scores_with_sector.csv"
merged_df.to_csv(output_file_path, index=False)

print(f"Merged data saved to {output_file_path}")

Merged data saved to merged_averaged_scores_with_sector.csv


In [6]:
df = pd.read_csv('combined_historical_and_predicted_scores.csv')

In [8]:
df.head(10)

Unnamed: 0,year,Company_Symbol,Total-Score,E-Score,S-Score,G-Score
0,2020,A,17.143333,0.87,9.416667,6.856667
1,2021,A,16.598,0.87,9.44,6.288
2,2022,A,15.33,0.34,8.65,6.34
3,2023,A,14.44,0.73,7.535,6.18
4,2024,A,11.36,1.19,6.23,3.93
5,2025,A,10.472743,1.09092,5.702196,3.592605
6,2026,A,9.583108,0.996409,4.879913,3.395365
7,2027,A,8.693097,0.905262,3.764917,3.337786
8,2028,A,5.613129,1.531027,2.459908,1.078171
9,2029,A,4.725952,1.433372,1.93219,0.740594
