In [5]:
import pandas as pd
import random

# Split the full headline dataset into one frame per ESG category and write a
# reproducible random sample of each category to its own CSV file.

SAMPLE_SIZE = 3000  # maximum number of rows written per category
SAMPLE_SEED = 1     # fixed seed so the same rows are drawn on every run

df = pd.read_csv('./headlines_dataset/esg_headlines.csv', encoding='cp1252', low_memory=False)

# Reassign instead of mutating in place so re-running the cell is harmless.
df = df.drop(columns=['Unnamed: 0', 'mentions_company'])

# NOTE(review): guardian_keywords is read from CSV, so .str.len() is the length
# of the stored text (e.g. "['environment']" is 15 characters), not the number
# of keywords — confirm that these thresholds are meant as character counts.
keyword_text_len = df['guardian_keywords'].str.len()

df_env = df[(df['esg_category'] == 'environmental') & (keyword_text_len > 15)]
df_social = df[(df['esg_category'] == 'social') & (keyword_text_len > 8)]
df_gov = df[df['esg_category'] == 'governance']
df_other = df[df['esg_category'] == 'non-esg']

dfs = [df_env, df_social, df_gov, df_other]
# Labels are paired with the frames explicitly rather than read from the first
# row, so an empty subset no longer raises IndexError.
category_names = ['environmental', 'social', 'governance', 'non-esg']

for name, d in zip(category_names, dfs):
    print(f"shape of {name}: {d.shape}")

# Select up to SAMPLE_SIZE random rows from each category. The size is capped at
# the number of available rows because DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement).
for name, d in zip(category_names, dfs):
    sampled = d.sample(n=min(SAMPLE_SIZE, len(d)), random_state=SAMPLE_SEED)
    sampled.to_csv(f'./headlines_dataset/esg_headlines_{name}.csv', index=False, encoding='utf-8', header=True, sep=',')

                                            headline  \
0  General Motors seeks to reassure Vauxhall on U...   
1  SSE powers to 40% rise in retail profits despi...   
2  Facebook’s cats are the new opium of the peopl...   
3  SSE plans to triple renewable energy productio...   
4  Tesco and Sainsbury's ban plastic cotton buds ...   

                                   guardian_keywords   esg_category  
0                                     ['job losses']         social  
1                                    ['environment']  environmental  
2                                         ['others']        non-esg  
3  ['renewable energy', 'environment', 'climate c...  environmental  
4  ['plastics', 'pollution', 'waste', 'environment']  environmental  
shape of environmental: (10660, 3)
shape of social: (23222, 3)
shape of governance: (4880, 3)
shape of non-esg: (381546, 3)


In [21]:
import logging
import random
import time
import traceback
from typing import Optional
from urllib.parse import urlparse

import pandas as pd
import requests
from duckduckgo_search import DDGS
from newspaper import Article
from tqdm.notebook import trange

# Pool of desktop browser User-Agent header values.
# NOTE(review): nothing in the visible part of this notebook reads USER_AGENTS;
# get_article_url builds its own hard-coded User-Agent header instead.
USER_AGENTS = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer — every entry below carries an MSIE or Trident token
    # (this group was previously labelled "Firefox").
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]


def _is_guardian_url(link) -> bool:
    """Return True when the link's host is theguardian.com or one of its subdomains."""
    if not isinstance(link, str):
        return False
    try:
        host = (urlparse(link).hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed netlocs; treat those as "not Guardian".
        return False
    return host == "theguardian.com" or host.endswith(".theguardian.com")


def get_article_url(query: str) -> Optional[str]:
    """
    Search DuckDuckGo for a headline and return the first Guardian article link.

    The query is sent through DDGS.text() with " - the guardian" appended,
    limited to 5 results in the 'uk-en' region. Results are scanned in order and
    the first whose URL host is theguardian.com (or a subdomain) is returned.
    The host is checked rather than the raw URL text, so a link on another site
    that merely contains "theguardian" in its path is not accepted. Results
    without an 'href' entry are skipped instead of aborting the lookup.

    Args:
        query: the headline text to search for

    Returns:
        The link to the article, or None if no results came back, none of them
        pointed at the Guardian, or the search raised an exception (the error
        and traceback are printed in that case).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.duckduckgo.com",
        "Connection": "keep-alive",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache"
    }

    results = []
    try:
        with DDGS(headers=headers) as ddgs:
            results = list(ddgs.text(query + " - the guardian", max_results=5, region='uk-en'))

        if not results:
            print(f"No results found for query: {query}")
            return None

        for result in results:
            link = result.get('href')
            if _is_guardian_url(link):
                return link  # Return the first valid link found

        print(f"No valid Guardian link found for query: {query}")
        return None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller carry on.
        print(f"Error occurred in get_article_url for query '{query}': {e}")
        print(traceback.format_exc())
        print(f"results: {results}")
        return None


def get_article_summary(url: Optional[str]) -> Optional[str]:
    """
    Fetch an article and return its summary.

    Runs Article.download(), Article.parse() and Article.nlp() on the URL and
    then reads Article.summary.

    Args:
        url: URL of the article to retrieve. A falsy value (None or "") is
            accepted and short-circuits to None, so the result of a failed
            URL lookup can be passed straight in.

    Returns:
        The article's summary, or None if no URL was given or any step raised
        an exception (the exception type and message are printed).
    """
    if not url:
        return None

    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()  # the summary is only read after this step has run
        return article.summary
    except Exception as e:
        # Best-effort: one unreachable or unparsable article must not stop a batch.
        print(f"An error occurred while fetching the article summary: {type(e).__name__}, {e}")
        return None
    

# Look up the Guardian URL and summary for a slice of the ESG-only headlines.
df = pd.read_csv("./esg_only_headlines.csv", encoding="utf-8")

START = 1000  # positional index of the first headline to process
N = 5         # number of headlines to process in this run
# Clamp the end of the slice so a short CSV cannot raise IndexError in df.iloc.
stop = min(START + N, len(df))

RESULT_COLUMNS = ["headline", "esg_category", "url", "summary"]

# Collect plain records and build the frame once, rather than enlarging a
# DataFrame one .loc assignment at a time.
records = []
try:
    for i in trange(START, stop, desc="Fetching summaries"):
        row = df.iloc[i]
        headline = row["headline"]
        esg_category = row["esg_category"]

        url = get_article_url(headline)
        summary = get_article_summary(url)

        records.append(
            {"headline": headline, "esg_category": esg_category, "url": url, "summary": summary}
        )
finally:
    # Built in `finally` so an interrupted run still leaves the rows fetched so
    # far in df_res. Row labels are the positional indices of the source rows.
    df_res = pd.DataFrame(
        records,
        columns=RESULT_COLUMNS,
        index=range(START, START + len(records)),
    )


Fetching summaries:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# Display the collected results as this cell's output.
# NOTE(review): the output saved under this cell lists 10 rows labelled 0-9 and
# an "Unnamed: 0" column, which does not match the fetch loop in this notebook
# (N = 5, row labels starting at 1000) — it appears to come from a different
# run; re-execute to refresh it.
df_res

Unnamed: 0,headline,esg_category,url,summary
0,General Motors seeks to reassure Vauxhall on U...,social,https://www.theguardian.com/business/2009/dec/...,"Nick Reilly, the new head of GM Europe, has gi..."
1,SSE powers to 40% rise in retail profits despi...,environmental,https://www.theguardian.com/business/2015/may/...,SSE has rekindled the simmering row over high ...
2,SSE plans to triple renewable energy productio...,environmental,https://www.theguardian.com/environment/2020/n...,SSE has set out plans to triple its renewable ...
3,Tesco and Sainsbury's ban plastic cotton buds ...,environmental,https://www.theguardian.com/environment/2016/n...,The UK’s two largest supermarket chains have c...
4,BP leads energy companies preparing two major ...,environmental,https://www.theguardian.com/environment/2020/o...,After decades spent extracting fossil fuels fr...
5,Amazon told: time is up for tax avoidance,governance,https://www.theguardian.com/business/2013/jul/...,Tax structures used by Amazon to route billion...
6,McDonald's to scrap Luxembourg tax structure,governance,https://www.theguardian.com/business/2016/dec/...,McDonald’s is to scrap its controversial Luxem...
7,Elon Musk on road to $50bn payout as Tesla's v...,governance,https://www.theguardian.com/technology/2020/ja...,"The Tesla founder, Elon Musk, has taken the fi..."
8,Why Schroders' reshuffle looks like a triumph ...,governance,https://www.theguardian.com/business/2016/mar/...,"Well played, Michael Dobson, that was a terrif..."
9,Royal Mail warns of job losses after sell-off,social,https://www.theguardian.com/uk-news/2013/oct/0...,Royal Mail has warned that more postal workers...


In [None]:
import pandas as pd

# Build the ESG-only headline file from the full headline dataset.
# NOTE: this cell writes ./esg_only_headlines.csv, which the summary-fetching
# cell reads, so it has to be run before that cell on a fresh kernel.

# Load the full dataset (ESG and non-ESG rows).
dataset = pd.read_csv("./esg_headlines_csv.csv", encoding='cp1252', low_memory=False)

# Remove the first column ('Unnamed: 0') and the column "mentions_company".
dataset = dataset.drop(columns=['Unnamed: 0', 'mentions_company'])

# Keep only the rows whose esg_category is not 'non-esg'. The filter is
# evaluated once and reused for both the count and the export.
esg_dataset = dataset[dataset['esg_category'] != 'non-esg']

# Number of ESG rows. The old name `non_esg_count` said the opposite of what it
# holds; it is kept as an alias in case another cell still refers to it.
esg_count = len(esg_dataset)
non_esg_count = esg_count
print(f"number of esg rows: {esg_count}")

# A bare `esg_dataset.shape` in the middle of a cell displays nothing, so the
# shape is printed explicitly.
print(f"shape of esg-only dataset: {esg_dataset.shape}")

# Save the ESG-only rows to a new CSV without the index column.
esg_dataset.to_csv("./esg_only_headlines.csv", index=False)

dataset.head()