In [1]:
import os
import csv
import pandas as pd
import re
from boilerpipe.extract import Extractor
from urllib.parse import urlparse
from langdetect import detect
LINKS_DIR = "../1-scrape-google-links/output"
OUT_DIR = "extracted_texts"
if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)
MIN_TEXT_LENGTH = 1300
STOP_SITES = ("twitter.com", "facebook.com", "youtube.com", "wikipedia", "slideshare.net", ".pdf", "slideplayer.com", "cdp.net", "video", "nationalgeographic.com", "sourcewatch.org", "wikimedia")


In [2]:
class TextExtractor():
    def __init__(self, company, df, spamwriter, *args, **kwargs):
        self.company = company
        self.spamwriter = spamwriter
        self.df = df
    
    def is_url_ok(self, url):
        parsed_url = urlparse(url)
        if any(word in url for word in STOP_SITES):
            print("Skipping due to stop website: {}".format(url))
            return False
        elif any([name.lower() in parsed_url.netloc for name in company.split()]):
            print("Skipping due to company's website: {} in {}".format(self.company, url))
            return False
        elif parsed_url.path in ["", "/", '/en/', '/en']:
            print("Skipping due to front page {}".format(url))
            return False
        return True

    def is_text_ok(self, text):
        if detect(text) != "en":
            print("Skipping: Extracted text not English: {}".format(text))
            return False
        elif len(text) < MIN_TEXT_LENGTH:
            print("Skipping: Extracted text is too short")
            return False
#         elif self.company not in text:
#             print("Skipping: No company name in Extracted text: {}".format(self.company))
#             return False
        return True

    def highlight_kewords(self, text):
        return re.sub('(climate policy|climate change|innovation|climate|pollution|sustainable|devastation|CO2|CO\(2\)|carbon|{})'.format(self.company), r'<mark>\1</mark>', text, flags=re.IGNORECASE)

    def extract_file_data(self):
        data = []
        for index, row in self.df.iterrows():
            url = row["url"]
            if not self.is_url_ok(url):
                continue
            print(url)
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=url)
                extracted_text = extractor.getHTML()
                if self.is_text_ok(extracted_text):
                    spamwriter.writerow((company, url, row.get("title", ""), row.get("text", ""), self.highlight_kewords(extracted_text)))
            except Exception as ex:
                print(ex)

In [3]:
links_number = 0
for company in os.listdir(LINKS_DIR):
#     if company == "PetroChina":
    if os.path.exists("{}/{}.csv".format(OUT_DIR, company)):
        continue
    with open("{}/{}.csv".format(OUT_DIR, company), "w") as f:
        spamwriter = csv.writer(f)
        spamwriter.writerow(["company", "url", "title", "extract", "content"])
        print(company)
        if os.path.isfile(LINKS_DIR+"/"+company):
            continue
        print("Extracting {}".format(company))
        for fl in os.listdir("{}/{}".format(LINKS_DIR, company)):
            if fl.endswith(".csv"):
                df = pd.read_csv("{}/{}/{}".format(LINKS_DIR, company, fl))
                links_number += len(df)
                TextExtractor(company, df, spamwriter).extract_file_data()

Glencore International AG
Extracting Glencore International AG
Skipping due to stop website: https://fastenopfer.ch/content/uploads/2016/03/Report-dfi-Landgrabbing.pdf
https://www.thebalance.com/company-profile-glencore-international-ag-2340298
Skipping due to stop website: https://www.greenpeace.org.uk/wp-content/uploads/2018/04/OffTrackInvestorBriefing.pdf
Skipping due to stop website: http://www.glencore.com.au/en/media-centre/News/Media-Statement-Glencore-progresses-low-emission-coal-project-in-Queensland.pdf
Skipping due to stop website: http://www.longfinance.net/LondonAccord/2017/BBVA_SDGs&Investing_Jan18.pdf
Skipping due to stop website: https://www.greenpeace.de/sites/www.greenpeace.de/files/publications/justice_report_0.pdf
http://www.cadtm.org/Paul-Singer-market-master-or
HTTP Error 412: Precondition Failed
https://www.ripe.net/membership/indices/data/ch.glencore.html
Skipping due to company's website: Glencore International AG in http://www.glencore.ch/
Skipping due to stop

https://www.renewableenergyworld.com/news/2014/12/threatened-by-renewable-energy-fossil-fuel-companies-highlight-their-role-in-alleviating-poverty.html
HTTP Error 405: Not Allowed
https://www.grain.org/article/entries/5492-the-global-farmland-grab-in-2016-how-big-how-bad
https://www.motherjones.com/politics/2011/07/climate-change-food-crisis-price-bread-political-instability/
http://www.worldbank.org/en/country/chad/overview
Skipping due to stop website: https://www.gg.gov.au/sites/default/files/files/honours/qb/qb2009/Media%20Notes%20AO%20%28final%29.pdf
Skipping due to stop website: http://www.oekom-research.com/homepage/english/oekom_cr_review_E_2016.pdf
Skipping due to stop website: https://www.nordea.com/Images/35-170162/RI_Annual_Report_2016_FINAL.pdf
https://thewire.in/economy/surge-in-2015-pulse-prices-was-a-result-of-cartellisation
https://reneweconomy.com.au/coal-lobby-hits-peak-denial-battery-storage-renewables-54187/
https://newint.org/features/web-exclusive/2013/12/04/glen

In [4]:
# df = pd.read_csv("../../2-Baseline/no_text.csv")
# texts = []
# for num, row in df.iterrows():
#     print(num)
#     texts.append(Extractor(extractor='ArticleExtractor', url=row['url']).getHTML())

In [None]:
# df2 = pd.read_csv("../3-annotation/MTurkBatchesResults/truth.csv")
# for num, row in df.iterrows():
# #     print( row['url'])
#     row2 = df2.loc[df2[''] == row['url']]
#     print(len(row['content']))
# pd.merge(df, df2, on='url', how='right')

In [32]:
# df = pd.read_csv(OUT_FILE_NAME)
print("SUMMARY: Extracted texts of {} articles out of {} links".format(len(df), links_number))
df["netloc"] =  df.apply(lambda row: urlparse(row.url).netloc,axis=1)
print("MOST POPULAR SOURCES: {}".format(df.groupby("netloc").size().sort_values(ascending=False, inplace=False)))

SUMMARY: Extracted texts of 10 articles out of 0 links
MOST POPULAR SOURCES: netloc
www.triplem.com.au          1
www.nexans.com              1
www.marymattingly.com       1
www.jobsinberlin.eu         1
www.bath.ac.uk              1
wamgroup.com                1
plq.org                     1
globalwarming-facts.info    1
en.wiktionary.org           1
eie.academia.edu            1
dtype: int64
