In [1]:
import os
import csv
import pandas as pd
import re
from boilerpipe.extract import Extractor
from urllib.parse import urlparse
from langdetect import detect
LINKS_DIR = "../1-scrape-google-links/output"
OUT_DIR = "extracted_texts"
if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)
MIN_TEXT_LENGTH = 1300
STOP_SITES = ("twitter.com", "facebook.com", "youtube.com", "wikipedia", "slideshare.net", ".pdf", "slideplayer.com", "cdp.net", "video", "nationalgeographic.com", "sourcewatch.org", "wikimedia")


In [2]:
class TextExtractor():
    def __init__(self, company, df, spamwriter, *args, **kwargs):
        self.company = company
        self.spamwriter = spamwriter
        self.df = df
    
    def is_url_ok(self, url):
        parsed_url = urlparse(url)
        if any(word in url for word in STOP_SITES):
            print("Skipping due to stop website: {}".format(url))
            return False
        elif any([name.lower() in parsed_url.netloc for name in company.split()]):
            print("Skipping due to company's website: {} in {}".format(self.company, url))
            return False
        elif parsed_url.path in ["", "/", '/en/', '/en']:
            print("Skipping due to front page {}".format(url))
            return False
        return True

    def is_text_ok(self, text):
        if detect(text) != "en":
            print("Skipping: Extracted text not English: {}".format(text))
            return False
        elif len(text) < MIN_TEXT_LENGTH:
            print("Skipping: Extracted text is too short")
            return False
#         elif self.company not in text:
#             print("Skipping: No company name in Extracted text: {}".format(self.company))
#             return False
        return True

    def highlight_kewords(self, text):
        return re.sub('(climate policy|climate change|innovation|climate|pollution|sustainable|devastation|CO2|CO\(2\)|carbon|{})'.format(self.company), r'<mark>\1</mark>', text, flags=re.IGNORECASE)

    def extract_file_data(self):
        data = []
        for index, row in self.df.iterrows():
            url = row["url"]
            if not self.is_url_ok(url):
                continue
            print(url)
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=url)
                extracted_text = extractor.getHTML()
                if self.is_text_ok(extracted_text):
                    spamwriter.writerow((company, url, row.get("title", ""), row.get("text", ""), self.highlight_kewords(extracted_text)))
            except Exception as ex:
                print(ex)

In [3]:
links_number = 0
for company in os.listdir(LINKS_DIR):
#     if company == "PetroChina":
    if os.path.exists("{}/{}.csv".format(OUT_DIR, company)):
        continue
    with open("{}/{}.csv".format(OUT_DIR, company), "w") as f:
        spamwriter = csv.writer(f)
        spamwriter.writerow(["company", "url", "title", "extract", "content"])
        print(company)
        if os.path.isfile(LINKS_DIR+"/"+company):
            continue
        print("Extracting {}".format(company))
        for fl in os.listdir("{}/{}".format(LINKS_DIR, company)):
            if fl.endswith(".csv"):
                df = pd.read_csv("{}/{}/{}".format(LINKS_DIR, company, fl))
                links_number += len(df)
                TextExtractor(company, df, spamwriter).extract_file_data()

Sony Corp.
Extracting Sony Corp.
http://www.corporatecritic.org/company.aspx?idpath=18067
Skipping: Extracted text is too short
Skipping due to stop website: http://www.acerfoundation.org.tw/acer/file/Image/10_Sony_20081217_Acer_Forum.pdf
https://ipolitics.ca/2018/01/24/lobby-wrap-biofuels-top-recent-lobby-registrations/
https://www.cota.org.au/policy/national-policy-forums/policy-forum-2018/
http://news.abs-cbn.com/business/tech-biz/03/19/15/sony-launches-playstation-vue-online-tv-service
http://www.businessinsider.com/environmentalism-m-and-ms-mars-wind-power-tv-adverts-2017-9
Skipping due to front page https://www.edf.org/
https://proquest.libguides.com/SONY/challenge
https://recruit-holdings.com/ir/ir_news/2018/0227_8127.html
http://www.macleans.ca/canada-top-50-socially-responsible-corporations-2013/
http://www.biofuelsdigest.com/bdigest/2015/10/25/81-companies-sign-american-business-act-on-climate-change-pledge/
Skipping due to stop website: https://www.cfasociety.org/netherlands

https://www.usatoday.com/story/money/2017/06/02/exxon-mobil-rex-tillerson-eric-schneiderman-climate-change/102418492/
Skipping due to front page http://edfclimatecorps.org/
https://www.instagram.com/dyrbergkern_official/?hl=en
Skipping: Extracted text is too short
https://www.cnbc.com/2016/04/28/sony-just-posted-a-666-rise-in-profit-as-its-turnaround-plan-takes-hold.html
https://thebulletin.org/learning-sony-hack-attack8007
https://www.bloomberg.com/news/articles/2017-11-16/norway-s-1-trillion-wealth-fund-wants-out-of-oil-and-gas-stocks
https://insideclimatenews.org/news/12012017/wyoming-coal-wind-energy-solar-energy-climate-change-denial
Skipping due to stop website: https://www.sourcewatch.org/index.php/Sony-Ericsson
http://www.thetelegram.com/business/snoopy-joining-sony-music-unit-buying-stake-in-peanuts-209938/
https://journals.uair.arizona.edu/index.php/JPE/article/download/21522/21088
unknown encoding: application/pdf
http://www.taradeporte.com/speaking
https://sydney.edu.au/cou

Skipping due to company's website: Total SA in https://www.sustainable-performance.total.com/en/reporting/reporting-standards/cdp
http://blogs.ei.columbia.edu/2016/10/12/shareholders-turn-up-the-heat-on-climate-change/
Skipping due to stop website: https://www.sustainable-performance.total.com/sites/shared/sustainable/files/atoms/files/totals_response_to_cdp_2017_climate_change_information_request_-_03-07-2017_0.pdf
https://www.epa.gov/climate-indicators/climate-change-indicators-snowfall
http://blogs.law.columbia.edu/climatechange/2016/06/09/major-oil-and-gas-company-publishes-climate-action-plan-to-align-with-ieas-2c-scenario-by-2035/
https://www.economist.com/news/business/21699141-climate-conscious-shareholders-are-putting-big-oil-spot-greens-pinstriped-suits
https://www.wired.com/story/guide-climate-change/
https://www.governmenteuropa.eu/total-ccus-eliminate-greenhouse-gas/86701/
https://www.cfr.org/report/global-climate-change-regime
https://www.footprintnetwork.org/our-work/cli

https://www.greenpeace.org/archive-international/en/press/releases/2013/Independent-research-exposes-who-is-responsible-for-climate-change-Gazprom-and-Shell-star/
https://www.ft.com/content/7fc597ee-1dc4-11e7-b7d3-163f5a7f229c
http://www.climatechangenews.com/2014/05/01/arrests-as-greenpeace-targets-gazprom-arctic-oil-shipment/
https://thebarentsobserver.com/en/ecology-industry-and-energy/2016/09/climate-change-could-jeopardize-yamal-gas-development-government
https://www.financialexpress.com/industry/climate-change-coal-india-among-top-3-companies-emitting-co2-in-the-world/914999/
Skipping due to stop website: http://pgnig.pl/documents/10184/1749782/20170519_PGNiG_observations_FINAL_public_version.pdf/27baab84-af99-45b2-bbce-b896ae2ea790
Skipping due to company's website: GazProm in http://www.gazprom-mt.com/WhoWeAre/OurTaxStrategy/Pages/default.aspx
https://www.arctictoday.com/norway-isnt-happy-gazprom-omv-asset-swap-deal-says-energy-minister/
http://euanmearns.com/tag/gazprom/
https

Skipping due to stop website: https://www.iea.org/publications/freepublications/publication/WEO2014.pdf
https://www.globalccsinstitute.com/institute
https://www.nbcnews.com/business/energy/india-near-deal-westinghouse-build-6-nuclear-reactors-official-n485116
https://www.inc.com/will-yakowicz/biggest-big-business-fails-of-2015.html
http://fortune.com/2017/01/12/exxon-mobil-massachusetts-climate-change/
Skipping due to company's website: Toshiba Corp. in http://toshiba-global.mynewsdesk.com/blog_posts/giving-co2-an-economic-value-carbon-capture-technology-helps-recycle-waste-into-resources-64737
Skipping due to stop website: https://www.ap7.se/app/uploads/2018/01/Black-List-dec-2017.pdf
http://triblive.com/business/headlines/10595279-74/deal-india-westinghouse
https://www.forbes.com/sites/kensilverstein/2017/08/13/will-saving-nuclear-energy-in-south-carolina-preserve-the-planet/
IncompleteRead(0 bytes read)
https://www.researchgate.net/publication/314234129_Corporate_Social_Responsibili

Skipping due to stop website: https://www.carbon-markets.go.jp/wp-content/uploads/2017/11/4_City_of_Yokohama.pdf
https://www.mynewsdesk.com/toshiba-global/blog_posts/tag/yokkaichi-operations
https://www.jase-we.org/about-us/
https://www.marketwatch.com/story/trump-administration-stymies-push-for-better-climate-risk-disclosure-2017-07-24
https://www.americanscientist.org/article/a-nuke-on-the-yukon
HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Moved Temporarily
https://neutronbytes.com/2017/01/
Remote end closed connection without response
http://www.sciencemag.org/news/2017/05/goodbye-smokestacks-startup-invents-zero-emission-fossil-fuel-power
Skipping due to stop website: http://www.iooc.us/wp-content/uploads/2010/09/Adapting-to-Climate-Change-A-Business-Approach.pdf
https://www.linkedin.com/company/toshiba-of-canada
HTTP Error 999: Request denied
Skipping due to stop website: https://nl.medical.canon/wp-

Skipping due to stop website: http://h30261.www3.hp.com/~/media/Files/H/HP-IR/documents/reports/2018/2017-10-ka-v1.pdf
https://www.tandfonline.com/doi/full/10.1080/17441056.2015.1033213?src=recsys
Skipping due to front page http://climatechangereconsidered.org/
Skipping due to stop website: https://www.atmos-chem-phys.net/15/8217/2015/acp-15-8217-2015.pdf
https://www.environmentalleader.com/2016/06/spotlight-on-award-winners-advanced-micro-devices-25x20-energy-efficiency-initiative/
https://www.fool.com/investing/2018/05/02/3-costs-cryptocurrency-miners-need-to-know-and-the.aspx
http://www.cleanairpartnerstx.org/newsarchives.html
https://www.cnet.com/news/titan-steals-no-1-spot-on-top500-supercomputer-list/
https://www.thestreet.com/story/14443155/1/amd-s-4-plunge-is-a-buying-opportunity-chart.html
HTTP Error 404: Not Found
https://www.computerworld.com/article/2521577/high-performance-computing/supercomputers-with-100-million-cores-coming-by-2018.html
https://www.worldwildlife.org/pre

https://www.usnews.com/news/best-states/california/articles/2018-03-21/judge-holds-climate-change-class-in-suits-against-big-oil
HTTP Error 403: Forbidden
https://www.kqed.org/science/1921475/live-blog-federal-judge-oversees-climate-change-tutorial
Skipping: Extracted text is too short
http://jurist.org/paperchase/2018/01/new-york-city-files-climate-change-lawsuit-against-oil-companies.php
https://www.washingtonpost.com/news/energy-environment/wp/2018/01/10/new-york-city-sues-shell-exxonmobil-and-other-oil-majors-over-climate-change/
https://www.wired.com/story/why-climate-change-skeptics-are-backing-geoengineering/
https://www.independent.co.uk/environment/chevron-oil-warn-climate-change-lawsuits-drilling-greenhouse-emissions-economically-infeasible-a7609411.html
https://www.nasdaq.com/article/oil-majors-face-lawsuits-on-climate-change-issues-cm856519
The read operation timed out
http://www.businessinsurance.com/article/20180322/NEWS06/912320057/Chevron-says-it-will-not-dispute-climat

https://seekingalpha.com/article/4158876-chevrons-climate-liability-defense-big-tobacco-lawsuit
Skipping due to company's website: Chevron Corp. in http://chevron.com/climateriskperspective
https://www.sfchronicle.com/bayarea/article/People-cause-climate-change-but-don-t-blame-12771823.php
https://www.eastbaytimes.com/2018/01/22/another-east-bay-city-sues-oil-companies-over-climate-change/
https://www.wespath.com/update-wespath-and-hermes-withdraw-chevron-stress-test-proposal/
http://www.tandfonline.com/doi/abs/10.1080/02646811.2015.1127673
https://www.vice.com/en_us/article/43qw3j/meet-the-lawyer-trying-to-make-big-oil-pay-for-climate-change
https://corpgov.law.harvard.edu/2016/09/06/climate-change-sustainability-and-other-environmental-proposals/
https://fivethirtyeight.com/features/who-should-pay-for-climate-change/
http://gsm.ucdavis.edu/post/chevron-challenges-immersion-students-sustainability-goals
http://www.naturalgasintel.com/articles/111829-big-oil-operators-sued-by-san-franc

Skipping due to stop website: https://sustainabledevelopment.un.org/content/documents/3294cunha_presentation.pdf
https://www.usnews.com/news/top-news/articles/2018-01-03/petrobras-to-pay-295-billion-to-settle-us-class-action-over-corruption
HTTP Error 403: Forbidden
https://mlexmarketinsight.com/insights-center/editors-picks/anti-bribery-and-corruption/cross-jurisdiction/heritage-banque-executive-handed-swiss-fine-over-petrobras-failings
https://www.shell.com/media/news-and-media-releases/2017/shell-and-petrobras-sign-technical-cooperation-agreement-to-strengthen-deep-water-partnership.html
https://www.rigzone.com/news/petrobras_announces_intention_to_join_ogci-26-jan-2018-153288-article/
http://www.pennenergy.com/articles/pennenergy/2018/04/oil-and-gas-bp-and-petrobras-form-strategic-alliance.html
HTTP Error 405: Not Allowed
https://news.mongabay.com/2017/12/brazil-uk-push-offshore-oil-pact-a-potential-climate-change-disaster/
Skipping due to stop website: https://www.duo.uio.no/bitst

Skipping due to company's website: Samsung Electronics Co., Ltd. in https://displaysolutions.samsung.com/fileDownload/24566?dir=solutions&file=education
http://news.abs-cbn.com/business/03/30/17/in-pictures-samsung-challenges-apple-bets-revival-on-galaxy-s8
Skipping due to stop website: http://www.lg.com/eg_ar/download/pr-sustainability-report-introduction.pdf
https://www.ncbi.nlm.nih.gov/pubmed/26319589
https://www.bloomberg.com/quote/005930:KS
https://www.brinksgilson.com/green-tech-ip-after-withdrawal-from-the-paris-climate-agreement
https://www.businesswire.com/news/home/20150506005267/en/Samsung-President-Young-Sohn-Unveil-New-Platforms
Skipping due to company's website: Samsung Electronics Co., Ltd. in https://shop.samsung.com/uk/track-order/
https://www.nrcan.gc.ca/energy/products/energystar/participants/manufacturers/13058
Skipping due to stop website: https://www.samsung.com/us/smg/content/dam/samsung/us/aboutsamsung/2017/about-us-sustainability-report-and-policy-environmantal

https://www.nasdaq.com/article/oil-majors-face-lawsuits-on-climate-change-issues-cm856519
The read operation timed out
Skipping due to stop website: http://www.iccr.org/sites/default/files/resources_attachments/response_to_shareholders_jan2915.pdf
http://www.jwnenergy.com/article/2018/4/big-oil-bids-burnish-credentials-war-climate-change/
http://www.ibtimes.com/royal-dutch-shell-tells-its-investors-adopt-climate-change-resolution-first-major-oil-1799234
https://www.desmogblog.com/2017/03/16/have-oil-majors-changed-their-tune-climate-change
https://nonprofitquarterly.org/2017/09/27/cities-storm-barricades-sue-exxon-floods-pestilence/
Skipping due to company's website: Royal Dutch Shell PLC in https://blogs.shell.com/2016/03/29/mitscenarios/
https://uk.reuters.com/article/us-usa-oil-climatesuits/california-cities-sue-big-oil-firms-over-climate-change-idUKKCN1BV2MI
https://rctom.hbs.org/submission/royal-dutch-shell-oil-and-gas-for-environmental-sustainability/
https://www.huffingtonpost.c

HTTP Error 500: Internal Server Error
Skipping due to stop website: https://www.fws.gov/southwest/es/documents/R2ES/LitCited/LPC_2012/Prinn_et_al_2011.pdf
https://nypost.com/2014/02/01/big-oil-not-only-believes-in-global-warming-theyll-profit-from-it/
Remote end closed connection without response
https://www.zacks.com/stock/news/299322/will-exxonmobils-xom-efforts-to-slow-climate-change-work
http://www.internationalfinance.com/energy/shell-publishes-strategy-energy-transmiton/
https://www.lexislegalnews.com/articles/23378/new-york-city-sues-5-oil-companies-over-climate-change
Skipping: Extracted text is too short
https://arstechnica.com/tech-policy/2018/01/nyc-sues-oil-companies-for-the-cost-of-adapting-to-climate-change/
http://financialpost.com/commodities/energy/stay-the-course-on-climate-change-while-being-ready-to-pivot-shell-canada-urges
Remote end closed connection without response
https://www.hbsslaw.com/cases/climate-change-global-warming
https://www.marketwatch.com/story/exxo

In [None]:
# df = pd.read_csv("../../2-Baseline/no_text.csv")
# texts = []
# for num, row in df.iterrows():
#     print(num)
#     texts.append(Extractor(extractor='ArticleExtractor', url=row['url']).getHTML())

In [None]:
# df2 = pd.read_csv("../3-annotation/MTurkBatchesResults/truth.csv")
# for num, row in df.iterrows():
# #     print( row['url'])
#     row2 = df2.loc[df2['Input.url'] == row['url']]
#     print(len(row['full_text']))
# pd.merge(df, df2, on='url', how='right')

In [32]:
# df = pd.read_csv(OUT_FILE_NAME)
print("SUMMARY: Extracted texts of {} articles out of {} links".format(len(df), links_number))
df["netloc"] =  df.apply(lambda row: urlparse(row.url).netloc,axis=1)
print("MOST POPULAR SOURCES: {}".format(df.groupby("netloc").size().sort_values(ascending=False, inplace=False)))

SUMMARY: Extracted texts of 10 articles out of 0 links
MOST POPULAR SOURCES: netloc
www.triplem.com.au          1
www.nexans.com              1
www.marymattingly.com       1
www.jobsinberlin.eu         1
www.bath.ac.uk              1
wamgroup.com                1
plq.org                     1
globalwarming-facts.info    1
en.wiktionary.org           1
eie.academia.edu            1
dtype: int64
