In [20]:
import os
import csv
import pandas as pd
import re
from boilerpipe.extract import Extractor
from urllib.parse import urlparse
from langdetect import detect
# Directory scanned by the extraction cell; every sub-directory in it is
# treated as one company and is expected to hold CSV files of scraped links.
LINKS_DIR = "../1-scrape-google-links/scrape_result4"
# CSV file the extraction cell writes and the summary cells read back.
OUT_FILE_NAME = "extract.csv"
# Extracted texts with fewer characters than this are skipped (see is_text_ok).
MIN_TEXT_LENGTH = 1300
# Substrings that disqualify a URL when found anywhere in it (see is_url_ok);
# the match is against the whole URL string, not only the host name.
STOP_SITES = ("twitter.com", "facebook.com", "youtube.com", "wikipedia", "slideshare.net", ".pdf", "slideplayer.com", "cdp.net")


In [21]:
class TextExtractor():
    """Extract article text for one company's links and append it to a CSV.

    For every row of ``df`` the URL is filtered (``is_url_ok``), the page is
    fetched and reduced to its main text with boilerpipe, the text is filtered
    (``is_text_ok``) and, if accepted, one row is written via ``spamwriter``.

    Args:
        company: Company name; used for URL filtering, keyword highlighting
            and as the first column of every written row.
        df: DataFrame with a ``url`` column and optional ``title`` / ``text``
            columns.
        spamwriter: Object with a ``writerow`` method (e.g. ``csv.writer``).
        *args, **kwargs: Accepted and ignored.
    """

    def __init__(self, company, df, spamwriter, *args, **kwargs):
        self.company = company
        self.spamwriter = spamwriter
        self.df = df

    def is_url_ok(self, url):
        """Return True if ``url`` should be fetched, printing the reason if not."""
        parsed_url = urlparse(url)
        if any(word in url for word in STOP_SITES):
            print("Skipping due to stop website: {}".format(url))
            return False
        # Use the instance's company (not a notebook global) and compare
        # case-insensitively, since host names are case-insensitive.
        elif self.company.lower() in parsed_url.netloc.lower():
            print("Skipping due to company's website: {} in {}".format(self.company, url))
            return False
        elif parsed_url.path in ["", "/", '/en/', '/en']:
            print("Skipping due to front page {}".format(url))
            return False
        return True

    def is_text_ok(self, text):
        """Return True if ``text`` is detected as English and long enough.

        ``langdetect.detect`` may raise on text it cannot analyse; that
        exception propagates to the caller.
        """
        if detect(text) != "en":
            print("Skipping: Extracted text not English: {}".format(text))
            return False
        elif len(text) < MIN_TEXT_LENGTH:
            print("Skipping: Extracted text is too short: {}".format(text))
            return False
        # A "company name must appear in the text" check used to live here and
        # is intentionally disabled.
        return True

    def highlight_kewords(self, text):
        """Wrap climate-related keywords and the company name in ``<mark>`` tags.

        Matching is case-insensitive. The company name is escaped so that
        names containing regex metacharacters are matched literally.
        """
        pattern = r'(climate policy|climate change|innovation|climate|pollution|sustainable|devastation|CO2|CO\(2\)|carbon|{})'.format(re.escape(self.company))
        return re.sub(pattern, r'<mark>\1</mark>', text, flags=re.IGNORECASE)

    def extract_file_data(self):
        """Fetch, filter and write every acceptable link in ``self.df``.

        Failures for a single URL (network errors, extraction errors,
        language-detection errors) are printed and do not stop the loop.
        """
        for index, row in self.df.iterrows():
            url = row["url"]
            if not self.is_url_ok(url):
                continue
            print(url)
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=url)
                # The original assigned getHTML() to an unused name and then
                # read an undefined `extracted_text`; fetch the text explicitly.
                extracted_text = extractor.getText()
                if self.is_text_ok(extracted_text):
                    self.spamwriter.writerow((
                        self.company,
                        url,
                        row.get("title", ""),
                        row.get("text", ""),
                        self.highlight_kewords(extracted_text),
                    ))
            except Exception as ex:
                # Best-effort scraping: report which URL failed and carry on.
                print("Failed to extract {}: {}".format(url, ex))

In [33]:
# Restrict the run to these company directory names; set to None to process
# every directory found in LINKS_DIR.
COMPANY_FILTER = ("Chevron",)

links_number = 0
# newline="" is what the csv module requires of files it writes to (otherwise
# embedded newlines in article text and Windows line endings are mangled);
# utf-8 so that non-ASCII article text can always be written.
with open(OUT_FILE_NAME, "w", newline="", encoding="utf-8") as f:
    spamwriter = csv.writer(f)
    spamwriter.writerow(["company", "url", "title", "extract", "content"])
    for company in os.listdir(LINKS_DIR):
        if COMPANY_FILTER is not None and company not in COMPANY_FILTER:
            continue
        print(company)
        company_dir = os.path.join(LINKS_DIR, company)
        # Plain files directly inside LINKS_DIR are not company directories.
        if os.path.isfile(company_dir):
            continue
        print("Extracting {}".format(company))
        entries = os.listdir(company_dir)
        # NOTE(review): the second directory entry is used, as before, but
        # os.listdir returns entries in arbitrary order — confirm which file
        # is actually intended and select it by name.
        if len(entries) < 2:
            print("Skipping {}: expected at least two files, found {}".format(company, len(entries)))
            continue
        fl = entries[1]
        if fl.endswith(".csv"):
            df = pd.read_csv(os.path.join(company_dir, fl))
            links_number += len(df)
            TextExtractor(company, df, spamwriter).extract_file_data()

Chevron
Extracting Chevron
Skipping due to company's website: Chevron in https://australia.chevron.com/environment/protecting-the-environment
https://www.eenews.net/stories/1060077139
https://www.sciencefriday.com/segments/climate-science-goes-to-court-in-california-oil-case/
Skipping due to stop website: http://www1.nyc.gov/assets/home/downloads/pdf/press-releases/2018/complaint-filed-8031957-20180109.pdf
http://blogs.edf.org/energyexchange/2017/08/25/heres-how-chevrons-next-ceo-can-turn-over-a-new-leaf/
http://www.bdlaw.com/news-2211.html
https://grist.org/briefly/exxonmobil-and-chevron-are-some-of-the-most-influential-climate-lobbyists-yikes/
Remote end closed connection without response
http://www.chicagotribune.com/business/sns-bc-us--california-climate-change-lawsuits-20180321-story.html
http://fortune.com/2016/05/24/exxonmobil-chevron-shareholder-meetings-climate/
https://www.insurancejournal.com/news/national/2018/03/26/484378.htm


In [32]:
# Summarise the extraction run: how many articles were kept out of the links
# seen, and which hosts they came from.
df = pd.read_csv(OUT_FILE_NAME)
print("SUMMARY: Extracted texts of {} articles out of {} links".format(len(df), links_number))
# Series.map always yields a Series; DataFrame.apply(axis=1) returns a
# DataFrame when there are zero rows, which breaks the column assignment
# if no article was extracted.
df["netloc"] = df["url"].map(lambda url: urlparse(url).netloc)
print("MOST POPULAR SOURCES: {}".format(df.groupby("netloc").size().sort_values(ascending=False)))

SUMMARY: Extracted texts of 9 articles out of 10 links
MOST POPULAR SOURCES: netloc
www.vice.com            1
www.sec.gov             1
www.eastbaytimes.com    1
www.apnews.com          1
thehill.com             1
money.cnn.com           1
inhabitat.com           1
freebeacon.com          1
earther.com             1
dtype: int64


In [28]:
# Re-read the extraction results and save the first seven rows as a small
# sample file.
df = pd.read_csv(OUT_FILE_NAME)
df2 = df.iloc[:7]
df2.to_csv('chevron_short.csv')