In [9]:
import pandas as pd
import re
import spacy
from collections import Counter
import unicodedata
from bs4 import BeautifulSoup
from boilerpipe.extract import Extractor
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.corpus import stopwords
stopwords = set(stopwords.words())
ANNOTATION_RESULTS = "../1-Data/3-annotation/compiled_output.csv"
# ANNOTATION_RESULTS = "../1-Data/3-annotation/output.csv"
OUTPUT_FILE = "filtered_by_company_confidence.csv"
df = pd.read_csv(ANNOTATION_RESULTS)
nlp = spacy.load('en_core_web_sm')
alternative_company_names = {"AMD (Advanced Micro Devices)": "AMD",
                    'Royal Dutch Shell PLC': "Shell",
                    "Samsung Electronics Co., Ltd.": "Samsung",
                    "Goodyear Tire & Rubber Co": "Goodyear",
                    "Sumitomo Rubber Industries": "Sumitomo",
                    "Exxon Mobil Corp.": "Exxon",
                    "General Motors Corp.": "GM",
                    "Ford Motor Co.": "Ford",
                    "Toyota Motor Corp.": "Toyota",
                    "Petro China": "PetroChina",
                    'Volkswagen AG': "VW"}

In [10]:
df.columns

Index(['climate_confidence', 'comments', 'company', 'company_confidence',
       'content', 'extract', 'sentiment', 'title', 'url'],
      dtype='object')

In [11]:
def clean_text(html):
    soup = BeautifulSoup(html, "html.parser") # create a new bs4 object from the html data loaded
    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
        script.extract()
    texts = soup.findAll(text=True)
    text = " ".join(t.strip() for t in texts)
    text = unicodedata.normalize("NFKD", text)
    return text
df['text'] = df.apply(lambda row: "{} {}".format(row['title'], clean_text(str(row['content']))), axis=1)
df.drop(df[df.text.str.len() < 150].index, inplace=True)

In [12]:
len(df)

3637

## Company Confidence Baseline:

The baseline is built on frequency of mentions of the company, comparing to other organizations:  
2 - Very Confident that the text is about this company. It's mention fraction => 10% and at least 2 mentions.  
1 - Moderate. It is mentioned but the fraction < 10%.  
0 - Not related to the company. The company is not mentioned in text.  
TODO: take in account how many organizations mentioned, how many words in the text etc.

In [18]:
with open("company-suffix.txt", "r") as fl:
    text = [i for i in fl.read().split('\n') if not i.startswith('//')]
    COMPANY_NAMES_STOP_WORDS = "(" + "$)|(".join(text) + "$)"

In [19]:
def get_company_names(company):
    company_names = [re.sub(COMPANY_NAMES_STOP_WORDS, '', company, flags=re.IGNORECASE).strip().lower()]
    if company in alternative_company_names:
        company_names.append(alternative_company_names[company].lower())
    #Company is often mentioned by part of it's name. e.g. "Royal Dutch Shell" -> "Shell"
#     company_names = set([company] + [i for i in company.split() if len(i)>2])
    return company_names

In [20]:
def find_company_confidence(row):
    company_names = get_company_names(row['company'])
    #if company name in title, it's about the company
    title_rank = 0
    if any([name in str(row['title']).lower() for name in company_names]):
        doc = nlp(row['title'])
        #if other organizations in title it's 1
        title_mentions = len([ent for ent in doc.ents if ent.label_ == "ORG"])
        if title_mentions == 1:
            #assumes only company in title
            title_rank = 2
        elif title_mentions:
            title_rank = 1
    text = row['text']
    doc = nlp(text)
    orgs = [re.sub(COMPANY_NAMES_STOP_WORDS, '', ent.lemma_.strip(), flags=re.IGNORECASE).strip().lower() for ent in doc.ents if ent.label_ == "ORG"]
    orgs_counter = Counter(orgs)
    #sometimes spacy does not recognize company name as org, try in other tokens
    occurences_company = 0
    for sent in doc.sents:
        for token in sent:
            if token.lemma_ in company_names:
                occurences_company += 1
                break
    
    occurences_total = len(orgs)

    if occurences_total == 0:
        fraction = 0
    else:
        fraction = occurences_company/occurences_total
#     if row['url'] == 'https://www.treehugger.com/corporate-responsibility/75-companies-which-backed-global-climate-coalition-lies-about-global-warming.html	':
#         import ipdb; ipdb.set_trace()    
    if fraction > 0.1 and occurences_company > 2:
        return 2, fraction
    if occurences_company > 1:
        if title_rank == 2:
            return 2, 'in_title'
        return 1, fraction
    elif title_rank == 1:
        return 1, 'in_title'
    return 0, fraction


In [23]:
company_confidence = []
company_fraction = []
# orgs_lst = []
for num, row in df.iterrows():
    c_confidence, frac = find_company_confidence(row)
    company_confidence.append(c_confidence)
    company_fraction.append(frac)
    if num%100 == 0:
        print(num)

df['company_confidence_estimation'] = company_confidence
df['company_fraction'] = company_fraction

0
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1800
1900
2000
2100
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3400
3500
3600
3700


In [24]:
df = df[df['company_confidence_estimation'] != 0]

In [26]:
OUTPUT_FILE = "filtered_by_company_confidence.csv"
df.to_csv(OUTPUT_FILE)

In [27]:
len(df)

1734

In [None]:
# print(classification_report(df['company_confidence'], df['company_confidence_estimation'], target_names=('0', '1', '2')))

In [None]:
# df1 = df.copy()
# df1.set_index(['url'], inplace=True)
# def highlight_different(s):
#     if s['company_confidence'] != s['company_confidence_estimation']:
#         return ['background-color: red' if v == s['company_confidence_estimation'] else '' for v in s ]
#     return ['' for v in s]
# df1 = df1[['company', 'company_confidence', 'company_confidence_estimation', 'company_fraction']]
# df1.style.apply(highlight_different, axis=1)[:100]

Climate Confidence Baseline:

In [None]:
df['climate_confidence'].value_counts(normalize=True)

In [None]:
climate_confidence = []
climate_fraction = []
KEYWORDS = ("climate", "fossil", "renewable", "carbon", "environment", "environmental", "warming", "sustainability", "sustainable")

for num, row in df.iterrows():
    in_title = False
    if any([name in str(row['title']).lower() for name in KEYWORDS]):
        in_title = True
        
    doc = nlp(row['text'])
    tokens = split_into_lemmas(doc)
    num = sum([tokens.count(word) for word in KEYWORDS])
#     import ipdb; ipdb.set_trace()
    fraction = num/len(tokens)
    if num >  4:
        climate_confidence.append(2)
    elif num > 2:
        if in_title:
            climate_confidence.append(2)
            climate_fraction.append('in title')
            continue
        climate_confidence.append(1)
    else:
        climate_confidence.append(0)
    climate_fraction.append(fraction)
df['climate_confidence_estimation'] = climate_confidence
df['climate_fraction'] = climate_fraction

In [None]:
df1 = df.copy()
df1.set_index(['url'], inplace=True)
def highlight_different(s):
    if s['climate_confidence'] != s['climate_confidence_estimation']:
        return ['background-color: red' if v == s['climate_confidence_estimation'] else '' for v in s ]
    return ['' for v in s]
df1 = df1[['climate_confidence', 'climate_confidence_estimation', 'climate_fraction']]
df1.style.apply(highlight_different, axis=1)
# df[['climate_confidence', 'climate_confidence_estimation', 'climate_fraction']]

In [None]:
X = df["text"]
y = df["climate_confidence"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
vect = CountVectorizer()
cls = SGDClassifier()

pipeline = Pipeline([
    ('vect', vect),
    ('tfidf', TfidfTransformer()),
    ('cls',cls),
#     ('stop_words', stop)
])
pipeline.fit(X_train,y_train)


In [None]:
print("Rule Based")
print(classification_report(df['climate_confidence'], df['climate_confidence_estimation'], target_names=('0', '1', '2')))
print("BoW")
print(classification_report(y_test, pipeline.predict(X_test), target_names=('0', '1', '2')))