In [1]:
import pandas as pd
import re
import spacy
from collections import Counter
from bs4 import BeautifulSoup
from boilerpipe.extract import Extractor
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.corpus import stopwords
stop = set(stopwords.words())
ANNOTATION_RESULTS = "../1-Data/3-annotation/output.csv"
df = pd.read_csv(ANNOTATION_RESULTS)
nlp = spacy.load('en_core_web_sm')

Clean text from html tags

In [2]:
def cleanMe(html):
    soup = BeautifulSoup(html) # create a new bs4 object from the html data loaded
    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
        script.extract()
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
df['text'] = df.apply(lambda row: "{} {}".format(row['title'], cleanMe(row['full_text'])), axis=1)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


## Company Confidence Baseline:

In [3]:
COMPANY_NAMES_STOP_WORDS = r"PLC|Corp|Corporation|—$"
df['Answer.CompanyConfidence'].value_counts(normalize=True)

2    0.673077
1    0.211538
0    0.115385
Name: Answer.CompanyConfidence, dtype: float64

The baseline is built on frequency of mentions of the company, comparing to other organizations:  
2 - Very Confident that the text is about this company. It's mention fraction => 10% and at least 2 mentions.  
1 - Moderate. It is mentioned but the fraction < 10%.  
0 - Not related to the company. The company is not mentioned in text.  
TODO: take in account how many organizations mentioned, how many words in the text etc.

In [4]:
company_confidence = []
company_fraction = []

for num, row in df.iterrows():
    company = re.sub(COMPANY_NAMES_STOP_WORDS, '', row['company'], flags=re.IGNORECASE).lower()
    #Company is often mentioned by part of it's name. e.g. "Royal Dutch Shell" -> "Shell"
    company_names = set([company] + company.split())
    text = row['text']
    doc = nlp(text)
    orgs = [re.sub(COMPANY_NAMES_STOP_WORDS, '', ent.text, flags=re.IGNORECASE).strip().lower() for ent in doc.ents if ent.label_ == "ORG"]
    orgs_counter = Counter(orgs)
    occurences_company = sum([orgs_counter.get(name, 0) for name in company_names])
    occurences_total = sum(orgs_counter.values())
    if occurences_total == 0:
        fraction = 0
    else:
        fraction = occurences_company/occurences_total
    company_fraction.append(fraction)
    if occurences_company == 0:
        company_confidence.append(0)
    elif fraction > 0.1 and occurences_company > 1:
        company_confidence.append(2)
    else:
        company_confidence.append(1)

df['company_confidence_estimation'] = company_confidence
df['company_fraction'] = company_fraction

In [5]:
print(classification_report(df['Answer.CompanyConfidence'], df['company_confidence_estimation'], target_names=('0', '1', '2')))

             precision    recall  f1-score   support

          0       0.23      0.75      0.35        12
          1       0.84      0.73      0.78        22
          2       0.93      0.61      0.74        70

avg / total       0.83      0.65      0.70       104



Debugging: Highlight wrong values

In [6]:
df.set_index(['url'], inplace=True)
def highlight_different(s):
    if s['Answer.CompanyConfidence'] != s['company_confidence_estimation']:
        return ['background-color: red' if v == s['company_confidence_estimation'] else '' for v in s ]
    return ['' for v in s]
df2 = df[['Answer.CompanyConfidence', 'company_confidence_estimation', 'company_fraction']]
df2.style.apply(highlight_different, axis=1)

Unnamed: 0_level_0,Answer.CompanyConfidence,company_confidence_estimation,company_fraction
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
http://barrelperday.com/tag/gazprom/,2,0,0.0
http://blogs.edf.org/energyexchange/2017/08/25/heres-how-chevrons-next-ceo-can-turn-over-a-new-leaf/,2,2,0.5
http://bruegel.org/2017/06/nord-stream-2-can-wait/,2,0,0.0
http://climatecasechart.com/case/people-state-california-v-bp-plc-oakland/,2,2,0.108911
http://ens-newswire.com/2013/12/20/russia-grants-amnesty-to-greenpeacers-gazprom-arctic-oil-flows/,2,0,0.0
http://fortune.com/2016/05/24/exxonmobil-chevron-shareholder-meetings-climate/,2,2,0.151515
http://microgridmedia.com/abb-indian-institute-technology-microgrid-partnership-boost-climate-smart-rural-electrification-drive/,2,2,0.346154
http://novabus.com/2017/02/09/abb-nova-bus-announce-collaboration-electric-transportation/,2,2,0.433333
http://reports.shell.com/annual-report/2017/strategic-report/climate-change-and-energy-transition/risks-and-opportunities.php,2,2,0.352273
http://reports.weforum.org/global-strategic-foresight/katell-le-goulven-unicef-agile-development/,0,0,0.0


## Climate Confidence Baseline:  
Based on BoW
Distribution:

In [7]:
df['Answer.ClimateConfidence'].value_counts(normalize=True)

2    0.740385
1    0.163462
0    0.096154
Name: Answer.ClimateConfidence, dtype: float64

In [13]:
X = df["text"]
y = df["Answer.ClimateConfidence"]
test_size = 0.2
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": stop,
             "ngram_range": (3, 3),
             "min_df": 1,
             "max_df": 1.0,
             "preprocessor": None,
             "max_features": 3500,
             "norm": None,
             "use_idf": True
             }
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)
clf = MultinomialNB()
clf.fit(train.toarray(), y_train)
pred = clf.predict(test.toarray())
print(classification_report(y_test, pred, target_names=('0', '1', '2')))

             precision    recall  f1-score   support

          0       1.00      0.50      0.67         2
          1       0.00      0.00      0.00         3
          2       0.80      1.00      0.89        16

avg / total       0.70      0.81      0.74        21



  'precision', 'predicted', average, warn_for)


## Sentiment Baseline:  
Using the same BoW technique as above  
I exclude texts that are not related to the company or climate change

In [9]:
target_names=('Strongly Negative', 'Negative', 'Positive', 'Strongly Positive')
cleaned_df = df[(df['Answer.CompanyConfidence'] != 0) & (df['Answer.ClimateConfidence'] != 0)]
print("Total: {} texts".format(len(cleaned_df)))
cleaned_df['Answer.sentiment'].value_counts(normalize=True)

Total: 82 texts


Strongly Positive    0.365854
Positive             0.280488
Negative             0.243902
Strongly Negative    0.085366
Neutral              0.024390
Name: Answer.sentiment, dtype: float64

In [10]:
#Temporary remove Neutral as its too small
cleaned_df = cleaned_df[cleaned_df['Answer.sentiment'] != 'Neutral']
X = cleaned_df["text"]
y = cleaned_df["Answer.sentiment"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 64, Num. of test: 16


In [11]:
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)
clf = MultinomialNB()
clf.fit(train.toarray(), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Test prediction

In [12]:
pred = clf.predict(test.toarray())
print(classification_report(y_test, pred, target_names=target_names))

                   precision    recall  f1-score   support

Strongly Negative       0.50      0.25      0.33         4
         Negative       0.14      0.20      0.17         5
         Positive       0.00      0.00      0.00         1
Strongly Positive       0.50      0.50      0.50         6

      avg / total       0.36      0.31      0.32        16

