In [54]:
import pandas as pd
import re
import spacy
from collections import Counter
from bs4 import BeautifulSoup
from boilerpipe.extract import Extractor
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.corpus import stopwords
stop = set(stopwords.words())
ANNOTATION_RESULTS = "../1-Data/3-annotation/output.csv"
df = pd.read_csv(ANNOTATION_RESULTS)
nlp = spacy.load('en_core_web_sm')

def cleanMe(html):
    soup = BeautifulSoup(html) # create a new bs4 object from the html data loaded
    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
        script.extract()
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
df['text'] = df.apply(lambda row: "{} {}".format(row['title'], cleanMe(row['full_text'])), axis=1)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


## Company Confidence Baseline:

In [55]:
COMPANY_NAMES_STOP_WORDS = r"PLC|Corp|Corporation|—$"
df['Answer.CompanyConfidence'].value_counts(normalize=True)

2    0.576271
1    0.271186
0    0.152542
Name: Answer.CompanyConfidence, dtype: float64

The baseline is built on frequency of mentions of the company, comparing to other organizations:  
2 - Very Confident that the text is about this company. Itі mention fraction => 10% and at least 2 mentions.  
1 - Moderate. It is mentioned but not the fraction < 10%.  
0 - Not related to the company. The company is not mentioned in text.  

In [56]:
company_confidence = []
company_fraction = []

for num, row in df.iterrows():
    company = re.sub(COMPANY_NAMES_STOP_WORDS, '', row['company'], flags=re.IGNORECASE).lower()
    #Company is often mentioned by part of it's name. e.g. "Royal Dutch Shell" -> "Shell"
    company_names = set([company] + company.split())
    text = row['text']
    doc = nlp(text)
    orgs = [re.sub(COMPANY_NAMES_STOP_WORDS, '', ent.text, flags=re.IGNORECASE).strip().lower() for ent in doc.ents if ent.label_ == "ORG"]
    orgs_counter = Counter(orgs)
    occurences_company = sum([orgs_counter.get(name, 0) for name in company_names])
    occurences_total = sum(orgs_counter.values())
    if occurences_total == 0:
        fraction = 0
    else:
        fraction = occurences_company/occurences_total
    company_fraction.append(fraction)
    if occurences_company == 0:
        company_confidence.append(0)
    elif fraction > 0.1 and occurences_company > 1:
        company_confidence.append(2)
    else:
        company_confidence.append(1)

            
df['company_confidence_estimation'] = company_confidence
df['company_fraction'] = company_fraction

In [57]:
print(classification_report(df['Answer.CompanyConfidence'], df['company_confidence_estimation'], target_names=('0', '1', '2')))

             precision    recall  f1-score   support

          0       0.88      0.78      0.82         9
          1       0.88      0.88      0.88        16
          2       0.97      1.00      0.99        34

avg / total       0.93      0.93      0.93        59



Highlight wrong values:

In [58]:
df.set_index(['url'], inplace=True)
def highlight_different(s):
    if s['Answer.CompanyConfidence'] != s['company_confidence_estimation']:
        return ['background-color: red' if v == s['company_confidence_estimation'] else '' for v in s ]
    return ['' for v in s]
df2 = df[['Answer.CompanyConfidence', 'company_confidence_estimation', 'company_fraction']]
df2.style.apply(highlight_different, axis=1)

Unnamed: 0_level_0,Answer.CompanyConfidence,company_confidence_estimation,company_fraction
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
http://blogs.edf.org/energyexchange/2017/08/25/heres-how-chevrons-next-ceo-can-turn-over-a-new-leaf/,2,2,0.5
http://climatecasechart.com/case/people-state-california-v-bp-plc-oakland/,2,2,0.108911
http://fortune.com/2016/05/24/exxonmobil-chevron-shareholder-meetings-climate/,2,2,0.151515
http://reports.shell.com/annual-report/2017/strategic-report/climate-change-and-energy-transition/risks-and-opportunities.php,2,2,0.352273
http://reports.weforum.org/global-strategic-foresight/katell-le-goulven-unicef-agile-development/,0,0,0.0
http://waterfootprint.org/en/about-us/news/news/coke-nike-call-climate-change-commercial-threat/,1,1,0.04
http://www.bdlaw.com/news-2211.html,2,2,0.117647
http://www.chicagotribune.com/business/sns-bc-us--california-climate-change-lawsuits-20180321-story.html,2,2,0.272727
http://www.climateactionprogramme.org/climate-leader-interviews/interview-with-ursula-mathar-vp-bmw-group,2,2,0.258065
http://www.climatekeys.com/climate-change-litigation-news,1,0,0.0


## Sentiment Baseline:  
Based on BoW

In [60]:
target_names=('Strongly Negative', 'Negative', 'Neutral', 'Positive', 'Strongly Positive')
df['Answer.sentiment'].value_counts(normalize=True)

Positive             0.288136
Negative             0.254237
Strongly Positive    0.203390
Neutral              0.152542
Strongly Negative    0.101695
Name: Answer.sentiment, dtype: float64

In [62]:
X = df["text"]
y = df["Answer.sentiment"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")

Num. of train: 47, Num. of test: 12


In [63]:
tf_params = {"lowercase": True,
             "analyzer": "char_wb",
             "stop_words": stop,
             "ngram_range": (3, 3),
             "min_df": 1,
             "max_df": 1.0,
             "preprocessor": None,
             "max_features": 3500,
             "norm": None,
             "use_idf": True
             }

In [67]:
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)
clf = MultinomialNB()
clf.fit(train.toarray(), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Train Prediction

In [68]:
pred = clf.predict(train.toarray())
print(classification_report(y_train, pred, target_names=target_names))

                   precision    recall  f1-score   support

Strongly Negative       0.86      1.00      0.92        12
         Negative       1.00      0.86      0.92         7
          Neutral       1.00      0.92      0.96        13
         Positive       1.00      1.00      1.00         5
Strongly Positive       1.00      1.00      1.00        10

      avg / total       0.96      0.96      0.96        47



Test prediction

In [69]:
pred = clf.predict(test.toarray())
print(classification_report(y_test, pred, target_names=target_names))

                   precision    recall  f1-score   support

Strongly Negative       1.00      0.33      0.50         3
         Negative       1.00      0.50      0.67         2
          Neutral       0.44      1.00      0.62         4
         Positive       0.00      0.00      0.00         1
Strongly Positive       0.00      0.00      0.00         2

      avg / total       0.56      0.50      0.44        12



  'precision', 'predicted', average, warn_for)


## Company Confidence Baseline:  
Using the same BoW technique as above

In [73]:
y = df["Answer.ClimateConfidence"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)
clf = MultinomialNB()
clf.fit(train.toarray(), y_train)
pred = clf.predict(test.toarray())
print(classification_report(y_test, pred, target_names=target_names))

                   precision    recall  f1-score   support

Strongly Negative       0.00      0.00      0.00         1
         Negative       0.00      0.00      0.00         2
          Neutral       0.75      1.00      0.86         9

      avg / total       0.56      0.75      0.64        12



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
