In [39]:
import pandas as pd
import re
import spacy
from collections import Counter
import unicodedata
from bs4 import BeautifulSoup
from boilerpipe.extract import Extractor
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk.corpus import stopwords
stopwords = set(stopwords.words())
INPUT_FILE = "../1-Data/3-annotation/output.csv"
# INPUT_FILE = "filtered_by_company_confidence.csv"

nlp = spacy.load('en_core_web_sm')
alternative_company_names = {"AMD (Advanced Micro Devices)": "AMD",
                    'Royal Dutch Shell PLC': "Shell",
                    "Samsung Electronics Co., Ltd.": "Samsung",
                    "Goodyear Tire & Rubber Co": "Goodyear",
                    "Sumitomo Rubber Industries": "Sumitomo",
                    "Exxon Mobil Corp.": "ExxonMobil",
                    "General Motors Corp.": "GM",
                    "Ford Motor Co.": "Ford",
                    "Toyota Motor Corp.": "Toyota",
                    "Petro China": "PetroChina",
                    'Volkswagen AG': "VW"}

def clean_text(html):
    soup = BeautifulSoup(html, "html.parser") # create a new bs4 object from the html data loaded
    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
        script.extract()
    texts = soup.findAll(text=True)
#     import ipdb; ipdb.set_trace()
#     print(len(texts))
    text = ". ".join(t.strip() for t in texts)
    text = unicodedata.normalize("NFKD", text)
    return text

Clean text from html tags

In [40]:
df = pd.read_csv(INPUT_FILE)

In [41]:
len(df)

410

In [42]:
df = pd.read_csv(INPUT_FILE)
df['text'] = df.apply(lambda row: "{} {}".format(row['title'], clean_text(str(row['content']))), axis=1)
df.drop(df[df.text.str.len() < 150].index, inplace=True)

In [43]:
len(df)

393

In [44]:
# COMPANY_NAMES_STOP_WORDS = "PLC|Corp"
with open("company-suffix.txt", "r") as fl:
    text = [i for i in fl.read().split('\n') if not i.startswith('//')]
    COMPANY_NAMES_STOP_WORDS = "(" + "$)|(".join(text) + "$)"

In [45]:
def get_company_names(company):
    company_names = [re.sub(COMPANY_NAMES_STOP_WORDS, '', company, flags=re.IGNORECASE).strip().lower()]
    if company in alternative_company_names:
        company_names.append(alternative_company_names[company].lower())
    #Company is often mentioned by part of it's name. e.g. "Royal Dutch Shell" -> "Shell"
#     company_names = set([company] + [i for i in company.split() if len(i)>2])
    return company_names

In [46]:
# pd.options.display.max_colwidth = 10000
# row = df.loc[df['url'] == 'http://www.sustainablebrands.com/solutionproviders/basf']
# row
len(df)

393

In [47]:
target_names=('Strongly Negative', 'Negative', 'Neutral', 'Positive', 'Strongly Positive')
cleaned_df = df[(df['company_confidence'] != 0) & (df['climate_confidence'] != 0)]
cleaned_df = cleaned_df.dropna(subset =["sentiment"])

# cleaned_df.dropna(subset=['sentiment'], how='all', inplace = True)
simple_dc = {"Strongly Negative": '0', "Negative": '0', "Neutral": '2', "Positive": '1', "Strongly Positive": '1'}
cleaned_df['simple_sentiment'] = cleaned_df.apply(lambda row: simple_dc[row['sentiment']] ,axis=1)
print("Total: {} texts".format(len(cleaned_df)))
print("Sentiment counts:")
print(cleaned_df['sentiment'].value_counts(normalize=True))
print("Simple Sentiment counts:")
print(cleaned_df['simple_sentiment'].value_counts(normalize=True))

Total: 242 texts
Sentiment counts:
Positive             0.442149
Negative             0.260331
Neutral              0.148760
Strongly Positive    0.082645
Strongly Negative    0.066116
Name: sentiment, dtype: float64
Simple Sentiment counts:
1    0.524793
0    0.326446
2    0.148760
Name: simple_sentiment, dtype: float64


# BoW Again  

taken from https://gist.github.com/bbengfort/044682e76def583a12e6c09209c664a1

In [48]:
import os
import time
import string
import pickle

from operator import itemgetter

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split as tts


def timeit(func):
    """
    Simple timing decorator
    """
    def wrapper(*args, **kwargs):
        start  = time.time()
        result = func(*args, **kwargs)
        delta  = time.time() - start
        return result, delta
    return wrapper


def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg


class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transforms input data by using NLTK tokenization, lemmatization, and
    other normalization and filtering techniques.
    """

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Instantiates the preprocessor, which make load corpora, models, or do
        other time-intenstive NLTK data loading.
        """
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = set(stopwords) if stopwords else set(sw.words('english'))
        self.punct      = set(punct) if punct else set(string.punctuation)
        self.punct.add("“")
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """
        Fit simply returns self, no other information is needed.
        """
        return self

    def inverse_transform(self, X):
        """
        No inverse transformation
        """
        return X

    def transform(self, X):
        """
        Actually runs the preprocessing on each document.
        """
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue
                if tag == "NNP":
                    continue
                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform much more accurate WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)



@timeit
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True):
    """
    Builds a classifer for the given list of documents and targets in two
    stages: the first does a train/test split and prints a classifier report,
    the second rebuilds the model on the entire corpus and returns it for
    operationalization.
    X: a list or iterable of raw strings, each representing a document.
    y: a list or iterable of labels, which will be label encoded.
    Can specify the classifier to build with: if a class is specified then
    this will build the model with the Scikit-Learn defaults, if an instance
    is given, then it will be used directly in the build pipeline.
    If outpath is given, this function will write the model as a pickle.
    If verbose, this function will print out information to the command line.
    """

    @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            classifier = classifier(loss='log')

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
            ('classifier', classifier),
        ])

        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose: print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, secs = build(classifier, X_train, y_train)

    if verbose: print("Evaluation model fit in {:0.3f} seconds".format(secs))
    if verbose: print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    if verbose: print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    model.labels_ = labels

    if verbose: print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model


def show_most_informative_features(model, text=None, n=20):
    """
    Accepts a Pipeline with a classifer and a TfidfVectorizer and computes
    the n most informative features of the model. If text is given, then will
    compute the most informative features for classifying that text.
    Note that this function will only work on linear models with coefs_
    """
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['vectorizer']
    classifier = model.named_steps['classifier']

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {} model.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Compute the coefficients for the text
        tvec = model.transform([text]).toarray()
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # Zip the feature names with the coefs and sort
    coefs = sorted(
        zip(tvec[0], vectorizer.get_feature_names()),
        key=itemgetter(0), reverse=False
    )

    topn  = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append("Classified as: {}".format(model.predict([text])))
        output.append("")

    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(cp, fnp, cn, fnn)
        )

    return "\n".join(output)


In [49]:
get_company_names(list(cleaned_df["company"])[1])
# list(cleaned_df["company"])[1]

['basf']

In [50]:
# get_company_names(list(cleaned_df["company"])[10])
# cleaned_df['lemmatized'] = cleaned_df.apply(lambda row: " ".join(tokenize(row['text'], get_company_names(row["company"]))), axis=1)

In [51]:
# list(cleaned_df["lemmatized"])[0]
# cleaned_df["sentiment"].unique()

In [52]:
X = cleaned_df["text"]
y = cleaned_df["sentiment"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
model, secs = build_and_evaluate(X,y)
print(show_most_informative_features(model))

Building for evaluation




Evaluation model fit in 21.489 seconds
Classification Report:

                   precision    recall  f1-score   support

         Negative       0.75      0.50      0.60        18
          Neutral       0.20      0.25      0.22         4
         Positive       0.67      0.87      0.75        23
Strongly Negative       0.00      0.00      0.00         1
Strongly Positive       0.00      0.00      0.00         3

      avg / total       0.60      0.61      0.59        49

Building complete model and saving ...
Complete model fit in 24.301 seconds
-3.4183            cut    4.2006           coal
-2.8444             ”.    3.5445           file
-2.8210            use    3.2368      fiduciary
-2.8201           well    3.0694            sea
-2.8161         ruling    3.0612        company
-2.7291       emission    2.8979        general
-2.7215          today    2.8921          shale
-2.6423       business    2.8772       refinery
-2.6237         reduce    2.7031       ministry
-2.5623      



In [53]:
X = cleaned_df["text"]
y = cleaned_df["simple_sentiment"]
# test_size = 0.2
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
#                                                     stratify=y)
model, secs = build_and_evaluate(X,y, outpath="model.pickle")
print(show_most_informative_features(model))

Building for evaluation




Evaluation model fit in 19.489 seconds
Classification Report:

             precision    recall  f1-score   support

          0       0.85      0.69      0.76        16
          1       0.76      0.96      0.85        26
          2       0.33      0.14      0.20         7

avg / total       0.73      0.76      0.73        49

Building complete model and saving ...
Complete model fit in 23.796 seconds
Model written out to model.pickle
-3.0646       electric    4.8334            oil
-2.9960         energy    4.0591       fracking
-2.8516       business    4.0484           coal
-2.7830          court    3.5703       sanction
-2.7705            use    3.4225          shale
-2.5381        vehicle    3.3215        company
-2.5043          power    2.7926          tonne
-2.4668        percent    2.7369           file
-2.4271          issue    2.6320      fiduciary
-2.2902      renewable    2.6318         report
-2.2831         global    2.5883        scandal
-2.2749         ruling    2.585



# Okay, let's estimate 50 companies

In [54]:
import pickle
with open("model.pickle", "rb") as fl:
    model = pickle.load(fl)
df = pd.read_csv("../1-Data/3-annotation/compiled_output.csv")
df.columns
df['text'] = df.apply(lambda row: "{} {}".format(row['title'], clean_text(str(row['content']))), axis=1)
df.drop(df[df.text.str.len() < 150].index, inplace=True)

In [55]:
len(df)

3673

In [56]:
companies = df.groupby("company", as_index=False)
rows_list = []
for num, cp in enumerate(list(companies.groups.keys())):
    neg_url = ''
    pos_url = ''
    cp_df = df.loc[df['company'] == cp]
#     cp_df['lemmatized'] = cp_df.apply(lambda row: " ".join(tokenize(row['text'], get_company_names(row[""]))), axis=1)
    urls = list(cp_df['url'])
#     cp_df['text'] = cp_df.apply(lambda row: "{} {}".format(row[''], clean_text(str(row['content']))), axis=1)
    
    res = model.predict(cp_df['text'])
    counter = Counter(res)
    probs = model.predict_proba(cp_df['text'])
    
    negs, pos, neut = zip(*probs)
    neg_url = urls[negs.index(max(negs))]
    pos_url = urls[pos.index(max(pos))]

    rows_list.append({"company": cp, "pos_c":Counter(res)[1], "neg_c": Counter(res)[0], 'pos_p': sum(pos), 'neg_p': sum(negs), 'neg_url': f"<a href='{neg_url}' target='_blank'>Clickme</a>", 'pos_url': f"<a href='{pos_url}' target='_blank'>Clickme</a>"})#, "neg_url": neg_url,"pos_url": pos_url })
    print(f"{cp}: p: {Counter(res)[1]}, n: {Counter(res)[0]},  'pos_p': {sum(pos)}, 'neg_p': {sum(negs)}, neg_url: {neg_url},  pos_url: {pos_url},")
    
final_df =  pd.DataFrame(rows_list) 

ABB: p: 54, n: 0,  'pos_p': 46.17268510173781, 'neg_p': 2.8735556377522933, neg_url: https://www.theguardian.com/travel/2012/may/13/st-abbs-scottish-borders-walk,  pos_url: http://novabus.com/2017/02/09/abb-nova-bus-announce-collaboration-electric-transportation/,
AMD (Advanced Micro Devices): p: 57, n: 0,  'pos_p': 44.48950614639514, 'neg_p': 7.481047660856088, neg_url: https://www.thenational.ae/business/markets/mubadala-investment-sells-shares-in-amd-1.616794,  pos_url: https://obamawhitehouse.archives.gov/the-press-office/2015/11/30/white-house-announces-additional-commitments-american-business-act,
Agnico-Eagle Mines Ltd.: p: 31, n: 8,  'pos_p': 22.068686532675862, 'neg_p': 11.644857206665176, neg_url: https://www.elementascience.org/articles/10.1525/elementa.281/,  pos_url: https://www.agnicoeagle.com/English/sustainability/standards/default.aspx,
Amazon.com Inc.: p: 49, n: 1,  'pos_p': 38.271009333957856, 'neg_p': 7.25637231508499, neg_url: https://www.investopedia.com/news/amaz

Ineos: p: 8, n: 41,  'pos_p': 8.412232353803839, 'neg_p': 38.137710628782415, neg_url: http://powerbase.info/index.php/Ineos,  pos_url: https://friendsoftheearth.uk/climate-change/rejection-ineos-test-drill-application-south-yorkshire-welcomed,
International Paper Corp.: p: 30, n: 10,  'pos_p': 25.337729680895617, 'neg_p': 11.735877558218363, neg_url: https://www.independent.co.uk/environment/exxonmobil-climate-change-oil-gas-fossil-fuels-global-warming-harvard-a7908541.html,  pos_url: https://www.softbank.jp/en/corp/csr/future/instance_04/contents_02/,
Johnson & Johnson: p: 68, n: 4,  'pos_p': 48.78110216928309, 'neg_p': 15.505550122544319, neg_url: http://www.politifact.com/wisconsin/statements/2016/aug/12/ron-johnson/no-climate-warming-quite-few-years-gop-sen-ron-joh/,  pos_url: http://healthforhumanityreport.jnj.com/climate-and-energy,
Kimberly-Clark Corp.: p: 57, n: 2,  'pos_p': 46.97649202330911, 'neg_p': 7.4531018675779, neg_url: https://insideclimatenews.org/news/04102017/green

In [57]:
categories = {"Food": ["Cargill Inc", "Nestle SA", "Procter & Gamble, Co.", "PepsiCo Inc.", "Bunge"],
            "Chemicals": ["BASF AG", "Johnson & Johnson", "Bayer AG", "Ineos", "Dow Chemical"],
            "Electronics": ["Samsung Electronics Co., Ltd.", "Sony Corp.", "AMD (Advanced Micro Devices)", "Toshiba Corp.", "Motorola Inc."],
            "Metals & mining": ["Glencore International AG", "ArcelorMittal", "Agnico-Eagle Mines Ltd.", "ThyssenKrupp Stahl", "Arcelor SA"],
            "Tires": ["Michelin", "Bridgestone Corp.", "Continental AG", "Goodyear Tire & Rubber Co", "Sumitomo Rubber Industries"],
            "Energy & water": ["GazProm", "Duke Energy", "Petrobras", "Petro China", "RosNeft", "BP plc", "Exxon Mobil Corp.", "Total SA", 'Royal Dutch Shell PLC', "Chevron Corp."],
            "Automotive & transport": ["General Motors Corp.", "BMW", "ABB", "Daimler AG", "Ford Motor Co.", "Toyota Motor Corp.", "Volkswagen AG"],
            "Wood & paper products": ["International Paper Corp.", "Georgia Pacific", "Weyerhaeuser Co.", "Stora Enso Oyj", "Kimberly-Clark Corp."],
            "Computer services & software": ["IBM", "Microsoft Corp.", "EDS Corp.", "AuFeminin.com", "Amazon.com Inc.", "Apple Inc"],
#             "Dirty Dozen": ["GazProm", "Petrobras", "RosNeft", "Petro China", "Duke Energy"]             
}
companies_dc = {}
for cat, companies in categories.items():
    for c in companies:
        companies_dc[c] = cat
cats = list(categories.keys())
final_df['category'] = final_df.apply(lambda row: companies_dc[row['company']], axis=1)
import matplotlib.pyplot as plt
from matplotlib import colors
def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 
    RGB color; the keyword argument name must be a standard mpl colormap name.'''
    return plt.cm.get_cmap(name, n)
cmap = get_cmap(len(cats))
#http://www.climatechangenews.com/2015/05/21/the-dirty-dozen-the-fossil-fuel-industrys-polluting-league-table/
dirty_dozen = ["GazProm", "Petrobras", "RosNeft", "Petro China", "Glencore International AG", 'Royal Dutch Shell PLC', "Exxon Mobil Corp.", "BP plc", "Chevron Corp."]

def highlight_categories(row):
    return [f'background-color: {colors.rgb2hex(cmap(cats.index(row["category"])))}' for i in row]

def highlight_dirty_dozen(row):
    if row["company"] in dirty_dozen:
        return ['background-color: red' for i in row]
    else:
        return ["" for i in row]


In [58]:
# lemmatizer = WordNetLemmatizer()
# stopwords = set(sw.words('english'))
# def lemmatize(token, tag):
#     """
#     Converts the Penn Treebank tag to a WordNet POS tag, then uses that
#     tag to perform much more accurate WordNet lemmatization.
#     """
#     tag = {
#         'N': wn.NOUN,
#         'V': wn.VERB,
#         'R': wn.ADV,
#         'J': wn.ADJ
#     }.get(tag[0], wn.NOUN)

#     return lemmatizer.lemmatize(token, tag)

# def tokenize(document, company_names):
#     doc_tokens = []

#     for sent in sent_tokenize(document):
#         company_in_sent = False
#         sent_tokens = []
#         # Break the sentence into part of speech tagged tokens
#         for token, tag in pos_tag(wordpunct_tokenize(sent)):
#             # Apply preprocessing to the token
#             token = token.lower()
#             token = token.strip()
#             token = token.strip('_')
#             token = token.strip('*')

#             # If punctuation or stopword, ignore token and continue
# #             import ipdb; ipdb.set_trace()
#             if token in stopwords or all(char in set(string.punctuation) for char in token):
#                 continue
#             if tag == "NNP":
#                 continue
#             # Lemmatize the token and yield
#             lemma = lemmatize(token, tag)
#             if lemma in company_names:
#                 company_in_sent = True
#             sent_tokens.append(lemma)
#         if company_in_sent:
            
#             doc_tokens.extend(sent_tokens)
# #                 break
# #     if len(doc_tokens) == 0:
# #         print(document)
# #         print(company_names)
# #         import ipdb; ipdb.set_trace()
# #     print(company_names[0], len(doc_tokens))
#     return doc_tokens

### Order by absolute number of negatives/positives.  Highlight sectors

In [59]:
final_df['mark'] = final_df['neg_c']/final_df['pos_c']
final_df = final_df.sort_values(['mark'], ascending=[0])
# final_df['pos_url'] = final_df.apply(lambda row: f"<a href='{row['pos_url']}'>{row['pos_url']}</a>", axis=1)

final_df.style.apply(highlight_categories, axis=1)
# HTML(final_df.to_html(escape=False))
# final_df.style.apply(highlight_dirty_dozen, axis=1)

Unnamed: 0,company,neg_c,neg_p,neg_url,pos_c,pos_p,pos_url,category,mark
15,Chevron Corp.,98,82.8942,Clickme,16,26.0936,Clickme,Energy & water,6.125
41,RosNeft,84,67.9076,Clickme,15,22.3345,Clickme,Energy & water,5.6
29,Ineos,41,38.1377,Clickme,8,8.41223,Clickme,Chemicals,5.125
10,BP plc,27,22.7566,Clickme,10,11.0829,Clickme,Energy & water,2.7
21,Exxon Mobil Corp.,35,31.6182,Clickme,16,17.7757,Clickme,Energy & water,2.1875
23,GazProm,38,33.4435,Clickme,27,26.9264,Clickme,Energy & water,1.40741
42,Royal Dutch Shell PLC,66,59.8512,Clickme,47,45.74,Clickme,Energy & water,1.40426
26,Glencore International AG,15,13.3304,Clickme,18,15.9354,Clickme,Metals & mining,0.833333
39,Petrobras,47,45.8207,Clickme,58,51.0858,Clickme,Energy & water,0.810345
38,Petro China,32,32.757,Clickme,59,47.3584,Clickme,Energy & water,0.542373


# Naive validation: highlight Dirty Dozen

In [60]:
final_df.style.apply(highlight_dirty_dozen, axis=1)

Unnamed: 0,company,neg_c,neg_p,neg_url,pos_c,pos_p,pos_url,category,mark
15,Chevron Corp.,98,82.8942,Clickme,16,26.0936,Clickme,Energy & water,6.125
41,RosNeft,84,67.9076,Clickme,15,22.3345,Clickme,Energy & water,5.6
29,Ineos,41,38.1377,Clickme,8,8.41223,Clickme,Chemicals,5.125
10,BP plc,27,22.7566,Clickme,10,11.0829,Clickme,Energy & water,2.7
21,Exxon Mobil Corp.,35,31.6182,Clickme,16,17.7757,Clickme,Energy & water,2.1875
23,GazProm,38,33.4435,Clickme,27,26.9264,Clickme,Energy & water,1.40741
42,Royal Dutch Shell PLC,66,59.8512,Clickme,47,45.74,Clickme,Energy & water,1.40426
26,Glencore International AG,15,13.3304,Clickme,18,15.9354,Clickme,Metals & mining,0.833333
39,Petrobras,47,45.8207,Clickme,58,51.0858,Clickme,Energy & water,0.810345
38,Petro China,32,32.757,Clickme,59,47.3584,Clickme,Energy & water,0.542373


### Order by sum of probabilities negatives/positives

In [None]:
# final_df['mark'] = final_df['neg_p']/final_df['pos_p']
# final_df = final_df.sort_values(['mark'], ascending=[0])
# final_df.style.apply(highlight_categories, axis=1)

### Order by avg probability negatives/positives

In [None]:
# final_df['mark'] =  (final_df['pos_p']/final_df['pos_c'])- (final_df['neg_p']/final_df['neg_c'])
# final_df.sort_values(['mark'], ascending=[0])
# final_df.style.apply(highlight_categories, axis=1)

In [None]:
# print(f"length before: {len(df)}")
# df['title'] = df['']
# df['company'] = df['']
# df['text'] = df.apply(lambda row: "{} {}".format(row[''], clean_text(str(row['content']))), axis=1)

# for num, row in df.iterrows():
#     cc = find_company_confidence(row)
#     if cc == 0:
        

In [None]:
# df['company_confidence'] = df.apply(lambda row: find_company_confidence(row) ,axis=1)
# df = df[df['company_confidence'][0] != 0]
# print(f"length after: {len(df)}")

In [None]:
# df['company_confidence'] = df.apply(lambda row: row['company_confidence'][0] ,axis=1)

In [None]:
# df = df[df['company_confidence'] != 0]
# print(f"length after: {len(df)}")

In [None]:
# companies = df.groupby("", as_index=False)
# rows_list = []
# for num, cp in enumerate(list(companies.groups.keys())):
#     cp_df = df.loc[df[''] == cp]
#     res = model.predict(cp_df['text'])
#     rows_list.append({"company": cp, "pos":Counter(res)[1], "neg": Counter(res)[0], "mark": Counter(res)[0]/Counter(res)[1] })
#     print(f"{cp}: p: {Counter(res)[1]}, n: {Counter(res)[0]}, {round(Counter(res)[0]/Counter(res)[1],3)}")
    
# final_df =  pd.DataFrame(rows_list) 
# final_df.sort_values(['mark'], ascending=[0])

In [None]:
X = cleaned_df["text"]
y = cleaned_df["simple_sentiment"]
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42,
                                                    stratify=y)
print(f"Num. of train: {len(X_train)}, Num. of test: {len(X_test)}")
priors = y_train.value_counts(normalize=True).values
vectorizer = TfidfVectorizer(**tf_params)
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)
clf = MultinomialNB()
clf.fit(train.toarray(), y_train)
pred = clf.predict(test.toarray())
print(classification_report(y_test, pred, target_names=[0,1,2]))