In [6]:

import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import unicodedata
from bs4 import BeautifulSoup
INPUT_FILE = "../1-Data/3-annotation/output.csv"
alternative_company_names = {"AMD (Advanced Micro Devices)": "AMD",
                    'Royal Dutch Shell PLC': "Shell",
                    "Samsung Electronics Co., Ltd.": "Samsung",
                    "Goodyear Tire & Rubber Co": "Goodyear",
                    "Sumitomo Rubber Industries": "Sumitomo",
                    "Exxon Mobil Corp.": "ExxonMobil",
                    "General Motors Corp.": "GM",
                    "Ford Motor Co.": "Ford",
                    "Toyota Motor Corp.": "Toyota",
                    "Petro China": "PetroChina",
                    'Volkswagen AG': "VW"}

def clean_text(html):
    soup = BeautifulSoup(html, "html.parser") # create a new bs4 object from the html data loaded
    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
        script.extract()
    texts = soup.findAll(text=True)
#     import ipdb; ipdb.set_trace()
#     print(len(texts))
    text = ". ".join(t.strip() for t in texts)
    text = unicodedata.normalize("NFKD", text)
    return text


In [9]:
df = pd.read_csv(INPUT_FILE)
df['text'] = df.apply(lambda row: "{} {}".format(row['title'], clean_text(str(row['content']))), axis=1)
df.drop(df[df.text.str.len() < 150].index, inplace=True)

cleaned_df = df[(df['company_confidence'] != 0) & (df['climate_confidence'] != 0)]
cleaned_df = cleaned_df.dropna(subset =["sentiment"])

# cleaned_df.dropna(subset=['sentiment'], how='all', inplace = True)
simple_dc = {"Strongly Negative": '0', "Negative": '0', "Neutral": '2', "Positive": '1', "Strongly Positive": '1'}
cleaned_df['simple_sentiment'] = cleaned_df.apply(lambda row: simple_dc[row['sentiment']] ,axis=1)

In [25]:
data = cleaned_df
data['text']= [x.lower() for x in data['text']]
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

tokenizer = Tokenizer(nb_words=5000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                   lower=True,split=' ')
tokenizer.fit_on_texts(data["text"].values)

X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

embed_dim = 128
lstm_out = 196
batch_size = 10

model = Sequential()
model.add(Embedding(5000, embed_dim,input_length = X.shape[1], dropout = 0.1))
model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.1))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Y = pd.get_dummies(data['simple_sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
model.fit(X_train, Y_train, batch_size = batch_size, epochs = 1, verbose = 2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("Score: %.2f" % (score))
print("Validation Accuracy: %.2f" % (acc))

# #NN 5 cats
# Y = pd.get_dummies(data['sentiment']).values
# X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
# print(X_train.shape,Y_train.shape)
# print(X_test.shape,Y_test.shape)

# embed_dim = 128
# lstm_out = 196
# batch_size = 32

# model = Sequential()
# model.add(Embedding(5000, embed_dim,input_length = X.shape[1], dropout = 0.2))
# model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2))
# model.add(Dense(3,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# print(model.summary())
# model.fit(X_train, Y_train, batch_size = batch_size, nb_epoch = 5, verbose = 2)


# score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
# print("Score: %.2f" % (score))
# print("Validation Accuracy: %.2f" % (acc))




# embed_dim = 128
# lstm_out = 196
# batch_size = 32

# model = Sequential()
# model.add(Embedding(2000, embed_dim,input_length = X.shape[1], dropout = 0.2))
# model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2))
# model.add(Dense(3,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# print(model.summary())



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 6291, 128)         640000    
_________________________________________________________________
lstm_16 (LSTM)               (None, 196)               254800    
_________________________________________________________________
dense_16 (Dense)             (None, 3)                 591       
Total params: 895,391
Trainable params: 895,391
Non-trainable params: 0
_________________________________________________________________
None
(203, 6291) (203, 3)
(51, 6291) (51, 3)
Epoch 1/1
 - 290s - loss: 1.0163 - acc: 0.5419
Score: 0.96
Validation Accuracy: 0.57


In [26]:
from sklearn.metrics import classification_report as clsr
y_pred = model.predict(X_test)
# print(clsr(Y_test, y_pred, target_names=[0,1,2]))

ValueError: Mix type of y not allowed, got types {'multilabel-indicator', 'continuous-multioutput'}

In [29]:
y_pred = model.predict_classes(X_test)
y_pred 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1])

In [45]:
embed_dim = 128
lstm_out = 96
batch_size = 10

model = Sequential()
model.add(Embedding(5000, embed_dim,input_length = X.shape[1], dropout = 0.2))
model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Y = pd.get_dummies(data['simple_sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
model.fit(X_train, Y_train, batch_size = batch_size, nb_epoch = 8, verbose = 2)

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("Score: %.2f" % (score))
print("Validation Accuracy: %.2f" % (acc))
model.predict_classes(X_test)

  
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 6291, 128)         640000    
_________________________________________________________________
lstm_20 (LSTM)               (None, 96)                86400     
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 291       
Total params: 726,691
Trainable params: 726,691
Non-trainable params: 0
_________________________________________________________________
None
(203, 6291) (203, 3)
(51, 6291) (51, 3)




Epoch 1/8
 - 257s - loss: 1.0554 - acc: 0.5468
Epoch 2/8
 - 196s - loss: 0.9450 - acc: 0.5567
Epoch 3/8
 - 212s - loss: 0.8234 - acc: 0.5567
Epoch 4/8
 - 225s - loss: 0.5559 - acc: 0.7635
Epoch 5/8
 - 204s - loss: 0.3523 - acc: 0.9409
Epoch 6/8
 - 249s - loss: 0.2550 - acc: 0.8966
Epoch 7/8
 - 230s - loss: 0.1669 - acc: 0.9951
Epoch 8/8
 - 233s - loss: 0.0749 - acc: 0.9901
Score: 0.97
Validation Accuracy: 0.65


array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1])

In [46]:
df = pd.read_csv("../3-Improvement/filtered_by_company_confidence.csv")
df.columns
df['text'] = df.apply(lambda row: "{} {}".format(row['title'], clean_text(str(row['content']))), axis=1)
df.drop(df[df.text.str.len() < 150].index, inplace=True)

In [47]:
companies = df.groupby("company", as_index=False)
rows_list = []
for num, cp in enumerate(list(companies.groups.keys())):
    neg_url = ''
    pos_url = ''
    cp_df = df.loc[df['company'] == cp]
#     cp_df['lemmatized'] = cp_df.apply(lambda row: " ".join(tokenize(row['text'], get_company_names(row[""]))), axis=1)
    urls = list(cp_df['url'])
#     cp_df['text'] = cp_df.apply(lambda row: "{} {}".format(row[''], clean_text(str(row['content']))), axis=1)
    
    res = model.predict_classes(cp_df['text'])
    counter = Counter(res)
    probs = model.predict_proba(cp_df['text'])
    
    negs, pos, neut = zip(*probs)
    neg_url = urls[negs.index(max(negs))]
    pos_url = urls[pos.index(max(pos))]

    rows_list.append({"company": cp, "pos_c":Counter(res)[1], "neg_c": Counter(res)[0], 'pos_p': sum(pos), 'neg_p': sum(negs), 'neg_url': f"<a href='{neg_url}' target='_blank'>Clickme</a>", 'pos_url': f"<a href='{pos_url}' target='_blank'>Clickme</a>"})#, "neg_url": neg_url,"pos_url": pos_url })
    print(f"{cp}: p: {Counter(res)[1]}, n: {Counter(res)[0]},  'pos_p': {sum(pos)}, 'neg_p': {sum(negs)}, neg_url: {neg_url},  pos_url: {pos_url},")
    
final_df =  pd.DataFrame(rows_list) 

ValueError: Error when checking : expected embedding_20_input to have shape (6291,) but got array with shape (1,)

In [49]:
model.predict_classes(cp_df['text'])

ValueError: Error when checking : expected embedding_20_input to have shape (6291,) but got array with shape (1,)

In [44]:
print(clsr(Y_test, y_pred, target_names=[0,1,2]))

ValueError: Mix of label input types (string and number)

In [52]:

X_t = tokenizer.texts_to_sequences(cp_df['text'].values)
X_t = pad_sequences(X_t)


In [54]:
model.predict_classes(X_t)

ValueError: Error when checking : expected embedding_20_input to have shape (6291,) but got array with shape (2253,)