In [1]:
# Preparation
# DataFrame
import pandas as pd
import numpy as np

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Logging: timestamp, level and message for every record at INFO or above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Fetch the NLTK 'stopwords' package.
nltk.download('stopwords')

# Configuration constants
# DATASET
DATASET_COLUMNS = ["ids", 'text', "target"]
# Encoding passed to pd.read_csv when the input file is loaded.
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.9

# TEXT CLEANING
# Raw string so that `\S` reaches the regex engine unchanged; in a plain string
# literal `\S` is an invalid escape sequence that newer Python versions warn about.
# The pattern value itself is identical to the previous one.
# Alternatives: "@" + non-whitespace | "http:"/"https:" + non-whitespace |
# "htt:"/"http:" + one non-whitespace char | runs of non-alphanumerics.
# NOTE(review): the third alternative (`http?:\S`) looks like a typo -- confirm
# before changing it, since it may match what the saved models were built with.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
# Length every tokenised text is padded/truncated to before prediction.
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (lower, upper): scores <= lower are NEGATIVE, scores >= upper are POSITIVE,
# anything in between is NEUTRAL (see decode_sentiment).
SENTIMENT_THRESHOLDS = (0.3, 0.7)

# Paths of the saved model artefacts
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

# Load the saved artefacts from the paths defined in the configuration constants.
w2kmodel = gensim.models.word2vec.Word2Vec.load(WORD2VEC_MODEL)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load tokenizer/encoder pickles that come from a trusted source.
# (`pickle` is already imported with the other utilities at the top of the cell.)
with open(TOKENIZER_MODEL, 'rb') as handle:
    tokenizer = pickle.load(handle)

with open(ENCODER_MODEL, 'rb') as handle:
    encoder = pickle.load(handle)

from keras.models import load_model
# Use the configured path instead of repeating the "model.h5" literal, so the
# model location is changed in one place like the other artefacts.
model = load_model(KERAS_MODEL)

## Test Part
## Import Dataset
# Input file to score, named once so a different batch only needs one edit.
TEST_DATASET_PATH = "test_10_04.csv"
boca_sample = pd.read_csv(TEST_DATASET_PATH, encoding=DATASET_ENCODING)

# Fail fast with a clear message if the columns read by the later steps
# (`ids` for the exported EID, `text` for both sentiment models) are absent.
_missing_columns = {"ids", "text"} - set(boca_sample.columns)
if _missing_columns:
    raise KeyError(
        "%s is missing required column(s): %s"
        % (TEST_DATASET_PATH, sorted(_missing_columns))
    )

def decode_sentiment(score, include_neutral=True):
    """Map a prediction score to a sentiment label.

    Args:
        score: value compared against the thresholds (in this notebook, one
            entry of the model's prediction output).
        include_neutral: when True, use the two cut-offs in
            SENTIMENT_THRESHOLDS and allow a NEUTRAL result; when False,
            split at 0.5 into NEGATIVE / POSITIVE only.

    Returns:
        One of the NEGATIVE, NEUTRAL or POSITIVE label constants.
    """
    # Binary mode: strictly below 0.5 is negative, everything else positive.
    if not include_neutral:
        if score < 0.5:
            return NEGATIVE
        return POSITIVE

    # Three-way mode: both cut-offs are inclusive; the band between them is neutral.
    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

## Data Wrangling & Encoding
# Turn each text into a sequence of token ids, then pad/truncate every
# sequence to SEQUENCE_LENGTH.
convert = pad_sequences(tokenizer.texts_to_sequences(boca_sample.text), maxlen=SEQUENCE_LENGTH)

## Concerning Word Prediction
# `convert` is already padded to SEQUENCE_LENGTH, so it is passed to the model
# as-is; padding it a second time with the same maxlen only made a copy.
sample_s = model.predict(convert)

# One label per prediction row, using the three-way (with NEUTRAL) decoding.
# NOTE(review): each `score` is a row of the prediction array, presumably
# holding a single value -- confirm against the model's output layer.
Flags = [decode_sentiment(score, include_neutral=True) for score in sample_s]

## Exporting EID
# Assemble the result frame in one step: id, original text, and model label.
out = pd.DataFrame({
    'EID': boca_sample['ids'],
    'text': boca_sample['text'],
    'Flag': Flags,
})

## Import Vader model
from nltk.sentiment import vader

# Fetch the lexicon the analyzer needs, then build the analyzer.
nltk.download('vader_lexicon')
analysis = vader.SentimentIntensityAnalyzer()

# A text is treated as negative when VADER's 'neg' score exceeds this cut-off.
VADER_NEG_THRESHOLD = 0.1
# Codes stored in out['Vader']; 0 is the value the export cells test for.
VADER_NEGATIVE = 0
VADER_NOT_NEGATIVE = 4

# Read the text by column name rather than by position (`iloc[i, 1]`), so the
# result does not depend on the column order of `out`.
vader_neg = [
    VADER_NEGATIVE
    if analysis.polarity_scores(text)['neg'] > VADER_NEG_THRESHOLD
    else VADER_NOT_NEGATIVE
    for text in out['text']
]

## Add Vader Prediction
out['Vader'] = vader_neg

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jl67386/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2019-10-04 11:00:22,140 : INFO : loading Word2Vec object from model.w2v
2019-10-04 11:00:22,706 : INFO : loading wv recursively from model.w2v.wv.* with mmap=None
2019-10-04 11:00:22,707 : INFO : setting ignored attribute vectors_norm to None
2019-10-04 11:00:22,709 : INFO : loading vocabulary recursively from model.w2v.vocabulary.* with mmap=None
2019-10-04 11:00:22,710 : INFO : loading trainables recursively from model.w2v.trainables.* with mmap=None
2019-10-04 11:00:22,712 : INFO : setting ignored attribute cum_t

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jl67386/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
## High priority -- the neural network labels the text NEGATIVE and VADER codes it 0
both_negative = (out['Flag'] == "NEGATIVE") & (out['Vader'] == 0)
# Keep one row per distinct EID and discard missing ids before writing.
EIDs = out.loc[both_negative, ['EID']].drop_duplicates().dropna()
EIDs.to_csv("EIDs_high_10_04.csv", index=False)

In [3]:
## Lower priority -- only the neural network labels the text NEGATIVE (VADER code is not 0)
nn_only_negative = out['Flag'].eq("NEGATIVE") & out['Vader'].ne(0)
# Distinct, non-missing EIDs as a one-column frame.
EIDs = out.loc[nn_only_negative, 'EID'].drop_duplicates().to_frame().dropna()
EIDs.to_csv("EIDs_low_10_04.csv", index=False)

In [3]:
## High priority -- rows where the network says NEGATIVE and the VADER code is 0
flagged_by_network = out['Flag'] == "NEGATIVE"
flagged_by_vader = out['Vader'] == 0
unique_ids = out.loc[flagged_by_network & flagged_by_vader, 'EID'].drop_duplicates()
# Wrap in a frame so the CSV gets an "EID" header, then drop missing ids.
EIDs = pd.DataFrame(unique_ids).dropna()
EIDs.to_csv("EIDs_Q39.csv", index=False)

In [9]:
## High priority -- NEGATIVE according to the network and coded 0 by VADER
EIDs = (
    out[(out['Flag'] == "NEGATIVE") & (out['Vader'] == 0)]
    .drop_duplicates(subset='EID')   # first occurrence of each EID
    [['EID']]                        # keep only the id column
    .dropna()                        # discard missing ids
)
EIDs.to_csv("EIDs_xy_high.csv", index=False)

In [7]:
## Lower priority -- the network says NEGATIVE but the VADER code is something other than 0
network_negative = out['Flag'] == "NEGATIVE"
vader_not_negative = ~(out['Vader'] == 0)
candidate_ids = out.loc[network_negative & vader_not_negative, 'EID']
# Remove missing ids, keep the first occurrence of each, and write as one column.
EIDs = candidate_ids.dropna().drop_duplicates().to_frame()
EIDs.to_csv("EIDs_2_slow.csv", index=False)