In [1]:
import numpy as np 
import tensorflow as tf
import pandas as pd 
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import random
from tensorflow import set_random_seed
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam,SGD

from keras.models import Sequential
from tqdm import tqdm
import warnings
# Silence UserWarning messages originating from the bs4 module.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
# Single lemmatizer instance reused by the text-cleaning code below.
lemmatizer = WordNetLemmatizer()
# Seed every random number generator in play so reruns are comparable.
# NumPy was previously left unseeded even though Keras relies on it
# (e.g. for shuffling batches) in addition to TensorFlow's own generator.
np.random.seed(123)
set_random_seed(123)
random.seed(123)

Using TensorFlow backend.


In [2]:
# Column names supplied to read_csv through `names=`.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
# os.listdir returns entries in arbitrary order; sort before taking the first
# one so the chosen file is deterministic if the directory ever holds several.
dataset_filename = sorted(os.listdir("../input"))[0]
dataset_path = os.path.join("..","input",dataset_filename)
print("Open file:", dataset_path)
train = pd.read_csv(dataset_path, encoding =DATASET_ENCODING , names=DATASET_COLUMNS)
train.head()


Open file: ../input/training.1600000.processed.noemoticon.csv


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
def clean_sentences(df):
    """Turn the ``text`` column of ``df`` into lists of lemmatized tokens.

    Each entry has every non-letter character replaced by a space, is
    lower-cased, split with ``word_tokenize`` and lemmatized token by token.

    Side effect: the result is stored on ``df`` as the ``review_text`` column.

    Returns:
        list: one list of tokens per row of ``df``, in row order.
    """
    def _to_lemmas(raw_text):
        letters_only = re.sub("[^a-zA-Z]"," ", raw_text)
        tokens = word_tokenize(letters_only.lower())
        return [lemmatizer.lemmatize(token) for token in tokens]

    df['review_text'] = df['text'].apply(_to_lemmas)
    return list(df['review_text'])

# Clean every row of the loaded dataset; the train/test split is made later from these token lists.
train_sentences = clean_sentences(train)

In [6]:
# Map the raw target values to consecutive integer class ids and shape them
# as a single-column array (one row per sample).
encoder = LabelEncoder()
y_target = encoder.fit_transform(train.target.tolist()).reshape(-1, 1)

# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train_sentences,
    y_target,
    random_state=2,
    test_size=0.25,
)

In [7]:
# Collect the vocabulary and the longest sentence length (in tokens),
# both measured on the training split only.
unique_words = set()
len_max = 0

for sent in tqdm(x_train):
    unique_words |= set(sent)
    len_max = max(len_max, len(sent))

# Vocabulary size, then the longest training sentence.
print(len(unique_words))
print(len_max)

100%|██████████| 1200000/1200000 [00:02<00:00, 437291.38it/s]

476286
53





In [8]:
# Build the word index from the training split only, then turn both splits
# from token lists into sequences of integer word ids.
# NOTE(review): Keras' Tokenizer appears to keep only indices below num_words,
# so the single least frequent word is likely dropped here. Raising num_words
# would also require a matching Embedding input size. TODO confirm.
tokenizer = Tokenizer(num_words=len(unique_words))
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# The LSTM needs fixed-length inputs, so every sequence is brought to len_max.
# With pad_sequences' defaults the zeros are inserted at the FRONT of shorter
# sequences, and longer sequences are truncated from the front as well.
x_train = pad_sequences(x_train, maxlen=len_max)
x_test = pad_sequences(x_test, maxlen=len_max)
print(x_train.shape,x_test.shape)

(1200000, 53) (400000, 53)


In [9]:
# Early-stopping callback: watches validation accuracy (higher is better),
# requires an improvement of at least 0.001, and tolerates 2 stale epochs.
early_stopping = EarlyStopping(monitor='val_acc', mode='max', min_delta=0.001, patience=2)
callback = [early_stopping]

# Embedding -> dropout -> LSTM -> one sigmoid unit producing a score in [0, 1].
model = Sequential([
    Embedding(len(unique_words), 300, input_length=len_max),
    Dropout(0.2),
    LSTM(128, dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer=Adam(lr=0.005), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 53, 300)           142885800 
_________________________________________________________________
dropout_1 (Dropout)          (None, 53, 300)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 143,105,577
Trainable params: 143,105,577
Non-trainable params: 0
_________________________________________________________________


In [12]:
# callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
#               EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

In [11]:
# Train the model, holding out 20% of the training arrays for validation.
# The early-stopping callback list defined with the model was previously never
# passed in; it is wired up here so it takes effect when `epochs` is raised.
history = model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=1,
                    validation_split=0.2,
                    callbacks=callback,
                    verbose=1)
# Measure loss and accuracy on the held-out test split.
(loss, accuracy) = model.evaluate(x_test, y_test, batch_size=1024, verbose=1)
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))
# Persist the trained model to an HDF5 file in the working directory.
model.save('my_sentiment_model.h5')

  num_elements)


Train on 960000 samples, validate on 240000 samples
Epoch 1/1
[INFO] loss=0.3843, accuracy: 82.6352%


In [13]:
# Score cut-offs: at or below the first value is NEGATIVE, at or above the
# second is POSITIVE, anything in between is NEUTRAL.
SENTIMENT_THRESHOLDS=np.array([0.4,0.7])
def decode_sentiment(score, include_neutral=True):
    """Translate a model score into a sentiment label.

    Args:
        score: numeric score (a float or a one-element array).
        include_neutral: when True, use SENTIMENT_THRESHOLDS to produce a
            three-way label; when False, split at 0.5 into two labels.

    Returns:
        str: "NEGATIVE", "NEUTRAL" or "POSITIVE".
    """
    if include_neutral:
        label = "NEUTRAL"
        if score <= SENTIMENT_THRESHOLDS[0]:
            label = "NEGATIVE"
        elif score >= SENTIMENT_THRESHOLDS[1]:
            label = "POSITIVE"

        return label
    else:
        # Return string labels: the bare names NEGATIVE / POSITIVE used here
        # before were never defined and raised NameError.
        return "NEGATIVE" if score < 0.5 else "POSITIVE"

In [16]:
def predict(text, include_neutral=True):
    """Score a single raw text with the trained model and label its sentiment.

    Args:
        text: the raw input string.
        include_neutral: forwarded to decode_sentiment.

    Returns:
        dict: {"label": <sentiment label>, "score": <float model output>}.
    """
    # Apply the same cleaning as clean_sentences (letters only, lower-case,
    # word_tokenize, lemmatize). The tokenizer was fitted on lemmatized tokens,
    # so raw text would silently lose every word whose surface form differs
    # from its lemma.
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = [lemmatizer.lemmatize(token) for token in word_tokenize(letters_only.lower())]
    # Convert to word ids and pad to the length the model was built for.
    padded = pad_sequences(tokenizer.texts_to_sequences([tokens]), maxlen=len_max)
    # One input row and one sigmoid unit give a (1, 1) output; take the scalar
    # explicitly instead of calling float() on a one-element array.
    score = float(model.predict(padded)[0][0])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score}

In [20]:
# Spot-check the model on a few hand-written sentences.
for sample_text in ("I am so happy", "Oh! No", "I am going out"):
    print(predict(sample_text))

{'label': 'POSITIVE', 'score': 0.9787561893463135}
{'label': 'NEGATIVE', 'score': 0.042689982801675797}
{'label': 'NEUTRAL', 'score': 0.6324208378791809}
