In [128]:
import re
import csv
import random
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from collections import Counter
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)
# Seed TensorFlow through its v1 compatibility API with the same value.
tf.compat.v1.set_random_seed(42)

import os
# Locate the competition CSV files by *name* instead of by their position in
# the directory listing: os.walk returns file names in arbitrary order, so
# positional indexing can silently swap train/test/submission files.
input_root = '/kaggle/input'
wanted_files = ('train.csv', 'test.csv', 'sample_submission.csv')

found_paths = {}
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        if filename in wanted_files:
            # A later directory overrides an earlier one, as before.
            found_paths[filename] = os.path.join(dirname, filename)

# Fail early with a readable message rather than a NameError/IndexError later.
missing_files = [name for name in wanted_files if name not in found_paths]
if missing_files:
    raise FileNotFoundError(
        'Could not find {} under {}'.format(', '.join(missing_files), input_root)
    )

train_csv = found_paths['train.csv']
test_csv = found_paths['test.csv']
submission_csv = found_paths['sample_submission.csv']

print(train_csv)
print(test_csv)
print(submission_csv)

/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/sample_submission.csv


In [129]:
# Read the three competition files into DataFrames.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path) for csv_path in (train_csv, test_csv, submission_csv)
)

# Show the column names of each frame, in train / test / submission order.
for frame in (df_train, df_test, df_submission):
    print(frame.columns.values)

['id' 'keyword' 'location' 'text' 'target']
['id' 'keyword' 'location' 'text']
['id' 'target']


In [130]:
def lemmatization(lemmatizer,sentence):
    """Lemmatize every token and drop repeated lemmas, keeping first-seen order.

    Args:
        lemmatizer: object exposing ``lemmatize(token) -> str``.
        sentence: iterable of string tokens.

    Returns:
        list of unique lemmas in the order they first appear in ``sentence``.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in sentence)
    # dict.fromkeys de-duplicates like set() but preserves insertion order,
    # so the result is deterministic across runs and keeps the word order.
    return list(dict.fromkeys(lemmas))

def remove_stop_words(stopwords_list,sentence):
    """Return the tokens of ``sentence`` that do not occur in ``stopwords_list``.

    The relative order of the surviving tokens is unchanged.
    """
    kept_tokens = []
    for token in sentence:
        if token in stopwords_list:
            continue
        kept_tokens.append(token)
    return kept_tokens

def preprocessed_headline(headlines):
    """Normalise raw text strings into space-joined, cleaned token strings.

    For each headline: lower-case it, split it into ``\\w+`` tokens (which
    drops punctuation), strip digits from every token, discard tokens that
    become empty, lemmatize, and finally remove English stop words.

    Args:
        headlines: iterable of strings.

    Returns:
        numpy array with one cleaned string per input headline.
    """
    # These helpers do not depend on the headline, so build them once instead
    # of once per headline.
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    # A frozenset gives constant-time membership tests in remove_stop_words.
    stopwords_list = frozenset(stopwords.words('english'))

    updated_headlines = []
    for headline in headlines:
        tokens = tokenizer.tokenize(headline.lower())  # Remove punctuation
        tokens = [re.sub('[0-9]', '', token) for token in tokens]  # Remove numbers
        tokens = [token for token in tokens if len(token) > 0]  # Remove empty strings
        lemmatized = lemmatization(lemmatizer, tokens)  # Word lemmatization
        without_stop = remove_stop_words(stopwords_list, lemmatized)  # Remove stop words
        updated_headlines.append(' '.join(without_stop))
    return np.array(updated_headlines)

def get_data():
    """Build the model inputs from the module-level train/test DataFrames.

    The training texts and targets are shuffled together, then both the
    training and the test texts go through ``preprocessed_headline``.

    Returns:
        tuple ``(Xtrain, Ytrain, Xtest)``: cleaned training texts, the
        matching shuffled targets, and cleaned test texts.
    """
    train_text, train_target = shuffle(
        df_train['text'].values,
        df_train['target'].values,
    )
    test_text = df_test['text'].values

    cleaned_train = preprocessed_headline(train_text)
    cleaned_test = preprocessed_headline(test_text)
    return cleaned_train, train_target, cleaned_test

In [131]:
# Path that SentimentAnalyser saves the model to and loads it from; run()
# trains only when nothing exists at this path.
sentiment_weights = '/kaggle/working/model_weights.h5'
# Destination of the CSV written by SentimentAnalyser.predict().
submission_path = '/kaggle/working/lstm_submission.csv'
# Used as Tokenizer(num_words=...) and as the Embedding layer's input_dim.
vocab_size = 20000
# Token the Tokenizer substitutes for out-of-vocabulary words.
oov_tok = '<OOV>'
# Fixed sequence length: pad_sequences(maxlen=...) and Embedding input_length.
max_length = 30
# Value passed as the `truncating` argument of pad_sequences.
trunc_type = 'post'
# Embedding layer output_dim.
embedding_dim = 512
# Arguments forwarded to model.fit().
num_epochs = 10
batch_size = 128
validation_split = 0.2


In [132]:
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that stops training once train accuracy exceeds 99.5%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's 'accuracy' entry and request a stop above 0.995.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: dict of metric values for the epoch; may be None.
        """
        # `logs` defaults to None instead of a shared mutable dict.
        accuracy = (logs or {}).get('accuracy')
        # The metric can be missing (e.g. reported under a different key);
        # comparing None with a float would raise TypeError, so skip then.
        if accuracy is not None and accuracy > 0.995:
            print("\nReached 99.5% train accuracy.So stop training!")
            self.model.stop_training = True

class SentimentAnalyser:
    """Tokenise the text data, then train or load a BiLSTM binary classifier.

    Typical use: construct, call ``run()`` and then ``predict()``.
    """

    def __init__(self):
        """Fetch the preprocessed train/test data via ``get_data()``."""
        Xtrain, Ytrain, Xtest = get_data()
        self.Xtrain = Xtrain
        self.Ytrain = Ytrain
        self.Xtest  = Xtest

    def tokenizing_data(self):
        """Fit a Tokenizer on the training text and pad train/test sequences.

        Sets ``self.Xtrain_pad``, ``self.Xtest_pad`` and ``self.tokenizer``.
        """
        tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
        # The vocabulary is learned from the training text only.
        tokenizer.fit_on_texts(self.Xtrain)

        Xtrain_seq = tokenizer.texts_to_sequences(self.Xtrain)
        self.Xtrain_pad = pad_sequences(Xtrain_seq, maxlen=max_length, truncating=trunc_type)

        Xtest_seq  = tokenizer.texts_to_sequences(self.Xtest)
        # Truncate test sequences on the same side as the training sequences;
        # leaving `truncating` at its default would cut long texts at the
        # opposite end from the one the model was trained on.
        self.Xtest_pad = pad_sequences(Xtest_seq, maxlen=max_length, truncating=trunc_type)
        self.tokenizer = tokenizer

    def embedding_model(self):
        """Build the (uncompiled) network and store it in ``self.model``.

        Layers: Embedding -> two stacked bidirectional LSTMs -> dense head
        with batch normalisation -> single sigmoid output unit.
        """
        model = Sequential()
        model.add(Embedding(output_dim=embedding_dim, input_dim=vocab_size, input_length=max_length))
        # return_sequences=True so the second LSTM receives the full sequence.
        model.add(Bidirectional(LSTM(256, return_sequences=True)))
        model.add(Bidirectional(LSTM(128)))
        model.add(Dense(256, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(64, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        self.model = model

    def load_model(self):
        """Load the model stored at ``sentiment_weights`` and compile it."""
        # Inside this method `load_model` resolves to the module-level Keras
        # function, not to this method.
        loaded_model = load_model(sentiment_weights)
        loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model = loaded_model

    def train_model(self):
        """Compile ``self.model`` and fit it on the padded training data.

        Uses ``myCallback`` to stop early when the train accuracy is high
        enough; the last ``validation_split`` fraction is held out by Keras.
        """
        callbacks = myCallback()
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model.summary()
        self.model.fit(
            self.Xtrain_pad,
            self.Ytrain,
            batch_size=batch_size,
            epochs=num_epochs,
            validation_split=validation_split,
            callbacks= [callbacks]
            )

    def save_model(self):
        """Save ``self.model`` to ``sentiment_weights``."""
        self.model.save(sentiment_weights)

    def predict(self):
        """Predict test labels and write them to ``submission_path``.

        Probabilities above 0.5 become label 1, the rest label 0. The labels
        are stored in the ``target`` column of the module-level
        ``df_submission`` frame, which is then written out as CSV.
        """
        P = self.model.predict(self.Xtest_pad).squeeze()
        Ypred = (P > 0.5)
        df_submission['target'] = Ypred.astype(int)
        df_submission.to_csv(submission_path, index=False)

    def run(self):
        """Tokenise the data, then load the saved model or train a new one.

        If a file exists at ``sentiment_weights`` it is loaded; otherwise the
        network is built, trained and saved to that path.
        """
        self.tokenizing_data()
        if os.path.exists(sentiment_weights):
            print("Loading Model")
            self.load_model()
        else:
            print("Saving Model")
            self.embedding_model()
            self.train_model()
            self.save_model()
            
        

In [133]:
# Build the analyser (its constructor fetches the preprocessed data via get_data()).
model = SentimentAnalyser()
# Tokenise, then load the saved model if one exists, otherwise train and save.
model.run()
# Write the test-set predictions to the submission CSV.
# NOTE(review): the recorded output below this cell shows a metric list and a
# boolean array that no print in the visible code emits — presumably left over
# from an earlier version of the cell; re-run to refresh.
model.predict()

Loading Model
[0.2900514602661133, 0.9473269581794739]
[False False  True ...  True  True  True]
