In [None]:
%tensorflow_version 1.15

# Recurrent Neural Network: Airline Sentiment

## Import dependencies

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

## Data preprocessing

### Get dataset

In [None]:
!wget 'https://raw.githubusercontent.com/fahmisalman/Sentiment-Analytics/master/dataset/Tweets.csv'

### Load dataset

In [None]:
df = pd.read_csv('Tweets.csv')

In [None]:
# Show top-5 row in dataset
df.head()

In [None]:
# Model input (tweet text) and target (sentiment label) columns
text = df['text']
label = df['airline_sentiment']

# Sentiment classes. A label's position in this list is used as its class index.
# (No `global` statement is needed: a cell-level assignment is already module-global.)
class_label = ['positive', 'neutral', 'negative']

In [None]:
# Hold out 30% of the samples as test data; stratify on the label so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    label,
    test_size=0.3,
    random_state=1,
    stratify=label,
)

In [None]:
# displays the contents of the first text
X_train[0]

In [None]:
# displays the contents of the first class
y_train[0]

### Tokenizer

In [None]:
# Convert sentence into tokens
num_words = 10000  # keep only the most frequent words when converting texts to sequences
tokenizer = Tokenizer(num_words=num_words)
# Build the vocabulary from the training texts only, so that nothing about the
# held-out test tweets leaks into preprocessing.
tokenizer.fit_on_texts(X_train)

In [None]:
x_train_tokens = tokenizer.texts_to_sequences(X_train)

In [None]:
x_train_tokens[0]

In [None]:
# NOTE(review): illustrative literal only — nothing is assigned here, and each
# `...` evaluates to Python's Ellipsis object. Presumably a sketch of a sparse
# one-hot style vector; confirm the intent or move it into a markdown cell.
[0, 0, 0, 0, 0, 0, ..., 1, 0, 0, 0, 0, ..., 0]

In [None]:
# Invert the tokenizer vocabulary (word -> index) into an index -> word lookup
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}


def sequence_to_text(list_of_indices):
    """Return the word for each token index (None when an index is not in the lookup)."""
    return [reverse_word_map.get(token_id) for token_id in list_of_indices]


# displays the word for each token in the first sentence
sequence_to_text(x_train_tokens[0])

In [None]:
# Longest sentence (in tokens) in the training data
print('Maximum review length: {}'.format(max(map(len, x_train_tokens))))

# Shortest sentence (in tokens) in the training data
print('Minimum review length: {}'.format(min(map(len, x_train_tokens))))

In [None]:
# give padding for each sentence so that it has the same token length
x_train_tokens = pad_sequences(x_train_tokens, maxlen=100, padding='pre', truncating='pre')

In [None]:
x_train_tokens[0]

### One hot encoder

In [None]:
def one_hot_encoder(y, classes=None):
    """One-hot encode a sequence of class labels.

    Args:
        y: Sized iterable of labels (e.g. a list of sentiment strings). It is
            iterated positionally, so a pandas Series with a non-default index
            is handled the same way as a list.
        classes: Ordered sequence of all possible labels; column ``j`` of the
            result corresponds to ``classes[j]``. Defaults to the notebook-level
            ``class_label`` list.

    Returns:
        A float numpy array of shape ``(len(y), len(classes))`` holding a
        single 1 per row.

    Raises:
        ValueError: If a label in ``y`` is not present in ``classes``.
    """
    if classes is None:
        classes = class_label
    classes = list(classes)
    label = np.zeros([len(y), len(classes)])
    for row, value in enumerate(y):
        label[row][classes.index(value)] = 1
    return label

In [None]:
list(y_train)[0]

In [None]:
y_train_encoder = one_hot_encoder(list(y_train))
y_train_encoder[0]

## Build the model

![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/6/63/Long_Short-Term_Memory.svg/1920px-Long_Short-Term_Memory.svg.png)

Long Short Term Memory architecture

Source: https://en.wikipedia.org/wiki/Recurrent_neural_network

### Define the model

In [None]:
max_tokens = 100      # sequence length; must match the `maxlen` used when padding
embedding_size = 250  # size of each learned word vector

model = Sequential()
# Token ids -> dense vectors
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='Embedding_layer'))
model.add(LSTM(units=16, name='LSTM_layer'))
# One softmax output per sentiment class
model.add(Dense(len(class_label), activation='softmax', name='Output_layer'))
# The targets are one-hot vectors over mutually exclusive classes, so the
# matching loss is categorical cross-entropy. With binary cross-entropy every
# output unit is scored as an independent yes/no problem, which mis-specifies
# the objective and inflates the reported accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
y_train_encoder = np.array(y_train_encoder)

In [None]:
# Print the layer-by-layer summary of the model built above
model.summary()

### Fitting model

In [None]:
# Train for 5 epochs, holding out 20% of the training data for validation
model.fit(
    x=x_train_tokens,
    y=y_train_encoder,
    epochs=5,
    validation_split=0.2,
)

## Save model

In [None]:
import pickle

In [None]:
with open('tokenizer.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
model.save('model.h5')

## Load model

In [None]:
# Use the public Keras API (same package as the other Keras imports in this
# notebook) rather than the private `tensorflow.python` namespace.
from tensorflow.keras.models import load_model

In [None]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
model = load_model('model.h5')

## Test the model

In [None]:
# Tokenize the test sentences, then pad/truncate them at the front to 100 tokens
x_test_tokens = tokenizer.texts_to_sequences(texts=X_test)
x_test_tokens = pad_sequences(
    sequences=x_test_tokens,
    maxlen=100,
    padding='pre',
    truncating='pre',
)

In [None]:
y_test = one_hot_encoder(list(y_test))

In [None]:
scores = model.evaluate(x_test_tokens, y_test, verbose=0)
print('Test accuracy:', scores[1])

## Predict sentence

In [None]:
def predict(sent):
    """Classify the sentiment of a single sentence.

    Args:
        sent: Raw text of one sentence/tweet.

    Returns:
        The predicted label, i.e. one of the entries of ``class_label``.
    """
    tokens = tokenizer.texts_to_sequences([sent])
    # Same padding settings as used for the training and test sequences
    padded = pad_sequences(tokens, maxlen=100, padding='pre', truncating='pre')
    # Take the arg-max over the class probabilities. This is what
    # Sequential.predict_classes did for a softmax output, but that method is
    # deprecated and no longer exists in newer TensorFlow releases.
    probabilities = model.predict(padded)
    return class_label[int(np.argmax(probabilities, axis=-1)[0])]

In [None]:
# Example tweet to classify with the trained model
sent = '''@United rescheduled my return flight from #Japan?  Uh why?  Trying to call #UnitedAirlines #customerservice - 45 min wait.  #fun'''
predict(sent)