## Author: Aayush Mittal, Last edited: 2/8/21

In [1]:
import pandas as pd
import re
import nltk
import numpy as np
from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers.embeddings import Embedding
from tensorflow.keras.models import Model
import string
from sklearn.preprocessing import LabelBinarizer
import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.base import BaseEstimator, TransformerMixin

from keras.models import Sequential
from keras import layers
from tensorflow.keras.optimizers import RMSprop,Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [2]:
#importing the dataset and performing initial data preprocessing

df = pd.read_csv('airline_sentiment_analysis.csv')
df = df.loc[:,df.columns!="Unnamed: 0"]
df['airline_sentiment'] = df['airline_sentiment'].map({'positive': 1, 'negative': 0})

In [3]:
# This class removes @mentions, URLs, punctuation and digits, lower-cases the text, and drops stop words

class CleanText(BaseEstimator, TransformerMixin):
    def remove_mentions(self, input_text):
        return re.sub(r'@\w+', '', input_text)
    
    def remove_urls(self, input_text):
        return re.sub(r'http.?://[^\s]+[\s]?', '', input_text)
        
    def remove_punctuation(self, input_text):
        # Make translation table
        punct = string.punctuation
        trantab = str.maketrans(punct, len(punct)*' ')  # Every punctuation symbol will be replaced by a space
        return input_text.translate(trantab)

    def remove_digits(self, input_text):
        return re.sub('\d+', '', input_text)
    
    def to_lower(self, input_text):
        return input_text.lower()
    
    def remove_stopwords(self, input_text):
        stopwords_list = stopwords.words('english')
        # Some words which might indicate a certain sentiment are kept via a whitelist
        whitelist = ["n't", "not", "no"]
        words = input_text.split() 
        clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
        return " ".join(clean_words) 
    
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        clean_X = X.apply(self.remove_mentions).apply(self.remove_urls).apply(self.remove_punctuation).apply(self.remove_digits).apply(self.to_lower).apply(self.remove_stopwords)
        return clean_X

In [4]:
#seperating text and sentiment

ct = CleanText()
sr_clean = ct.transform(df.text)
df.text=sr_clean

text = df['text']

texts = []
for i in range(len(text)):
  texts.append(text[i])

y = df['airline_sentiment']
y=y.values

In [5]:
seq_lengths = df['text'].apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    11541.000000
mean         9.307079
std          3.773709
min          1.000000
25%          7.000000
50%         10.000000
75%         12.000000
max         21.000000
Name: text, dtype: float64

In [6]:
max_words = 40000
max_len = 21

tokenizer = Tokenizer(nb_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=max_len)

Found 9970 unique tokens.


In [7]:
le = LabelEncoder()
y = le.fit_transform(df['airline_sentiment'])
y = to_categorical(np.asarray(y))

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.1, random_state = 40)

In [9]:
#lstm model
model1 = Sequential()
model1.add(layers.Embedding(max_words, 21))
model1.add(layers.LSTM(15,dropout=0.5))
model1.add(layers.Dense(2,activation='softmax'))
model1.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])
#Implementing model checkpoins to save the best metric and do not lose it on training.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model1.fit(X_train, y_train, epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint1])

Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.87532, saving model to best_model1.hdf5
Epoch 2/20

Epoch 00002: val_accuracy improved from 0.87532 to 0.89697, saving model to best_model1.hdf5
Epoch 3/20

Epoch 00003: val_accuracy did not improve from 0.89697
Epoch 4/20

Epoch 00004: val_accuracy improved from 0.89697 to 0.91169, saving model to best_model1.hdf5
Epoch 5/20

Epoch 00005: val_accuracy did not improve from 0.91169
Epoch 6/20

Epoch 00006: val_accuracy did not improve from 0.91169
Epoch 7/20

Epoch 00007: val_accuracy improved from 0.91169 to 0.91515, saving model to best_model1.hdf5
Epoch 8/20

Epoch 00008: val_accuracy did not improve from 0.91515
Epoch 9/20

Epoch 00009: val_accuracy did not improve from 0.91515
Epoch 10/20

Epoch 00010: val_accuracy did not improve from 0.91515
Epoch 11/20

Epoch 00011: val_accuracy did not improve from 0.91515
Epoch 12/20

Epoch 00012: val_accuracy did not improve from 0.91515
Epoch 13/20

Epoch 00013: val_accuracy did n

In [10]:
# bidirectional lstm model
model2 = Sequential()
model2.add(layers.Embedding(max_words, 40, input_length=max_len))
model2.add(layers.Bidirectional(layers.LSTM(20,dropout=0.6)))
model2.add(layers.Dense(2,activation='softmax'))
model2.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])
#Implementing model checkpoins to save the best metric and do not lose it on training.
checkpoint2 = ModelCheckpoint("best_model2.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model2.fit(X_train, y_train, epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint2])

Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.89524, saving model to best_model2.hdf5
Epoch 2/20

Epoch 00002: val_accuracy improved from 0.89524 to 0.90563, saving model to best_model2.hdf5
Epoch 3/20

Epoch 00003: val_accuracy did not improve from 0.90563
Epoch 4/20

Epoch 00004: val_accuracy improved from 0.90563 to 0.91169, saving model to best_model2.hdf5
Epoch 5/20

Epoch 00005: val_accuracy improved from 0.91169 to 0.91515, saving model to best_model2.hdf5
Epoch 6/20

Epoch 00006: val_accuracy did not improve from 0.91515
Epoch 7/20

Epoch 00007: val_accuracy did not improve from 0.91515
Epoch 8/20

Epoch 00008: val_accuracy did not improve from 0.91515
Epoch 9/20

Epoch 00009: val_accuracy improved from 0.91515 to 0.92035, saving model to best_model2.hdf5
Epoch 10/20

Epoch 00010: val_accuracy did not improve from 0.92035
Epoch 11/20

Epoch 00011: val_accuracy did not improve from 0.92035
Epoch 12/20

Epoch 00012: val_accuracy did not improve from 0.92035
Epoch 

In [11]:
best_model = keras.models.load_model("best_model2.hdf5")

In [12]:
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

37/37 - 1s - loss: 0.2186 - accuracy: 0.9203
Model accuracy:  0.9203463196754456


In [13]:
predictions = best_model.predict(X_test)

In [14]:
sentiment = ['Negative','Positive']

In [15]:
#testing different examples
sequence = tokenizer.texts_to_sequences(['this experience has been the worst , want my money back'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]

'Negative'

In [16]:
sequence = tokenizer.texts_to_sequences(['this article is the best ever'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]

'Positive'

In [17]:
sequence = tokenizer.texts_to_sequences(['i really loved how the technician helped me with the issue that i had'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]

'Positive'