### This script builds an LSTM model to perform sentiment analysis on tweets about US airline service.

### Input: "train.csv" and "test.csv" generated by "airline.py". They are generated separately so that this model uses the same train and test datasets as the other models. Both files contain the tweets and the user's attitude: negative, neutral or positive.

### Output: predictions

In [1]:
# Mount Google Drive under /content/drive; the CSV files read and written later
# in this notebook live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import re
import string
import nltk
# Fetch the NLTK resources named below (stop-word list, "punkt" tokenizer data,
# perceptron POS tagger, WordNet database) before they are used.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.corpus import stopwords
# NOTE(review): stopWords is not referenced by any active code visible in this
# file; pre_process does not filter stop words.
stopWords = set(stopwords.words('english'))

# NOTE(review): in the visible code these four helpers appear only in
# commented-out cells (or not at all) — confirm before removing.
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle

# Seeds NumPy's global random generator only.
# NOTE(review): presumably the Keras/TensorFlow backend keeps its own random
# state, so training runs may still differ between executions — confirm.
np.random.seed(101)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using TensorFlow backend.


## data pre-processing

In [0]:
# Lemmatization based on words' POS tags
def get_wordnet_pos(treebank_tag):
    """Map a POS tag (as produced by ``pos_tag``) to a WordNet POS constant.

    The tag is matched by its first letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
 # use Wordnet(lexical database) to lemmatize text 
def lemmatize_text(text):
    """Return *text* with every token replaced by its WordNet lemma.

    The input is coerced to ``str``, tokenized, POS-tagged, and each token is
    lemmatized using the WordNet POS derived from its tag. The lemmas are
    joined back together with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(str(text)))

    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# clean and normalize text
# Patterns are compiled once here instead of on every pre_process() call.
# NOTE: the last emoji range (U+24C2-U+1F251) is very wide and also matches
# many non-emoji characters; it is kept unchanged to preserve existing output.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_URL_PATTERN = re.compile(r'https?://[A-Za-z0-9./]+')
_NON_ALNUM_PATTERN = re.compile("[^a-zA-Z0-9]")
_PUNCTUATION = frozenset(string.punctuation)


def pre_process(text):
    """Clean and normalize one tweet, then lemmatize it.

    Steps, in order:
      1. remove characters matched by the emoji pattern;
      2. lowercase;
      3. remove http/https URLs;
      4. delete punctuation and digit characters (nothing is inserted in
         their place, so "del-ewr" becomes "delewr");
      5. replace every remaining character outside [a-zA-Z0-9] with a space;
      6. lemmatize with ``lemmatize_text``.

    Args:
        text: the raw tweet. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as an empty tweet; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned, lemmatized text; ``''`` for a missing value.
    """
    # Missing cell: previously this raised TypeError inside re.sub().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _EMOJI_PATTERN.sub('', text)     # remove emojis
    text = text.lower()                     # lowercase all letters
    text = _URL_PATTERN.sub('', text)       # remove URL links

    # Single pass that drops punctuation and digits (same result as the two
    # separate passes it replaces).
    text = ''.join(ch for ch in text
                   if ch not in _PUNCTUATION and not ch.isdigit())

    # Anything still outside ASCII letters/digits becomes a space.
    text = _NON_ALNUM_PATTERN.sub(' ', text)

    return lemmatize_text(text)             # WordNet-based lemmatization

## import training and test datasets generated by "airline.py"

In [4]:
# Load the train/test splits produced by "airline.py". The split is done
# upstream, so no train_test_split is performed here.
sentiment_to_id = {'negative':0, 'neutral':1, 'positive': 2}


def _load_split(csv_path):
    """Read one split and add the 'processed_text' and 'label_convert' columns.

    Returns the DataFrame together with its integer labels as a NumPy array.
    """
    frame = pd.read_csv(csv_path)
    frame['processed_text'] = frame['text'].apply(pre_process)
    frame['label_convert'] = frame['airline_sentiment'].map(sentiment_to_id)
    return frame, np.array(frame['label_convert'])


train_X, train_Y = _load_split("/content/drive/My Drive/train.csv")
test_X, test_Y = _load_split("/content/drive/My Drive/test.csv")

for features, labels in ((train_X, train_Y), (test_X, test_Y)):
    print(features.shape, labels.shape)


# No oversampling and undersampling
trainX_sampled = train_X
print(trainX_sampled.shape)

train_X[['text', 'processed_text']].head(10)

(11712, 7) (11712,)
(2928, 7) (2928,)
(11712, 7)


Unnamed: 0,text,processed_text
0,@united Hi. My relative's Flight Booking Prob...,united hi my relative flight book problem numb...
1,@AmericanAir served the nastiest food Ive ever...,americanair serve the nasty food ive ever see ...
2,"@united yes it is partly used, the del-ewr is ...",united yes it be partly use the delewr be use ...
3,@JetBlue Not helping since there's a bunch of ...,jetblue not help since there a bunch of u try ...
4,@united Will never fly with you again! Terribl...,unite will never fly with you again terrible s...
5,@USAirways @AmericanAir 2hrs Late Flightr fina...,usairways americanair hrs late flightr finally...
6,@JetBlue THANK YOU! I am your new big fan :),jetblue thank you i be your new big fan
7,@SouthwestAir Seriously? FOUR DELAYS? Only tak...,southwestair seriously four delay only take mi...
8,@AmericanAir still waiting for a flight... I s...,americanair still wait for a flight i should g...
9,@united we are sitting on the runway for 2 hou...,unite we be sit on the runway for hour it be r...


# undersample majority class

In [0]:
#df_major_neg = train_X[train_X['label'] == -1]
#df_minor_neu = train_X[train_X['label'] == 0]
#df_minor_pos = train_X[train_X['label'] == 1]        
#minor_count = len(df_minor_pos)

#df_major_neg_undersampled = resample(df_major_neg, 
#                              replace = True,              # sample with replacement
#                              n_samples = minor_count,     # to match minority class
#                              random_state = 1000)    

#df_minor_neu_undersampled = resample(df_minor_neu, 
#                              replace = True,             
#                              n_samples = minor_count,   
#                              random_state = 1000)      
      
#train_sampled = pd.concat([df_major_neg_undersampled, df_minor_neu_undersampled, df_minor_pos])   # Combine the undersampled negative and neutral classes with the positive (minority) class
#print("Train dataset class distribution: \n", train_sampled.label.value_counts())
#train_sampled = shuffle(train_sampled, random_state = 200) 
#print(trainX_sampled.shape)

# oversample minority class

In [0]:
#df_minor_neu_oversampled = resample(df_minor_neu, 
#                              replace = True,              # sample with replacement
#                              n_samples = major_count,     # to match majority class 
#                              random_state = 1000)    

#df_minor_pos_oversampled = resample(df_minor_pos, 
#                              replace = True,             
#                              n_samples = major_count,   
#                              random_state = 1000)      
      
#train_sampled = pd.concat([df_major_neg, df_minor_neu_oversampled, df_minor_pos_oversampled])   # Combine majority class with oversampled minority class
#print("Train dataset class distribution: \n", train_sampled.label.value_counts())
#train_sampled = shuffle(train_sampled, random_state = 200) 

#print(trainX_sampled.shape)

## Create the vocabulary index based on word frequency, then replace each word in the text with its corresponding integer value from the word_index dictionary.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the word -> integer index on the training text only, then encode both
# splits with that same index.
tokenizer = Tokenizer()
train_texts = trainX_sampled['processed_text']
tokenizer.fit_on_texts(train_texts)

trainX, testX = (
    tokenizer.texts_to_sequences(frame['processed_text'].values)
    for frame in (trainX_sampled, test_X)
)

# One more than the number of indexed words.
# NOTE(review): presumably the extra slot is for index 0, which pad_sequences
# uses as the padding value — confirm.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)


10347


## text padding

In [8]:
# Bring every encoded tweet to the same length so it can be fed to the model.
max_len = 35
trainX_pad, testX_pad = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (trainX, testX)
)
for padded in (trainX_pad, testX_pad):
    print(padded)

[[   0    0    0 ... 4379   63    7]
 [   0    0    0 ... 4382   33  461]
 [   0    0    0 ... 1815   16 1256]
 ...
 [   0    0    0 ...   30  173  741]
 [   0    0    0 ...   16    3  437]
 [   0    0    0 ...    0   14   38]]
[[   0    0    0 ...  131   17  764]
 [   0    0    0 ...  357    8  151]
 [   0    0    0 ...   70   17    1]
 ...
 [   0    0    0 ...    8  110  266]
 [   0    0    0 ...   76   30  108]
 [   0    0    0 ...  110  144 5223]]


## build LSTM model

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import metrics
from keras import regularizers

# Model hyper-parameters read by buildModel().
embed_dim = 16   # passed to Embedding as its second argument (vector size per word)
lstm_out = 8     # passed to LSTM as its first argument (number of units)
def buildModel():
    """Assemble and compile the Embedding -> LSTM -> softmax classifier.

    Reads the module-level ``vocab_size``, ``embed_dim``, ``max_len`` and
    ``lstm_out``. The output layer has 3 units, one per sentiment class.
    The sparse categorical loss/metric is used, which matches integer class
    ids as labels.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Embedding(vocab_size, embed_dim, input_length=max_len),
        LSTM(lstm_out, dropout=0.5),
        Dense(3, activation='softmax'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=[metrics.sparse_categorical_accuracy],
    )
    return network
model = buildModel()
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 16)            165552    
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 800       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 27        
Total params: 166,379
Trainable params: 166,379
Non-trainable params: 0
_________________________________________________________________
None


## model fitting on train data

In [10]:
# Train on the padded training sequences. The labels come from trainX_sampled
# (not train_Y): trainX_pad was built from that same frame, so rows and labels
# stay aligned.
model.fit(trainX_pad,trainX_sampled['label_convert'], epochs = 8, batch_size = 48, verbose = 2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/8





 - 8s - loss: 0.8278 - sparse_categorical_accuracy: 0.6533
Epoch 2/8
 - 7s - loss: 0.6201 - sparse_categorical_accuracy: 0.7392
Epoch 3/8
 - 7s - loss: 0.5280 - sparse_categorical_accuracy: 0.7964
Epoch 4/8
 - 7s - loss: 0.4357 - sparse_categorical_accuracy: 0.8415
Epoch 5/8
 - 7s - loss: 0.3708 - sparse_categorical_accuracy: 0.8642
Epoch 6/8
 - 7s - loss: 0.3222 - sparse_categorical_accuracy: 0.8866
Epoch 7/8
 - 7s - loss: 0.2889 - sparse_categorical_accuracy: 0.8973
Epoch 8/8
 - 7s - loss: 0.2643 - sparse_categorical_accuracy: 0.9054


<keras.callbacks.History at 0x7ff24c85bfd0>

## get model score and overall accuracy

In [11]:
# Evaluate on the held-out test split; "score" receives the first returned
# value and "acc" the second.
score, acc = model.evaluate(testX_pad, test_Y, batch_size=48, verbose=2)
for template, value in (("score: %.2f", score), ("acc: %.2f", acc)):
    print(template % value)

score: 0.59
acc: 0.80


## calculate precision, recall and f1-score

In [12]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
# Raw model outputs for the test split, one row per tweet.
test_Y_pred1 = model.predict(testX_pad)
# Predicted class id = column index of the largest value in each row.
test_Y_pred = test_Y_pred1.argmax(axis=1)
report = classification_report(test_Y, test_Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1817
           1       0.63      0.59      0.61       628
           2       0.79      0.68      0.73       483

    accuracy                           0.80      2928
   macro avg       0.76      0.73      0.74      2928
weighted avg       0.80      0.80      0.80      2928



## calculate confusion matrix

In [13]:
# Confusion matrix: rows are the true class ids, columns the predicted ones;
# margins=True adds the "All" row and column totals.
print(pd.crosstab(test_Y.ravel(), test_Y_pred, rownames = ['True'], colnames = ['Predicted'], margins = True))

Predicted     0    1    2   All
True                           
0          1641  146   30  1817
1           197  371   60   628
2            80   74  329   483
All        1918  591  419  2928


## Store predictions

In [0]:
# Work on a copy so test_X itself is not modified, then attach the predicted
# class id for every test row.
df_result = test_X.copy()
df_result['prediction'] = test_Y_pred.tolist() 

#file_name = 'LSTM_prediction'
# NOTE(review): to_csv is called without an index argument, so presumably the
# DataFrame index is written as an extra first column — confirm this is wanted.
df_result.to_csv("/content/drive/My Drive/LSTM_prediction.csv") # + file_name + '.csv')    