In [44]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import seaborn as sns
import numpy as np
import nltk
nltk.download('wordnet')
import re
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package wordnet to /Users/a0j01no/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Reading dataset and applying preprocessing such as:
* Cleaning by removing links, special characters, etc
* Removing Stop words
* Stemming or Lemmatization
* Tokenizing, that is converting from text to sequence

In [5]:
tweets = pd.read_csv("Tweets-1.csv")

In [6]:
X = list(tweets.text.values)
Y = list(tweets.airline_sentiment.values)

In [7]:
#Checking if all are marked sentiments
len(X) == len(Y)

True

In [8]:
def preprocess(document, stem=False):
    document = document.lower()
    
    words = word_tokenize(document)
    
    words = [word for word in words if word not in stopwords.words("english")]
    
    if stem:
        words = [PorterStemmer().stem(word) for word in words]
    else:
        words = [WordNetLemmatizer().lemmatize(word, pos='v') for word in words]
    
    document = " ".join(words)
    
    return document

In [9]:
all_words = []
message_set = []
for s,l in zip(X,Y):
    #print(s)
    temp = s
    temp = re.sub(r'@[A-Za-z]+', '', temp)                                # removing words with @ signs
    temp = re.sub(r'[^\x00-\x7F]+', '', re.sub(r"http\S+", "", temp))     # removing emoji's
    temp = re.sub(r'[^\w\s]', '', temp)                                   # removing punctuations
    temp = re.sub(r'[0-9]+', '', temp)                                    # removing numbers
    filterd_words = [word for word in preprocess(temp).split() if len(word) > 3]
    if len(filterd_words) == 0:                             # removing the messages which have no words in it
        continue
    message_set.append((filterd_words, l))
    all_words.extend(filterd_words)

In [10]:
def get_word_features(all_words):
    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    return word_features

In [11]:
len(get_word_features(all_words))

9664

In [12]:
idx_80 = int(len(message_set)*0.8)

In [13]:
training_messages = message_set[:idx_80]
test_messages = message_set[idx_80:]

In [14]:
word_features = get_word_features(all_words)

In [18]:
training_set = nltk.apply_features(extract_features, training_messages)
testing_set = nltk.apply_features(extract_features, test_messages)

In [38]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

In [17]:
tokenizer = Tokenizer(num_words=2000)

In [16]:
training_messages[0]

(['plus', 'youve', 'commercials', 'experience', 'tacky'], 'positive')

In [27]:
tokenizer.fit_on_texts([" ".join(words) for (words, sentiment) in training_messages])

In [28]:
X = tokenizer.texts_to_sequences([" ".join(words) for (words, sentiment) in training_messages])

In [30]:
X = pad_sequences(X)
Y = pd.get_dummies([sentiment for (words, sentiment) in training_messages])

In [37]:
len(X) == len(Y)
X.shape[1]

16

### Creating a LSTM model with some dropouts and training

In [125]:
model = Sequential()
model.add(Embedding(2000, 128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 16, 128)           256000    
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 16, 128)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [126]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

In [127]:
model.fit(X_train, Y_train, epochs=10, batch_size=8, verbose=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x160971d90>

In [128]:
pred = model.predict(X_test)

In [129]:
Y_test.shape

(2326, 3)

In [130]:
pred.shape

(2326, 3)

### Let's do comparison of the prediction and actual result

In [131]:
res = 0
for x in range(0, pred.shape[0]):
    temp = Y_test.reset_index(drop=True).iloc[x, :].values
    #print(temp)
    if list(pred[x]).index(np.max(pred[x])) == list(temp).index(np.max(temp)):
        res = res + 1

In [132]:
res/pred.shape[0]

0.7510748065348237

### Training accuracy was 93% while Test accuracy is 75% seems like overfitting