<a href="https://colab.research.google.com/github/aithaprasad/NLP_Sentiment_Analysis/blob/master/Sentimental_Analysis_any_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

**Reading the data from the tsv file and naming the columns**

In [2]:
import pandas as pd
import numpy as np
data = pd.read_csv('train.tsv', delimiter="\t", header = None, names = ['label', 'sentence'])
test_data = pd.read_csv('train.tsv', delimiter="\t", header = None, names = ['label', 'sentence'])

In [3]:
data.head() #just making sure data got read and labelled correctly

Unnamed: 0,label,sentence
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?


In [4]:
data.isnull().values.any()

False

In [5]:
data.shape

(78609, 2)

In [4]:
X, y = data['sentence'], data['label']

In [5]:
X_final_test, y_final_test = test_data['sentence'], test_data['label']

**Splitting the dataset into train, dev and test samples**

In [5]:
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.20, random_state=42)

In [6]:
X_dev, X_test, y_dev, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=42)

**Tokenizing words and make it fit on our training data and utilized that tokenizer to compute dev and test sets.**

In [7]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)

X_train_tok = tokenizer.texts_to_sequences(X_train)
X_dev_tok = tokenizer.texts_to_sequences(X_dev)
X_test_tok = tokenizer.texts_to_sequences(X_test)

In [6]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X)

X_tok = tokenizer.texts_to_sequences(X)
#X_dev_tok = tokenizer.texts_to_sequences(X_dev)
X_final_test_tok = tokenizer.texts_to_sequences(X_final_test)

In [10]:
X_train_tok[60000]

[100, 60, 2777, 27, 2, 791, 2764, 5, 317]

In [12]:
for item in X_train_tok:
  if 49999 in item:
    print("True")

True


In [13]:
len(tokenizer.word_index)

61245

**Currently, we have different length of sentences(w.r.t number of words), so we are gonna pad all those sentences with less than 100 words**

In [8]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train_pad = pad_sequences(X_train_tok, padding='post', maxlen=maxlen)
X_dev_pad = pad_sequences(X_dev_tok, padding='post', maxlen=maxlen)
X_test_pad = pad_sequences(X_test_tok, padding='post', maxlen=maxlen)

In [7]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_pad = pad_sequences(X_tok, padding='post', maxlen=maxlen)
#X_dev_pad = pad_sequences(X_dev_tok, padding='post', maxlen=maxlen)
X_final_test_pad = pad_sequences(X_final_test_tok, padding='post', maxlen=maxlen)

**Started with simple neural network which consists of an embedding layer and the dense layer with sigmoid activation and also added a dropout layer because of the overfitting**

In [8]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, input_length=maxlen)
model.add(embedding_layer)
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          7029700   
                                                                 
 dropout (Dropout)           (None, 100, 100)          0         
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 1)                 10001     
                                                                 
Total params: 7,039,701
Trainable params: 7,039,701
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
history = model.fit(X_pad, y, batch_size=128, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
history = model.fit(X_train_pad, y_train, batch_size=128, epochs=3, verbose=1, validation_data=(X_dev_pad, y_dev))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
score = model.evaluate(X_final_test_pad, y_final_test, verbose=1)



In [11]:
score = model.evaluate(X_test_pad, y_test, verbose=1)



In [11]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.18391725420951843
Test Accuracy: 0.9465837478637695


In [15]:
print(score)

[0.18391725420951843, 0.9465837478637695]


In [18]:
from sklearn import metrics
import numpy as np

y_final_test_predicted_labels = model.predict(X_final_test_pad)
y_final_test_predicted_labels = (y_final_test_predicted_labels > 0.5).astype(int)

final_test_accuracy = metrics.accuracy_score(y_final_test, y_final_test_predicted_labels)
final_test_precision = metrics.precision_score(y_final_test, y_final_test_predicted_labels, pos_label = 1)
final_test_recall = metrics.recall_score(y_final_test, y_final_test_predicted_labels, pos_label = 1)
final_test_f1score = metrics.f1_score(y_final_test, y_final_test_predicted_labels, pos_label = 1)
final_test_auc_score = metrics.roc_auc_score(y_final_test, y_final_test_predicted_labels)

In [20]:
print(final_test_accuracy)
print(final_test_precision)
print(final_test_recall)
print(final_test_f1score)
print(final_test_auc_score)

0.9465837245099161
0.9436502323701758
0.9499605888784357
0.9467948961619846
0.9465816182681385


In [14]:
model.save("neural_network_with_regularization.h5")

In [12]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.5574179291725159
Test Accuracy: 0.7323495745658875


**Tried with CNN model, added a convolutional 1D network and played trial and error with count in dense layers and number of neurons but ended uo with a Conv1D, and then regularization with dropout and a global max pooling layer**

In [18]:
from keras.layers import Conv1D
cnn_model = Sequential()

embedding_layer = Embedding(vocab_size, 100, input_length=maxlen)
cnn_model.add(embedding_layer)

cnn_model.add(Conv1D(128, 5, activation='relu'))
model.add(Dropout(0.2))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(1, activation='sigmoid'))
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [19]:
cnn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          6124600   
                                                                 
 conv1d_1 (Conv1D)           (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 6,188,857
Trainable params: 6,188,857
Non-trainable params: 0
_________________________________________________________________


In [20]:
cnn_history = cnn_model.fit(X_train_pad, y_train, batch_size=128, epochs=1, verbose=1, validation_data=(X_dev_pad, y_dev))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [21]:
score = cnn_model.evaluate(X_test_pad, y_test, verbose=1)



In [22]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 1.2805395126342773
Test Accuracy: 0.6972395181655884


**Used RNN with LSTM(Long short-term memory), with the same embedding layer and activation, but got under-fitting, tried by incresing the complexity but the results went down.**

In [15]:
from keras.layers import LSTM
lstm_model = Sequential()
embedding_layer = Embedding(vocab_size, 100, input_length = maxlen)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [16]:
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          6124600   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 6,241,977
Trainable params: 6,241,977
Non-trainable params: 0
_________________________________________________________________


In [17]:
lstm_history = lstm_model.fit(X_train_pad, y_train, batch_size=128, epochs=5, verbose=1, validation_data=(X_dev_pad, y_dev))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
score = lstm_model.evaluate(X_test_pad, y_test, verbose=1)



In [19]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.693200945854187
Test Accuracy: 0.49484798312187195


**Need to check with recall, precision and F-1 Score**

**Reference: https://stackabuse.com/python-for-nlp-word-embeddings-for-deep-learning-in-keras/** used for embedding layer and tokenization.