### Custom Neural Network with Sentence-T5 based Transformer for text Encoding

In [2]:
import numpy as np 
import pandas as pd 
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv
/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip


### Loadind the data

In [3]:
train = pd.read_table('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip')

## Applying Sentence-T5: Scalable sentence encoders from pre-trained text-to-text models 

- This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model works well for sentence similarity tasks.
- Outperforms Sentence-BERT.

In [4]:
!pip install -q sentence-transformers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.3.0 requires transformers<4.6,>=4.1, but you have transformers 4.25.1 which is incompatible.[0m


In [5]:
from sentence_transformers import SentenceTransformer
sbert = SentenceTransformer('sentence-transformers/sentence-t5-base')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/198 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

In [6]:
encoded_phrases = sbert.encode(train['Phrase'])
print(encoded_phrases.shape)

Batches:   0%|          | 0/4877 [00:00<?, ?it/s]

(156060, 768)


### Splitting the train data into train and test

In [7]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(encoded_phrases, train['Sentiment'], test_size=0.2, random_state=42)

print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

(124848, 768) (124848,)
(31212, 768) (31212,)


### Defining evaluation metrics

In [8]:
def evaluation_metrics(y_test, y_pred):
    result = {}
    result['Accuracy'] = accuracy_score(y_test, y_pred)
    result['Precision_weighted'] = precision_score(y_test, y_pred, average='weighted')
    result['Recall_weighted'] = recall_score(y_test, y_pred, average='weighted')
    result['F1 Score_weighted'] = f1_score(y_test, y_pred, average='weighted')
    result['Precision_micro'] = precision_score(y_test, y_pred, average='micro')
    result['Recall_micro'] = recall_score(y_test, y_pred, average='micro')
    result['F1 Score_micro'] = f1_score(y_test, y_pred, average='micro')
    return result

### 1. Sequential NN

In [10]:
from keras.models import Sequential
from keras.layers import *

nn_model = Sequential([
    Dense(512, input_shape=(768,), activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(5, activation='softmax'),
])

nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='acc')

nn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               393728    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                1

In [11]:
history = nn_model.fit(train_x, train_y, epochs=10, batch_size=64, validation_data=(test_x, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
nn_pred = nn_model.predict(test_x)
nn_pred = pd.DataFrame(nn_pred)
nn_pred['label'] = nn_pred.idxmax(axis=1)
res = evaluation_metrics(test_y,nn_pred['label'])
res

{'Accuracy': 0.6861143150070486,
 'Precision_weighted': 0.6794052926723396,
 'Recall_weighted': 0.6861143150070486,
 'F1 Score_weighted': 0.6754230510617323,
 'Precision_micro': 0.6861143150070486,
 'Recall_micro': 0.6861143150070486,
 'F1 Score_micro': 0.6861143150070486}

In [13]:
report = classification_report(test_y,nn_pred['label'])
print(report)

              precision    recall  f1-score   support

           0       0.65      0.26      0.37      1416
           1       0.58      0.60      0.59      5527
           2       0.74      0.85      0.79     15639
           3       0.64      0.54      0.59      6707
           4       0.63      0.45      0.52      1923

    accuracy                           0.69     31212
   macro avg       0.65      0.54      0.57     31212
weighted avg       0.68      0.69      0.68     31212



In [14]:
history = nn_model.fit(train_x, train_y, epochs=10, batch_size=32, validation_data=(test_x, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
nn_pred = nn_model.predict(test_x)
nn_pred = pd.DataFrame(nn_pred)
nn_pred['label'] = nn_pred.idxmax(axis=1)
res = evaluation_metrics(test_y,nn_pred['label'])
res

{'Accuracy': 0.6804754581571191,
 'Precision_weighted': 0.6854719515846176,
 'Recall_weighted': 0.6804754581571191,
 'F1 Score_weighted': 0.6757590383417545,
 'Precision_micro': 0.6804754581571191,
 'Recall_micro': 0.6804754581571191,
 'F1 Score_micro': 0.6804754581571191}

In [16]:
report = classification_report(test_y,nn_pred['label'])
print(report)

              precision    recall  f1-score   support

           0       0.66      0.23      0.35      1416
           1       0.54      0.71      0.61      5527
           2       0.78      0.78      0.78     15639
           3       0.61      0.59      0.60      6707
           4       0.60      0.46      0.52      1923

    accuracy                           0.68     31212
   macro avg       0.64      0.55      0.57     31212
weighted avg       0.69      0.68      0.68     31212



### 2. LGBMClassifier

In [17]:
from lightgbm import LGBMClassifier, Booster
lgbm_model = LGBMClassifier(objective='multiclass')
lgbm_model.fit(train_x, train_y)
y_preds = lgbm_model.predict_proba(test_x)

y_preds = pd.DataFrame(y_preds)
y_preds['label'] = y_preds.idxmax(axis=1)
evaluation_metrics(test_y,y_preds['label'])

{'Accuracy': 0.6670831731385365,
 'Precision_weighted': 0.6566924084618412,
 'Recall_weighted': 0.6670831731385365,
 'F1 Score_weighted': 0.6558949105218476,
 'Precision_micro': 0.6670831731385365,
 'Recall_micro': 0.6670831731385365,
 'F1 Score_micro': 0.6670831731385365}

In [18]:
report = classification_report(test_y,y_preds['label'])
print(report)

              precision    recall  f1-score   support

           0       0.58      0.31      0.41      1416
           1       0.57      0.53      0.55      5527
           2       0.73      0.84      0.78     15639
           3       0.60      0.54      0.57      6707
           4       0.61      0.38      0.46      1923

    accuracy                           0.67     31212
   macro avg       0.62      0.52      0.55     31212
weighted avg       0.66      0.67      0.66     31212



### 3. RandomForest Classifer

In [19]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(train_x, train_y)
y_pred = model.predict(test_x)
res = evaluation_metrics(test_y, y_pred)
res

{'Accuracy': 0.6517044726387287,
 'Precision_weighted': 0.6380418277134692,
 'Recall_weighted': 0.6517044726387287,
 'F1 Score_weighted': 0.6275035680890004,
 'Precision_micro': 0.6517044726387287,
 'Recall_micro': 0.6517044726387287,
 'F1 Score_micro': 0.6517044726387287}

In [21]:
report = classification_report(test_y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.56      0.25      0.34      1416
           1       0.58      0.45      0.51      5527
           2       0.68      0.89      0.77     15639
           3       0.61      0.45      0.52      6707
           4       0.60      0.28      0.38      1923

    accuracy                           0.65     31212
   macro avg       0.61      0.46      0.50     31212
weighted avg       0.64      0.65      0.63     31212



### 4. Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0).fit(train_x, train_y)
y_pred = model.predict(test_x)
res = evaluation_metrics(test_y, y_pred)
res

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'Accuracy': 0.6653530693323081,
 'Precision_weighted': 0.6551402517100977,
 'Recall_weighted': 0.6653530693323081,
 'F1 Score_weighted': 0.6510741442045601,
 'Precision_micro': 0.6653530693323081,
 'Recall_micro': 0.6653530693323081,
 'F1 Score_micro': 0.6653530693323081}

In [24]:
report = classification_report(test_y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.60      0.24      0.34      1416
           1       0.56      0.52      0.54      5527
           2       0.72      0.85      0.78     15639
           3       0.60      0.54      0.57      6707
           4       0.64      0.36      0.46      1923

    accuracy                           0.67     31212
   macro avg       0.62      0.50      0.54     31212
weighted avg       0.66      0.67      0.65     31212



### 5. GaussianNB

In [25]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(train_x, train_y)
y_pred = clf.predict(test_x)

res = evaluation_metrics(test_y, y_pred)
res


{'Accuracy': 0.5674740484429066,
 'Precision_weighted': 0.6190060638312518,
 'Recall_weighted': 0.5674740484429066,
 'F1 Score_weighted': 0.5814310291069439,
 'Precision_micro': 0.5674740484429066,
 'Recall_micro': 0.5674740484429066,
 'F1 Score_micro': 0.5674740484429066}

In [26]:
report = classification_report(test_y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.30      0.66      0.41      1416
           1       0.45      0.46      0.46      5527
           2       0.80      0.64      0.71     15639
           3       0.48      0.43      0.45      6707
           4       0.34      0.70      0.46      1923

    accuracy                           0.57     31212
   macro avg       0.47      0.58      0.50     31212
weighted avg       0.62      0.57      0.58     31212



### 4. Bidirectional LSTM

In [22]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import Embedding
from keras.layers import Bidirectional

model = Sequential()
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
model.add(inputs)
model.add(Embedding(50000, 128))
# Add 2 bidirectional LSTMs
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
# Add a classifier
model.add(Dense(5, activation="sigmoid"))
#model = keras.Model(inputs, outputs)
model.summary()

model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(train_x, train_y, batch_size=32, epochs=5, validation_data=(test_x, test_y))



Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 128)         6400000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         98816     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 645       
Total params: 6,598,277
Trainable params: 6,598,277
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f281c1b0490>

In [24]:
pred=model.predict(test_x)
pred = pd.DataFrame(pred)
pred['label'] = pred.idxmax(axis=1)
res = evaluation_metrics(test_y,pred['label'])
res

  _warn_prf(average, modifier, msg_start, len(result))


{'Accuracy': 0.5010572856593618,
 'Precision_weighted': 0.2510584035123273,
 'Recall_weighted': 0.5010572856593618,
 'F1 Score_weighted': 0.33450875714186507,
 'Precision_micro': 0.5010572856593618,
 'Recall_micro': 0.5010572856593618,
 'F1 Score_micro': 0.5010572856593618}

In [25]:
report = classification_report(test_y,pred['label'])
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1416
           1       0.00      0.00      0.00      5527
           2       0.50      1.00      0.67     15639
           3       0.00      0.00      0.00      6707
           4       0.00      0.00      0.00      1923

    accuracy                           0.50     31212
   macro avg       0.10      0.20      0.13     31212
weighted avg       0.25      0.50      0.33     31212



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
