In [None]:
# Install sentence-transformers, which provides the SentenceTransformer encoder imported further down.
!pip install sentence_transformers

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import os
import torch
import tensorflow as tf 

SEED = 99


def random_seed(SEED):
    """Seed every random number generator this notebook touches.

    Covers Python's `random`, the PYTHONHASHSEED environment variable, NumPy,
    PyTorch (CPU and all CUDA devices) and TensorFlow, and asks cuDNN to use
    deterministic kernels.

    Parameters
    ----------
    SEED : int
        Value handed to each library's seeding function.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)
    random.seed(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    # Same three torch seeding calls as before, in the same relative order.
    for seed_torch in (torch.manual_seed,
                       torch.cuda.manual_seed,
                       torch.cuda.manual_seed_all):
        seed_torch(SEED)
    torch.backends.cudnn.deterministic = True


random_seed(SEED)

In [2]:
# Read the labelled training set and the unlabelled test set.
Train, Test = map(
    pd.read_csv,
    ('../input/sentiment-analysis/train.csv', '../input/sentiment-analysis/test.csv'),
)

In [3]:
# Remove the row identifier first so that rows differing only by ID count as
# duplicates, then de-duplicate the training set.
# errors='ignore' keeps the cell re-runnable: the original pop('ID') raised
# KeyError the second time the cell was executed on the same kernel.
Train = Train.drop(columns=['ID'], errors='ignore').drop_duplicates()
Test = Test.drop(columns=['ID'], errors='ignore')

In [4]:
# Renumber rows 0..n-1 after de-duplication. drop=True discards the old index
# directly instead of materialising it as an 'index' column and popping it.
Train = Train.reset_index(drop=True)

In [5]:
# Missing values per column in the training set.
Train.isna().sum()

author       0
Review       0
Sentiment    0
dtype: int64

In [6]:
# Missing values per column in the test set.
Test.isna().sum()

author    0
Review    0
dtype: int64

In [12]:
# Class balance of the target column.
Train.Sentiment.value_counts()

0    19298
2    18728
1     6068
Name: Sentiment, dtype: int64

In [None]:
# Distribution of review lengths in whitespace-separated words (raw text).
# The previous str.split('') used an empty separator, which never splits on
# whitespace, so the summary did not describe word counts.
# NOTE(review): assumes a word count was the intent - confirm.
Train['Review'].str.split().str.len().describe()

In [None]:
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholder tokens.

    The text is split on single spaces and each piece is mapped as follows:
    a piece starting with '@' and longer than one character becomes '@user';
    a piece starting with 'http' becomes 'http'; a piece starting with 'www'
    becomes 'www'. Everything else is kept. Pieces are re-joined with single
    spaces, so runs of spaces in the input are preserved.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The text with mentions and links masked.
    """
    def _mask(token):
        # A bare '@' is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        if token.startswith('www'):
            return 'www'
        return token

    return " ".join(map(_mask, text.split(" ")))


# Mask mentions and links in the review text of both data sets.
for frame in (Train, Test):
    frame['Review'] = frame['Review'].apply(preprocess)

In [None]:
# Replace every non-word character with a space.
# The pattern is a raw string ('\W' is an invalid escape in a normal string
# literal) and regex=True is explicit: pandas >= 2.0 defaults to regex=False,
# which would look for the literal two characters backslash + W.
# NOTE(review): this also removes the '@' from the '@user' placeholder that
# preprocess() inserts - confirm that is intended.
Train['Review'] = Train['Review'].str.replace(r'\W', ' ', regex=True)
Test['Review']  = Test['Review'].str.replace(r'\W', ' ', regex=True)

In [None]:
# Uncomment this section to extract uncased embeddings
# for the two models: cardiffnlp/twitter-roberta-base-sentiment and paraphrase-mpnet-base-v2

#Train['Review'] = Train['Review'].str.lower()
#Test['Review']  = Test['Review'].str.lower()

In [None]:
import nltk

# Shared tokenizer / lemmatizer instances, created once and reused per review.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text`.

    The text is tokenized with the module-level WhitespaceTokenizer, every
    token is passed through the module-level WordNetLemmatizer, and the
    results are joined back with single spaces.

    NOTE(review): lemmatize() is called without a part-of-speech argument -
    confirm the library default is adequate for this data.

    Parameters
    ----------
    text : str
        Review text.

    Returns
    -------
    str
        Space-joined lemmatized tokens.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in w_tokenizer.tokenize(text))
    return ' '.join(lemmas)

# Lemmatize the review text of both data sets.
Train['Review'] = Train['Review'].map(lemmatize_text)
Test['Review'] = Test['Review'].map(lemmatize_text)

In [None]:
# Distribution of review lengths in whitespace-separated words (after cleaning).
# The previous str.split('') used an empty separator, which never splits on
# whitespace, so the summary did not describe word counts.
# NOTE(review): assumes a word count was the intent - confirm.
Train['Review'].str.split().str.len().describe()

In [None]:
from sentence_transformers import SentenceTransformer
# Checkpoints used to extract the embeddings (one checkpoint per run):
#   cardiffnlp/twitter-roberta-base-sentiment
#   paraphrase-mpnet-base-v2

model = SentenceTransformer('cardiffnlp/twitter-roberta-base-sentiment')
model.max_seq_length = 512
print("Max Sequence Length:", model.max_seq_length)
# Hand encode() a plain list of strings. Passing the pandas Series only works
# while its index labels happen to be 0..n-1, because the sentences are then
# looked up by integer label rather than by position.
sentence_embeddings = model.encode(Train['Review'].tolist())

In [None]:
# Embed the test reviews with the same encoder. A plain list is passed so the
# lookup inside encode() does not depend on the Series index labels.
sentence_embeddings_test = model.encode(Test['Review'].tolist())

In [None]:
# Target labels for the classifier (values 0, 1, 2 in the counts shown earlier).
Y = Train.Sentiment

In [None]:
# Building a simple neural network model 

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
from keras import Model

In [None]:
def wider_model():
    """Build the (uncompiled) feed-forward classifier trained on the embeddings.

    Architecture: Dense(768, swish) -> Dropout(0.5) -> Dense(384, swish)
    -> Dropout(0.1) -> Dense(3, softmax). The input is a 768-dimensional
    vector; the output is a probability distribution over three classes.

    Returns
    -------
    keras.Sequential
        A freshly initialised model; the caller is responsible for compile/fit.
    """
    swish = tf.keras.activations.swish
    return keras.Sequential([
        layers.Dense(768, input_shape=(768,), activation=swish),
        layers.Dropout(0.5),
        layers.Dense(384, activation=swish),
        layers.Dropout(0.1),
        layers.Dense(3, activation=tf.keras.activations.softmax),
    ])

In [None]:
from sklearn.model_selection import StratifiedKFold,KFold
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from sklearn.metrics import log_loss

# 30-fold stratified cross-validation: one network is trained per fold and its
# predictions on the full training set and on the test set are collected so
# they can be averaged afterwards.
skf = StratifiedKFold(n_splits=30, shuffle=True, random_state=SEED)

Full_Train_pred = []     # one (n_train, 3) probability array per fold
Final_Subbmission = []   # one (n_test, 3) probability array per fold
val_loss_print = []      # lowest validation loss reached in each fold

for train_index, test_index in skf.split(sentence_embeddings, Y):
    X_train, X_test = sentence_embeddings[train_index], sentence_embeddings[test_index]
    y_train = tf.one_hot(Y.iloc[train_index], depth=3)
    y_test = tf.one_hot(Y.iloc[test_index], depth=3)

    # Named fold_model (not `model`) so the SentenceTransformer bound to
    # `model` is not overwritten; otherwise re-running an encode cell after
    # training would call .encode on a Keras model.
    fold_model = wider_model()

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1,
                                  min_lr=0.0000001, verbose=1)
    early_stoping = EarlyStopping(monitor="val_loss", min_delta=0, patience=3,
                                  verbose=1, mode="auto", baseline=None,
                                  restore_best_weights=True)

    # metrics is given as a list: newer Keras releases reject a bare string.
    fold_model.compile(loss='categorical_crossentropy',
                       metrics=['categorical_crossentropy'],
                       optimizer='Adam')
    history = fold_model.fit(X_train, y_train,
                             validation_data=(X_test, y_test),
                             epochs=69,
                             callbacks=[reduce_lr, early_stoping],
                             verbose=1)

    best_val_loss = min(history.history['val_loss'])
    print(best_val_loss)
    val_loss_print.append(best_val_loss)

    # predict() replaces predict_proba(), which no longer exists on Keras
    # models; the softmax output layer already yields class probabilities.
    # NOTE(review): each fold model predicts on the whole training set,
    # including the rows it was fitted on, so the averaged train probabilities
    # are largely in-sample. Use out-of-fold predictions if these feed a blender.
    Full_Train_pred.append(fold_model.predict(sentence_embeddings))
    Final_Subbmission.append(fold_model.predict(sentence_embeddings_test))
    

In [None]:
# Average of the per-fold best validation losses.
np.asarray(val_loss_print).mean()

In [None]:
# Average the per-fold probability arrays over the fold axis.
Train_prob = np.asarray(Full_Train_pred).mean(axis=0)
Test_prob = np.asarray(Final_Subbmission).mean(axis=0)

In [None]:
#Four sets of CSV files (train and test probabilities) were extracted from this code.
#As explained above, they cover the cased and uncased datasets for each of the two models: cardiffnlp/twitter-roberta-base-sentiment and paraphrase-mpnet-base-v2.
#These were then blended to give the final output.

#Train_prob_Rb_cased.csv
#Train_prob_mpnet_cased.csv
#Train_prob_RB_uncased.csv
#Train_prob_MPNET_uncased.csv

#Test_prob_Rb_cased.csv
#Test_prob_mpnet_cased.csv
#Test_prob_RB_uncased.csv
#Test_prob_MPNET_uncased.csv

In [None]:
# Train_prob / Test_prob are NumPy arrays (results of np.mean), which have no
# .to_csv method, so they are wrapped in DataFrames before being written.
# Columns are written as 0, 1, 2, one per output unit of the network.
# NOTE(review): the file names say MPNET / uncased, while the encoder cell
# loads cardiffnlp/twitter-roberta-base-sentiment and the lower-casing cell is
# commented out - rename to match the configuration actually run.
pd.DataFrame(Train_prob).to_csv('Train_prob_MPNET_uncased.csv',index=False)
pd.DataFrame(Test_prob).to_csv('Test_prob_MPNET_uncased.csv',index=False)