# Movie Review Sentiment Analysis using LSTM

## Project Steps:
> * #### Importing important dependencies
> * #### EDA
> * #### Text Preprocessing
> * #### Model Building
> * #### Evaluation
> * #### Summary
---

### Importing important dependencies

In [None]:
# Main libraries
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',400)
import matplotlib.pyplot as plt
import seaborn as sns
import datasets
%matplotlib inline
from bs4 import BeautifulSoup
import re,string,unicodedata

# ML libraries (Preprocessing, models..)
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

# NLTK 
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud

# CNN, LSTM and Embedding 
from keras import backend as K
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.preprocessing.text import Tokenizer
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification , AutoTokenizer
from keras.utils import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, ZeroPadding1D, Add, Flatten, Dot, Concatenate, Lambda
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, TimeDistributed, Attention
from keras.layers import InputSpec, Layer
from keras.models import Model, load_model
from keras.optimizers import Adam,Adagrad
from tensorflow_addons.optimizers import AdamW
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

## Text mining and EDA

In [None]:
# Load the competition splits and the sample submission from the Kaggle input directory.
# NOTE: the complete CSV files are read here; this cell does not take a subset.
train_df = pd.read_csv(
    '/kaggle/input/shai-training-2023-a-level-2/Train.csv',
    encoding="utf-8",
)
valid_df = pd.read_csv(
    '/kaggle/input/shai-training-2023-a-level-2/Valid.csv',
    encoding="utf-8",
)
test_df = pd.read_csv(
    '/kaggle/input/shai-training-2023-a-level-2/Test.csv',
    encoding="utf-8",
)
sub = pd.read_csv('/kaggle/input/shai-training-2023-a-level-2/sample_submission.csv')

# Report the number of rows in each split.
for split_prefix, split_frame in (('Train: ', train_df), ('Valid: ', valid_df), ('Test: ', test_df)):
    print(split_prefix + str(len(split_frame)))

# Store the training labels as floats.
train_df["label"] = train_df["label"].astype(float)
train_df.head(10)

In [None]:
# Bar chart of the per-label row counts (one group of bars per label value).
label_counts = train_df.groupby('label').count()
label_counts.plot(kind='bar')

This means that the number of positive reviews is equal to the number of negative reviews in the dataset. This is a good thing, since it means our dataset is balanced rather than skewed toward one class.

In [None]:
# Word count of every training review (whitespace-separated tokens).
word_counts = [len(text.split()) for text in train_df['text']]
review_len = pd.Series(word_counts)

# Box plot of the review-length distribution.
review_len.plot(kind='box')

Now, let us visualize how long the reviews in the training data are (measured in words).

In [None]:
# Seaborn theme settings applied before plotting.
theme_options = dict(
    context='notebook',
    style='darkgrid',
    palette='deep',
    font='sans-serif',
    font_scale=1,
    color_codes=True,
    rc=None,
)
sns.set_theme(**theme_options)

# Histogram of the per-review word counts.
plt.figure(figsize=(10, 12))
sns.histplot(review_len)

In [None]:
# Word count per review, stored as a new column (also used by the negative-review plot).
fig = plt.figure(figsize=(14, 7))
train_df['length'] = train_df['text'].str.split().apply(len)

# Right panel: histogram of lengths for reviews labelled 1.
positive_lengths = train_df.loc[train_df['label'] == 1, 'length']
ax1 = fig.add_subplot(122)
sns.histplot(positive_lengths, ax=ax1, color='green')
describe = positive_lengths.describe().to_frame().round(2)

# Left panel: summary statistics rendered as a table filling the whole axes.
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(
    cellText=describe.values,
    rowLabels=describe.index,
    colLabels=describe.columns,
    bbox=bbox,
)
table.set_fontsize(font_size)
fig.suptitle('Distribution of text length for positive sentiment reviews.', fontsize=16)

plt.show()

In [None]:
# Same layout as the positive-review figure, for reviews labelled 0.
# Relies on the 'length' column created in the previous cell.
fig = plt.figure(figsize=(14, 7))

# Right panel: histogram of lengths for reviews labelled 0.
negative_lengths = train_df.loc[train_df['label'] == 0, 'length']
ax1 = fig.add_subplot(122)
sns.histplot(negative_lengths, ax=ax1, color='red')
describe = negative_lengths.describe().to_frame().round(2)

# Left panel: summary statistics rendered as a table filling the whole axes.
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(
    cellText=describe.values,
    rowLabels=describe.index,
    colLabels=describe.columns,
    bbox=bbox,
)
table.set_fontsize(font_size)
fig.suptitle('Distribution of text length for Negative sentiment reviews.', fontsize=16)

plt.show()

### WORDCLOUD FOR NEGATIVE TEXT (LABEL - 0)

In [None]:
# Word cloud built from all reviews labelled 0, joined into one string.
plt.figure(figsize=(20, 20))
negative_corpus = " ".join(train_df[train_df.label == 0].text)
wc = WordCloud(max_words=2000, width=1600, height=800).generate(negative_corpus)
plt.imshow(wc, interpolation='bilinear')

### WORDCLOUD FOR POSITIVE TEXT (LABEL - 1)


In [None]:
# Word cloud built from all reviews labelled 1, joined into one string.
plt.figure(figsize=(20, 20))
positive_corpus = " ".join(train_df[train_df.label == 1].text)
wc = WordCloud(max_words=2000, width=1600, height=800).generate(positive_corpus)
plt.imshow(wc, interpolation='bilinear')

---
## Text Preprocessing

Text preprocessing is an essential step in natural language processing (NLP) and machine learning projects, including sentiment analysis. Here are some of the typical text preprocessing steps for a sentiment analysis NLP project:

* `Text Cleaning`: Remove unnecessary characters, such as punctuation, special characters, numbers, and stop words (common words that don't carry much meaning, such as "the," "a," "and," etc.)

* `Lowercasing`: Convert all the words to lowercase to treat different cases of the same word as the same (e.g., "good" and "Good").

* `Tokenization`: Split the text into individual words or phrases (tokens).

* `Stemming/Lemmatization`: Reduce words to their base or root form to normalize the text. Stemming removes the suffixes from words, while lemmatization reduces words to their base form based on their part of speech.

* `Parts of Speech Tagging`: Identify the part of speech of each word (noun, verb, adjective, etc.) to help determine the meaning and context of the sentence.

* `Sentiment lexicon-based feature extraction`: Assign a sentiment score to each token based on a sentiment lexicon (a dictionary of words and their sentiment polarity).

* `Feature Encoding`: Convert text data into numerical representations that machine learning algorithms can understand, such as one-hot encoding, TF-IDF, or word embedding.

* `Feature Selection`: Select the most relevant features to reduce the dimensionality of the data and improve the model's performance.

In [None]:
# Lowercase the review text of every split in place.
for frame in (train_df, valid_df, test_df):
    frame['text'] = frame['text'].str.lower()
train_df.head()

In [None]:
# Removing punctuation
exclude = set(string.punctuation)

# Translation table that deletes every `string.punctuation` character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(x):
    """Return ``x`` with every ``string.punctuation`` character removed.

    Strings are filtered with ``str.translate``. Any other iterable of strings
    is joined after dropping the punctuation items. A value that cannot be
    processed that way (for example a float such as ``NaN``, or ``None``) is
    returned unchanged, so the function stays safe to use with
    ``Series.apply`` on columns containing missing values.
    """
    if isinstance(x, str):
        return x.translate(_PUNCTUATION_TABLE)
    try:
        return ''.join(ch for ch in x if ch not in exclude)
    except TypeError:
        # Not iterable, or not an iterable of strings: leave the value as-is
        # (only this expected failure is tolerated, not every exception).
        return x

# Strip punctuation from the review text of every split.
# NOTE(review): this runs before the HTML-stripping cell further down, so characters
# such as '<', '>' and '/' are already removed by the time the HTML parser sees the
# text — confirm that this ordering is intended.
for frame in (train_df, valid_df, test_df):
    frame['text'] = frame['text'].apply(remove_punctuation)
train_df.head()

In [None]:
# Removing the html strips
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
    """Return ``text`` with every ``[...]`` span (brackets included) deleted.

    The pattern is a raw string so that ``\\[`` reaches the regex engine
    verbatim instead of being an invalid string escape (which newer Python
    versions warn about).
    """
    return re.sub(r'\[[^]]*\]', '', text)

# Removing the noisy text
def denoise_text(text):
    """Strip HTML markup from ``text`` and then drop any ``[...]`` spans."""
    return remove_between_square_brackets(strip_html(text))
# Run the HTML / square-bracket clean-up over the review text of every split.
for frame in (train_df, valid_df, test_df):
    frame['text'] = frame['text'].apply(denoise_text)
train_df.head()

In [None]:
# set stopwords to english
stop = set(stopwords.words('english'))
stopword_list = nltk.corpus.stopwords.words('english')
# Hashed copy of the stop words: membership is tested once per token, so a
# frozenset lookup replaces a linear scan of `stopword_list`.
_stopword_set = frozenset(stopword_list)
print(stop)

# Create an instance of the TweetTokenizer class
# Tokenization of text
tokenizer = TweetTokenizer()


# removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with English stop words removed.

    The text is split with the module-level ``TweetTokenizer``; surviving
    tokens are re-joined with single spaces.

    Parameters:
        text: the string to filter.
        is_lower_case: when True, tokens are compared to the stop-word set
            as-is; when False (default), each token is lowercased for the
            comparison only (the kept token retains its original casing).
    """
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in _stopword_set]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in _stopword_set]
    return ' '.join(kept_tokens)


# Apply function on review column
stop = stopwords.words('english')

for frame in (train_df, valid_df, test_df):
    frame['text'] = frame['text'].apply(remove_stopwords)
train_df.head()

### Classical Models with TF-IDF, SVM, OneVsRest Classifier

In [None]:
# Create an instance of the TfidfVectorizer class with n-grams of size 1 and 2,
# and use the TweetTokenizer to tokenize the text.
# min_df / max_df: an *integer* is an absolute document count, a *float* is a
# proportion of documents. The previous `max_df=1` (int) therefore discarded every
# term occurring in more than one document, and `min_df=0` is rejected by recent
# scikit-learn releases. `min_df=1, max_df=1.0` keeps the whole vocabulary.
vectorizer = TfidfVectorizer(
    min_df=1,
    max_df=1.0,
    use_idf=True,
    ngram_range=(1, 2),
    tokenizer=tokenizer.tokenize,
)
# Combine the text from the train and test dataframes into a list
# (this list is reused later to fit the Keras tokenizer).
full_text = list(train_df['text'].values) + list(test_df['text'].values)
# Fit the vectorizer on the combined text
# NOTE(review): fitting on train + test text lets test-set vocabulary/IDF statistics
# influence the features — confirm this is acceptable for the evaluation protocol.
vectorizer.fit(full_text)
# Use the fitted vectorizer to transform the text in the train, test and validation dataframe into a sparse matrix of TF-IDF values
train_vectorized = vectorizer.transform(train_df['text'])
test_vectorized = vectorizer.transform(test_df['text'])
valid_vectors = vectorizer.transform(valid_df['text'])

In [None]:
# Training target: the 'label' column of the training frame.
y = train_df['label']

### Using OneVsRestClassifier and Logistic Regression

In [None]:
# L2-regularised logistic regression wrapped in a one-vs-rest meta-classifier.
logreg = LogisticRegression(
    penalty='l2',
    C=1,
    max_iter=500,
    random_state=42,
)
ovr = OneVsRestClassifier(logreg)

In [None]:
%%time
# Fit the one-vs-rest logistic regression on the TF-IDF training matrix
# (the cell magic above must stay on the first line of the cell).
ovr.fit(train_vectorized, y)

In [None]:
# 3-fold cross-validated accuracy of the one-vs-rest model on the training features.
scores = cross_val_score(
    ovr,
    train_vectorized,
    y,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
mean_pct = np.mean(scores) * 100
std_pct = np.std(scores) * 100
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(mean_pct, std_pct))

### Using SVM Model

In [None]:
%%time
# 3-fold cross-validated accuracy of a linear SVM (primal formulation, dual=False)
# on the TF-IDF training features; prints mean and standard deviation in percent.
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

In [None]:
# Fit both classical models on the full training matrix. cross_val_score works on
# clones, so `svc` itself is still unfitted at this point; `ovr` was already fitted
# in an earlier cell and is simply refitted here.
# The trailing semicolons suppress the notebook's echo of the fitted estimators.
ovr.fit(train_vectorized, y);
svc.fit(train_vectorized, y);

### Replacing the classical classifier with an LSTM and embedding model

In [None]:
# Create an instance of the Tokenizer class with options to lowercase the text and remove all filters.
# `num_words=30000` caps the indices emitted by `texts_to_sequences` at 29,999, which
# keeps them inside the 30,001-row embedding table used by the models below (and
# matches the `max_features = 30000` cut-off applied when the embedding matrix is
# filled). Without the cap, rarer words receive indices beyond the embedding table.
tk = Tokenizer(num_words=30000, lower=True, filters='')
tk.fit_on_texts(full_text)

In [None]:
# Convert every review into a list of integer word indexes using the fitted tokenizer:
# first the training reviews, then the test reviews.
train_texts = train_df['text']
test_texts = test_df['text']
train_tokenized = tk.texts_to_sequences(train_texts)
test_tokenized = tk.texts_to_sequences(test_texts)

In [None]:
# Fixed sequence length fed to the networks; shorter sequences are padded and
# longer ones truncated by `pad_sequences` (library defaults).
max_len = 100
X_train = pad_sequences(train_tokenized, maxlen=max_len)
X_test = pad_sequences(test_tokenized, maxlen=max_len)

We have used the `texts_to_sequences` method of the fitted tokenizer to convert the text in the `train_df['text']` and `test_df['text']` columns into sequences of `integer indexes`. Each unique word in the text is assigned a unique integer index based on its frequency in the text. The resulting sequences are then stored in the `train_tokenized` and `test_tokenized` variables, respectively. These integer sequences can be used as input to machine learning algorithms that require numerical input.

In [None]:
# Define the path to the pre-trained word embedding file
# (a text file with one token followed by its vector components per line).
embedding_path = "/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec"
# Set the size of the word embeddings to 300
embed_size = 300
# Set the maximum number of features to 30,000
# (word indexes >= this value are skipped when the embedding matrix is filled).
max_features = 30000

We have set up the parameters for the pre-trained word embeddings that will be used to initialize the embedding layer in the neural network. The `embedding_path` variable specifies the path to the file containing the pre-trained word embeddings. The `embed_size` variable specifies the size of the word embeddings, which is set to 300. The `max_features` variable specifies the maximum number of features (i.e. words) that will be included in the vocabulary, which is set to `30,000`. These parameters will be used later when defining the embedding layer in the neural network.

In [None]:
# Split one parsed embedding line into (word, vector).
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` as a float32 numpy array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Load the pre-trained embeddings file and create a dictionary of word vectors.
# The file is opened in a `with` block (so the handle is closed) and decoded
# explicitly as UTF-8 instead of relying on the platform default encoding.
# NOTE(review): if the file starts with a "<count> <dim>" header line, that line
# becomes one extra, unused dictionary entry — presumably harmless; confirm.
with open(embedding_path, encoding='utf-8') as embedding_file:
    embedding_index = dict(
        get_coefs(*line.strip().split(" ")) for line in embedding_file
    )
# Get the index of each word in the tokenizer
word_index = tk.word_index
# Set the number of words to be used as the minimum between the maximum features allowed and the number of words in the tokenizer
nb_words = min(max_features, len(word_index))
# Initialize an embedding matrix of zeros with shape (nb_words + 1, embed_size);
# rows for words without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((nb_words + 1, embed_size))
# Iterate over each word in the tokenizer and its index
for word, i in word_index.items():
    # If the index of the word is greater than or equal to the maximum features allowed, skip it
    if i >= max_features:
        continue
    # Get the embedding vector of the word from the pre-trained embeddings dictionary
    embedding_vector = embedding_index.get(word)
    # If the embedding vector is not None, add it to the embedding matrix at the index of the word in the tokenizer
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Using OneHotEncoder

In [None]:
# One-hot encode the labels into a dense (n_samples, n_classes) array.
# Newer scikit-learn releases name the dense-output switch `sparse_output`; older
# ones only know `sparse`. Try the current keyword first and fall back.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(y.values.reshape(-1, 1))

### Model 1: GRU + CONV + LSTM

In [None]:
# Defining the model
def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    """Build, train and return the GRU -> Conv1D -> LSTM -> Conv1D classifier.

    The network embeds the padded sequences with the frozen pre-trained matrix,
    runs a bidirectional GRU, convolves its output with two kernel sizes, feeds
    the first convolution into a bidirectional LSTM, convolves that again, and
    concatenates the global average/max pools of all four convolutions before
    two dense layers and a 2-unit sigmoid output.

    Training uses the module-level ``X_train`` / ``y_ohe`` with a 10%
    validation split, checkpoints the lowest ``val_loss`` weights to
    ``best_model_1.h5`` and stops after 3 epochs without improvement.

    Parameters:
        lr: learning rate for AdamW.
        lr_d: weight decay for AdamW.
        units: units per direction in the recurrent layers.
        spatial_dr: SpatialDropout1D rate applied to the embeddings.
        kernel_size1, kernel_size2: the two Conv1D kernel sizes.
        dense_units: width of the first dense layer (the second uses half).
        dr: dropout rate after each dense layer.
        conv_size: number of Conv1D filters.

    Returns:
        The best checkpointed model, re-loaded from ``best_model_1.h5``.
    """
    file_path = "best_model_1.h5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

    def conv_pools(tensor, kernel_size):
        """Apply a 'valid' Conv1D; return (conv output, global-avg pool, global-max pool)."""
        conv = Conv1D(conv_size, kernel_size=kernel_size, padding='valid',
                      kernel_initializer='he_uniform')(tensor)
        return conv, GlobalAveragePooling1D()(conv), GlobalMaxPooling1D()(conv)

    inp = Input(shape=(max_len,))
    # Take the vocabulary size from the matrix itself so the layer always matches
    # the provided weights (previously hard-coded to 30001).
    x = Embedding(embedding_matrix.shape[0], embed_size,
                  weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(CuDNNGRU(units, return_sequences=True))(x)
    gru_conv1, avg_pool1_gru, max_pool1_gru = conv_pools(x_gru, kernel_size1)
    _, avg_pool3_gru, max_pool3_gru = conv_pools(x_gru, kernel_size2)

    # The LSTM is stacked on the first GRU convolution (not on the embeddings).
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences=True))(gru_conv1)
    _, avg_pool1_lstm, max_pool1_lstm = conv_pools(x_lstm, kernel_size1)
    _, avg_pool3_lstm, max_pool3_lstm = conv_pools(x_lstm, kernel_size2)

    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                     avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])

    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu')(x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu')(x))
    # Two independent sigmoid outputs trained against the one-hot labels.
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="binary_crossentropy",
                  optimizer=AdamW(learning_rate=lr, weight_decay=lr_d),
                  metrics=["accuracy"])
    model.fit(X_train, y_ohe, batch_size=128, epochs=30, validation_split=0.1,
              verbose=1, callbacks=[check_point, early_stop])
    # Return the best checkpoint rather than the weights of the last epoch.
    model = load_model(file_path)
    return model

In [None]:
# First build_model1 configuration: 64 recurrent units, lighter dropout.
model1 = build_model1(
    lr=1e-3, lr_d=1e-10,
    units=64, conv_size=64, dense_units=32,
    kernel_size1=3, kernel_size2=2,
    spatial_dr=0.3, dr=0.2,
)

In [None]:
# Second build_model1 configuration: 128 recurrent units, heavier dropout.
model2 = build_model1(
    lr=1e-3, lr_d=1e-10,
    units=128, conv_size=64, dense_units=64,
    kernel_size1=3, kernel_size2=2,
    spatial_dr=0.5, dr=0.3,
)

### Model 2: GRU + CONV + LSTM + ATTENTION

In [None]:
def build_model2(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    """Build, train and return the parallel GRU / LSTM + Conv1D + attention classifier.

    A bidirectional GRU and a bidirectional LSTM both read the (frozen,
    pre-trained) embeddings. Each recurrent output is convolved with several
    kernels and globally pooled; in addition, self-attention over each
    recurrent output is flattened. All features are concatenated and passed
    through two dense layers into a 2-unit sigmoid output.

    Training uses the module-level ``X_train`` / ``y_ohe`` with a 10%
    validation split, checkpoints the lowest ``val_loss`` weights to
    ``best_model_2.h5`` and stops after 3 epochs without improvement.

    Parameters:
        lr: learning rate for AdamW.
        lr_d: weight decay for AdamW.
        units: units per direction in the recurrent layers.
        spatial_dr: SpatialDropout1D rate applied to the embeddings.
        kernel_size1, kernel_size2: the Conv1D kernel sizes.
        dense_units: width of the first dense layer (the second uses half).
        dr: dropout rate after each dense layer.
        conv_size: number of Conv1D filters.

    Returns:
        The best checkpointed model, re-loaded from ``best_model_2.h5``.
    """
    file_path = "best_model_2.h5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

    def conv_pools(tensor, kernel_size):
        """Apply a 'valid' Conv1D; return its (global-avg pool, global-max pool)."""
        conv = Conv1D(conv_size, kernel_size=kernel_size, padding='valid',
                      kernel_initializer='he_uniform')(tensor)
        return GlobalAveragePooling1D()(conv), GlobalMaxPooling1D()(conv)

    inp = Input(shape=(max_len,))
    # Take the vocabulary size from the matrix itself so the layer always matches
    # the provided weights (previously hard-coded to 30001).
    x = Embedding(embedding_matrix.shape[0], embed_size,
                  weights=[embedding_matrix], trainable=False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(CuDNNGRU(units, return_sequences=True))(x1)
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences=True))(x1)

    avg_pool1_gru, max_pool1_gru = conv_pools(x_gru, kernel_size1)
    avg_pool2_gru, max_pool2_gru = conv_pools(x_gru, kernel_size2)

    avg_pool1_lstm, max_pool1_lstm = conv_pools(x_lstm, kernel_size1)
    avg_pool2_lstm, max_pool2_lstm = conv_pools(x_lstm, kernel_size2)
    # A second, separately-weighted kernel_size2 convolution over the LSTM output.
    avg_pool3_lstm, max_pool3_lstm = conv_pools(x_lstm, kernel_size2)

    # Attention Mechanism: self-attention (query == value) over each recurrent output.
    # The first positional argument of the Keras Attention layer is `use_scale`; the
    # old call passed `max_len` there, which merely acted as a truthy flag. The flag
    # is now spelled out explicitly with the same effect.
    attention_gru = Attention(use_scale=True)([x_gru, x_gru])
    attention_lstm = Attention(use_scale=True)([x_lstm, x_lstm])

    # Flatten layers for attention layers
    attention_gru = Flatten()(attention_gru)
    attention_lstm = Flatten()(attention_lstm)

    # `avg_pool3_lstm` is now included; previously `avg_pool2_lstm` was listed
    # twice and the third average pool was never used.
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool2_gru, max_pool2_gru,
                     avg_pool1_lstm, max_pool1_lstm, avg_pool2_lstm, max_pool2_lstm,
                     avg_pool3_lstm, max_pool3_lstm, attention_gru, attention_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu')(x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu')(x))
    # Two independent sigmoid outputs trained against the one-hot labels.
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="binary_crossentropy",
                  optimizer=AdamW(learning_rate=lr, weight_decay=lr_d),
                  metrics=["accuracy"])
    model.fit(X_train, y_ohe, batch_size=128, epochs=20, validation_split=0.1,
              verbose=1, callbacks=[check_point, early_stop])
    # Return the best checkpoint rather than the weights of the last epoch.
    model = load_model(file_path)
    return model

In [None]:
# build_model2 configuration A: low learning rate, kernel sizes 4 and 3.
model3 = build_model2(
    lr=1e-4, lr_d=1e-7,
    units=64, conv_size=64, dense_units=32,
    kernel_size1=4, kernel_size2=3,
    spatial_dr=0.5, dr=0.2,
)

In [None]:
# build_model2 configuration B: higher learning rate and weight decay, kernel size 3 twice.
model4 = build_model2(
    lr=1e-3, lr_d=1e-5,
    units=64, conv_size=64, dense_units=64,
    kernel_size1=3, kernel_size2=3,
    spatial_dr=0.5, dr=0.3,
)

In [None]:
# build_model2 configuration C: lighter spatial dropout, heavier dense dropout.
model5 = build_model2(
    lr=1e-3, lr_d=1e-7,
    units=64, conv_size=64, dense_units=64,
    kernel_size1=3, kernel_size2=3,
    spatial_dr=0.3, dr=0.4,
)

### Model 3: GRU + LSTM + ATTENTION

In [None]:
def build_model3(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    """Build, train and return the GRU + LSTM + additive-attention classifier.

    A bidirectional GRU and a bidirectional LSTM read the (frozen, pre-trained)
    embeddings. A per-timestep score is computed from both outputs, normalised
    over the time axis, and used to form a context vector from the GRU output.
    The context vector is prepended to the LSTM sequence, which is then
    convolved with two kernel sizes, globally pooled and classified by two
    dense layers and a 2-unit sigmoid output.

    Training uses the module-level ``X_train`` / ``y_ohe`` with a 10%
    validation split, checkpoints the lowest ``val_loss`` weights to
    ``best_model_3.h5`` and stops after 3 epochs without improvement.

    Parameters:
        lr: learning rate for AdamW.
        lr_d: weight decay for AdamW.
        units: units per direction in the recurrent layers.
        spatial_dr: SpatialDropout1D rate applied to the embeddings.
        kernel_size1, kernel_size2: the two Conv1D kernel sizes.
        dense_units: width of the first dense layer (the second uses half).
        dr: dropout rate after each dense layer.
        conv_size: number of Conv1D filters.

    Returns:
        The best checkpointed model, re-loaded from ``best_model_3.h5``.
    """
    file_path = "best_model_3.h5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

    def conv_pools(tensor, kernel_size):
        """Apply a 'valid' Conv1D; return its (global-avg pool, global-max pool)."""
        conv = Conv1D(conv_size, kernel_size=kernel_size, padding='valid',
                      kernel_initializer='he_uniform')(tensor)
        return GlobalAveragePooling1D()(conv), GlobalMaxPooling1D()(conv)

    inp = Input(shape=(max_len,))
    # Take the vocabulary size from the matrix itself so the layer always matches
    # the provided weights (previously hard-coded to 30001).
    x = Embedding(embedding_matrix.shape[0], embed_size,
                  weights=[embedding_matrix], trainable=False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(CuDNNGRU(units, return_sequences=True))(x1)
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences=True))(x1)

    # Zero-pad both sequences by two timesteps: the GRU output at the front, the
    # LSTM output at the back, so both have length max_len + 2.
    # NOTE(review): this shifts the two sequences by two steps relative to each
    # other before they are combined — confirm that the offset is intended.
    x_gru = ZeroPadding1D(padding=(2, 0))(x_gru)
    x_lstm = ZeroPadding1D(padding=(0, 2))(x_lstm)

    e1 = TimeDistributed(Dense(units * 2, activation='tanh'))(x_gru)
    e2 = TimeDistributed(Dense(units * 2, activation='tanh'))(x_lstm)
    e = Concatenate()([e1, e2])
    score = Dense(1)(e)
    # Normalise the scores across timesteps (axis=1). A softmax over the default
    # last axis would act on a single element per step and yield a constant 1.0
    # weight everywhere, i.e. no attention at all.
    attention_weights = layers.Softmax(axis=1)(score)
    # Weighted sum of the GRU states over time -> one context step per sample.
    context_vector = Dot(axes=1)([attention_weights, x_gru])
    # Prepend the context step to the LSTM sequence along the time axis.
    x = concatenate([context_vector, x_lstm], axis=1)

    avg_pool1, max_pool1 = conv_pools(x, kernel_size1)
    avg_pool3, max_pool3 = conv_pools(x, kernel_size2)

    x = concatenate([avg_pool1, max_pool1, avg_pool3, max_pool3])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu')(x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu')(x))
    # Two independent sigmoid outputs trained against the one-hot labels.
    x = Dense(2, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="binary_crossentropy",
                  optimizer=AdamW(learning_rate=lr, weight_decay=lr_d),
                  metrics=["accuracy"])
    model.fit(X_train, y_ohe, batch_size=128, epochs=20, validation_split=0.1,
              verbose=1, callbacks=[check_point, early_stop])
    # Return the best checkpoint rather than the weights of the last epoch.
    model = load_model(file_path)
    return model

In [None]:
# build_model3 configuration A: low learning rate, kernel sizes 4 and 3.
model6 = build_model3(
    lr=1e-4, lr_d=1e-7,
    units=128, conv_size=64, dense_units=32,
    kernel_size1=4, kernel_size2=3,
    spatial_dr=0.5, dr=0.2,
)

In [None]:
# build_model3 configuration B: higher learning rate, kernel sizes 3 and 2.
model7 = build_model3(
    lr=1e-3, lr_d=1e-10,
    units=128, conv_size=64, dense_units=64,
    kernel_size1=3, kernel_size2=2,
    spatial_dr=0.5, dr=0.3,
)

In [None]:
# Predict the test set with every trained network and sum the outputs.
# The running total lives in its own array: the earlier `pred = pred1` followed by
# in-place `+=` overwrote `pred1` with the ensemble sum.
ensemble_models = (model1, model2, model3, model4, model5, model6, model7)
member_preds = [
    member.predict(X_test, batch_size=1024, verbose=1)
    for member in ensemble_models
]
# Keep the per-model outputs available under their original names.
pred1, pred2, pred3, pred4, pred5, pred6, pred7 = member_preds

# Element-wise sum of the per-class scores (same accumulation order as before).
pred = np.zeros_like(pred1)
for member_pred in member_preds:
    pred += member_pred

In [None]:
# The winning column index of the summed scores is written as the predicted label.
best_column = np.argmax(pred, axis=1)
predictions = best_column.astype(int)
sub['label'] = predictions
sub.to_csv("Submission_test.csv", index=False)

### Using a pre-trained transformer model (DeBERTa-v3 small)

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
model_nm = "microsoft/deberta-v3-small"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint (fast implementation).
# NOTE(review): the name `Tokenizer` is also imported from keras.preprocessing.text
# at the top of the notebook; after this cell runs it refers to the Hugging Face
# tokenizer instead, so re-running the earlier Keras tokenizer cell would break.
from transformers import AutoModelForSequenceClassification , AutoTokenizer
Tokenizer = AutoTokenizer.from_pretrained(model_nm, use_fast=True)

In [None]:
from datasets import Dataset, DatasetDict


def tok_func(x, text_col="text"):
    """Tokenize the ``text_col`` field of ``x`` (truncated to 512 tokens).

    Parameters:
        x: a mapping (single example or batch) holding the raw text.
        text_col: key under which the text is stored; defaults to "text".
    """
    return Tokenizer(x[text_col], truncation=True, max_length=512)


def create_dataset(df):
    """Convert the dataframe ``df`` into a tokenized ``datasets.Dataset``.

    The dataset is built from ``df`` itself (it was previously always built
    from ``train_df``, whatever frame was passed in). The text is read from
    the "text" column when present, otherwise from "input", so frames whose
    text column has been renamed are handled too. A "label" column, if any,
    is renamed to "labels".
    """
    text_col = "text" if "text" in df.columns else "input"
    ds = Dataset.from_pandas(df)
    if 'label' in df.columns:
        ds = ds.rename_columns({'label': 'labels'})
    # fn_kwargs forwards the column name to tok_func for every batch.
    ds = ds.map(tok_func, batched=True, fn_kwargs={"text_col": text_col})
    return ds


Tokenizer_ds = create_dataset(train_df)

In [None]:
# Rename the test frame's "text" column to "input" and display the frame.
# NOTE(review): downstream tokenization must read the "input" column for this
# frame — confirm.
test_df = test_df.rename(columns={"text":"input"})
test_df

In [None]:
# Rename the validation frame's "text" column to "input", store its labels as
# floats (as done for the training labels), and display the frame.
# NOTE(review): downstream tokenization must read the "input" column for this
# frame — confirm.
valid_df = valid_df.rename(columns={"text":"input"})
valid_df["label"] = valid_df.label.astype(float)
valid_df

In [None]:
# Build datasets for the validation and test frames via create_dataset.
valid_ds = create_dataset(valid_df)
test_ds = create_dataset(test_df)

In [None]:
# Fine-tuning hyperparameters consumed by TrainingArguments below:
# per-device training batch size (evaluation uses twice this value),
bs = 32
# number of training epochs,
epochs = 5
# and the peak learning rate.
lr = 8e-5 

def acc(preds):
    """Return ``{'accuracy': ...}`` for a ``(predictions, labels)`` pair.

    Raw predictions are turned into 0/1 decisions with a 0.5 threshold before
    being compared with the labels.
    """
    raw_scores, labels = preds
    hard_decisions = (raw_scores > 0.5).astype(int)
    return {'accuracy': accuracy_score(labels, hard_decisions)}

# Training configuration: cosine schedule with 10% warm-up, mixed precision,
# evaluation and checkpointing once per epoch, best checkpoint restored at the end.
training_options = dict(
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to="none",
    save_strategy='epoch',
    load_best_model_at_end=True,
)
args = TrainingArguments("outputs", **training_options)

# Sequence-classification head with a single output, compared against the float labels.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(
    model,
    args,
    train_dataset=Tokenizer_ds,
    eval_dataset=valid_ds,
    tokenizer=Tokenizer,
    compute_metrics=acc,
)

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer to the same directory.
# The tokenizer object in this notebook is named `Tokenizer`; the previous
# reference to `tokz` was undefined and raised a NameError.
trainer.save_model("/kaggle/working/Pretrained_model")
Tokenizer.save_pretrained("/kaggle/working/Pretrained_model")

In [None]:
# Evaluate on the dataset passed to the Trainer as `eval_dataset`.
trainer.evaluate()

In [None]:
# Raw model outputs for the test dataset (displayed below for inspection).
preds = trainer.predict(test_ds).predictions
preds

In [None]:
# Threshold the raw outputs at 0.5, drop size-1 axes and cast to integer labels.
preds = (preds > 0.5).squeeze().astype(int)

# Pair each prediction with the test frame's index and write the CSV.
submission_columns = {
    'id': test_df.index,
    'label': preds,
}
submission = datasets.Dataset.from_dict(submission_columns)
submission.to_csv('submission.csv', index=False)