In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Read the dataset

In [None]:
# Location of the reviews CSV. The path looks like a Colab Google Drive mount;
# change it here when running somewhere else.
DATA_PATH = '/content/drive/MyDrive/reviews_data_dump/reviews_badminton/data.csv'

data = pd.read_csv(DATA_PATH)
data.head()

## Exploratory Data Analysis

In [None]:
data.shape

### Check Duplicate values

In [None]:
# Rows that are exact copies (all columns equal) of an earlier row.
duplicate_rows = data[data.duplicated()]

# Report the result instead of silently discarding it.
print("Number of duplicate rows:", len(duplicate_rows))
duplicate_rows.head()

### Check Null values

In [None]:
data.isna().sum()

In [None]:
# Drop rows without review text. Reassigning (instead of inplace=True) keeps the cell
# idempotent, and resetting the index guarantees that the label-based lookups used
# further down (data.loc[0], data.loc[1], ...) still exist after rows were removed.
data = data.dropna(subset=['Review text']).reset_index(drop=True)

In [None]:
## Check the shape of the dataset
data.shape

### View some review text

In [None]:
data.loc[0, 'Review text']

In [None]:
data.loc[1, 'Review text']

In [None]:
data.loc[2, 'Review text']

* **Here, we can see that 'READ MORE' is present in the review text. It seems the data was not scraped properly.**

In [None]:
## Check the distribution of Ratings
ax = sns.countplot(x='Ratings', data=data)
# Label the figure so it can be understood without reading the code.
ax.set(title='Distribution of Ratings', xlabel='Rating', ylabel='Number of reviews');

In [None]:
# Distribution on target variable looks non-uniform

data['Ratings'].value_counts(normalize=True)

In [None]:
data.info()

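##### Here, we can see that the Review text column has the `object` dtype. pandas stores text in generic `object` columns, and such a column may also contain non-string values (for example, numbers). Therefore, we convert every entry to a string so that the text-processing steps below always receive `str` values.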

In [None]:
data['Review text'] = data['Review text'].astype(str)

### Visualize the text with a word cloud to see which words have a higher frequency.

In [None]:
# Visuallizing an overall Word Cloud from the given data

from wordcloud import WordCloud

# Concatenate every review into one corpus string and build the cloud from it.
all_reviews_text = ' '.join(data['Review text'])
wc = WordCloud(width=1600, height=800, background_color='black').generate(all_reviews_text)


import matplotlib.pyplot as plt

# Render the cloud as an image without axes.
plt.figure(1, figsize=(30, 20))
plt.imshow(wc)
plt.axis('off')
plt.show()

**Here, we can observe that the most frequent text values are 'READ' and 'GOOD.' This is because most of the reviews are positive. However, the reviews are not proper; many contain 'READ MORE.' Consequently, 'Read' and 'Good' are the frequent values present in the dataset.**

## Identify Input and Output

In [None]:
X = data[['Review text']] # the column text contains textual data to extract features from
y = data['Ratings'] # this is the column we are learning to predict.

print(X.shape, y.shape)

## Split the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y ,random_state=1)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

### Data Preparation - Text Preprocessing

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [None]:
nltk.download('stopwords')
# Downloading wordnet before applying Lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
## initialise the inbuilt Stemmer
stemmer = PorterStemmer()

In [None]:
## We can also use Lemmatizer instead of Stemmer
lemmatizer = WordNetLemmatizer()

#### Define a function to remove redundant text and clean the text.

In [None]:
import string
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def preprocess(raw_text, flag):
    """Clean one review and return it as a one-element pandas Series.

    Steps: replace every non-letter with a space, remove the 'READ MORE'
    scraping artefact, lower-case, split on whitespace, drop English stop
    words, then stem or lemmatize each remaining token.

    Parameters
    ----------
    raw_text : str
        The raw review text.
    flag : str
        'stem' applies the module-level `stemmer`; any other value applies
        the module-level `lemmatizer`.

    Returns
    -------
    pandas.Series
        A Series holding a single string: the cleaned tokens joined by spaces.
        Returning a Series makes `Series.apply` produce a one-column DataFrame.
    """
    # Removing special characters and digits (only ASCII letters survive, so no
    # separate punctuation/digit filter is needed afterwards)
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)

    # This text contains a lot of READ MORE. So replace this values
    sentence = sentence.replace("READ MORE", " ")

    # change sentence to lower case and tokenize into words
    tokens = sentence.lower().split()

    # remove stop words; the stop-word set is loaded once instead of once per token
    stop_words = _english_stopwords()
    clean_tokens = [token for token in tokens if token not in stop_words]

    # Stemming/Lemmatization
    if flag == 'stem':
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]

    return pd.Series([" ".join(clean_tokens)])

In [None]:
from tqdm import tqdm, tqdm_notebook

In [None]:
# Register the `progress_apply` / `progress_map` methods on pandas objects with `tqdm`

tqdm.pandas()

In [None]:
# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`

temp_df = X_train['Review text'].progress_apply(lambda x: preprocess(x, 'lemma'))
temp_df.head()


In [None]:
temp_df.columns = ['clean_review']

temp_df.head()

In [None]:
# Attach the cleaned text as a column (aligned on the index). Unlike pd.concat, assign
# overwrites an existing 'clean_review' column, so re-running this cell does not create
# duplicate columns.
X_train = X_train.assign(clean_review=temp_df['clean_review'])

X_train.head()

In [None]:
## View some text of clean review text
# Positional access: after train_test_split the index label 1 is not guaranteed to be
# part of the training split, so a label lookup could raise KeyError.
X_train['clean_review'].iloc[0]

In [None]:
### Let's see the Wordcloud for the clean review
# Concatenate the cleaned training reviews into one corpus string.
clean_reviews_text = ' '.join(X_train['clean_review'])
wc = WordCloud(width=1600, height=800, background_color='black').generate(clean_reviews_text)

# Render the cloud as an image without axes.
plt.figure(1, figsize=(30, 20))
plt.imshow(wc)
plt.axis('off')
plt.show()

### Converting Text to Numerical vectors - BOW Representation
* **Step 1** - Learn the vocabulary from the train data.
* **Step 2** - Transform the train['review text'] to X_train (i.e. DTM).
* **Step 3** - Transform the test['review text'] to X_test (i.e. DTM).

In [None]:
X_train.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vocab = CountVectorizer()

X_train_bow = vocab.fit_transform(X_train['clean_review'])

In [None]:
X_train_bow

In [None]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_train_bow))

print("Shape of input data:", X_train_bow.shape)

In [None]:
# Preview only the first rows: converting the whole sparse document-term matrix to a
# dense array just for printing can use a lot of memory.
print(X_train_bow[:5].toarray())

### Preprocessing the Test Data

In [None]:
X_test.head()

In [None]:
temp_df = X_test['Review text'].progress_apply(lambda x: preprocess(x, 'lemma'))

temp_df.head()

In [None]:
temp_df.columns = ['clean_review']

temp_df.head()

In [None]:
# Attach the cleaned text as a column (aligned on the index). Unlike pd.concat, assign
# overwrites an existing 'clean_review' column, so re-running this cell does not create
# duplicate columns.
X_test = X_test.assign(clean_review=temp_df['clean_review'])

X_test.head()

In [None]:
X_test_bow = vocab.transform(X_test['clean_review'])

### Create and train the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix,classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Baseline: logistic regression with default settings on the bag-of-words features.
lr = LogisticRegression()
lr.fit(X_train_bow, y_train)

# Predict on the test document-term matrix built with the vocabulary learned from the train data.
y_test_pred = lr.predict(X_test_bow)

# Calculate accuracy
print(accuracy_score(y_test, y_test_pred))

# Per-class precision / recall / F1
print(classification_report(y_test, y_test_pred))


### Converting Text to Numerical vectors - Word2Vec Representation

**Step 1** - Import `Word2Vec` module from `gensim.models`  
**Step 2** - Convert the sentences to the List of Words (i.e. List of Tokens)  
**Step 3** - Use Word2Vec to learn a numerical vector for each unique word. Word2Vec takes the lists of tokens and generates a 300-dimensional numerical vector for each unique word.  
**Step 4** - Convert the word vectors to document vectors.

In [None]:
! pip install --upgrade gensim

In [None]:
import gensim

print(gensim.__version__)

In [None]:
from gensim.models import Word2Vec

In [None]:
## tokenized the sentences into words
X_train['tokenised_sentences'] = X_train['clean_review'].apply(lambda sent : sent.split())

X_train.head()

In [None]:
# train model
# seed + a single worker thread make the learned vectors repeatable between runs
# (full determinism additionally requires a fixed PYTHONHASHSEED, per the gensim docs).
model = Word2Vec(list(X_train.tokenised_sentences), vector_size=300, min_count=1, seed=1, workers=1)

In [None]:
print(model)

In [None]:
# Shape of the learned embedding matrix: (number of words, vector size)

print(model.wv[model.wv.index_to_key].shape)

In [None]:
def document_vector(doc, keyed_vectors):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Parameters
    ----------
    doc : iterable of str
        The tokens of the document.
    keyed_vectors : gensim KeyedVectors-like object
        Must provide `key_to_index`, `vector_size` and item lookup by a list of keys.

    Returns
    -------
    numpy.ndarray
        The mean vector of the known tokens, or a zero vector of length
        `vector_size` when no token of the document is in the vocabulary.
    """
    # key_to_index is a dict, so each membership test is O(1); scanning the
    # index_to_key list is O(vocabulary size) per token and very slow for large
    # pretrained vocabularies.
    known_keys = keyed_vectors.key_to_index
    vocab_tokens = [word for word in doc if word in known_keys]

    if not vocab_tokens:
        # If there are no tokens in the vocabulary, return a zero vector
        return np.zeros(keyed_vectors.vector_size)

    return np.mean(keyed_vectors[vocab_tokens], axis=0)

In [None]:
X_train['doc_vector'] = X_train.tokenised_sentences.progress_apply(lambda x : document_vector(x, model.wv))

In [None]:
X_train.head()

In [None]:
X_train_w2v = list(X_train.doc_vector)

### Preprocessing the Test Data

In [None]:
X_test.head()

In [None]:
X_test['tokenised_sentences'] = X_test['clean_review'].apply(lambda sent: sent.split())
X_test.head()

In [None]:
X_test['doc_vector'] = X_test.tokenised_sentences.progress_apply(lambda x : document_vector(x, model.wv))

In [None]:
X_test.head()

In [None]:
X_test_w2v = list(X_test.doc_vector)

### Decision Tree Classifier

In [None]:
# Fixed seed: scikit-learn permutes the features at each split, so without random_state
# the fitted tree (and the reported scores) can change between runs.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train_w2v, y_train)

y_test_pred = dt.predict(X_test_w2v)

# Calculate accuracy
print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))


### Pretrained GloVe for Sentence Vector

In [None]:
import gensim
import gensim.downloader as api

print(gensim.__version__)

print(list(gensim.downloader.info()['models'].keys()))

In [None]:
wv = api.load('glove-twitter-50')

In [None]:
# Total number of vocabulary words in pretrained model

len(wv.index_to_key)

In [None]:
X_train.head()

In [None]:
X_train['doc_vector_pretrained_glove'] = X_train.tokenised_sentences.progress_apply(lambda x : document_vector(x, wv))

In [None]:
X_train.head()

In [None]:
X_train_glove_pretrained = list(X_train.doc_vector_pretrained_glove)

In [None]:
X_test.head()

In [None]:
X_test['doc_vector_pretrained_glove'] = X_test.tokenised_sentences.progress_apply(lambda x : document_vector(x, wv))

In [None]:
X_test_glove_pretrained = list(X_test.doc_vector_pretrained_glove)

### Support Vector Classifier

In [None]:
svm = SVC()
svm.fit(X_train_glove_pretrained, y_train)

y_test_pred = svm.predict(X_test_glove_pretrained)

# Caculate Accuracy
print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))


## Pretrained BERT for Sentence Vectors

In [None]:
! pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
X_train['doc_vector_pretrained_bert'] = X_train.clean_review.progress_apply(model.encode)

X_train.head()

In [None]:
X_train_bert_pretrained = list(X_train.doc_vector_pretrained_bert)

In [None]:
X_test['doc_vector_pretrained_bert'] = X_test.clean_review.progress_apply(model.encode)

In [None]:
X_test_bert_pretrained = list(X_test.doc_vector_pretrained_bert)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

classifier = LogisticRegression()
classifier.fit(X_train_bert_pretrained, y_train)

y_test_pred = classifier.predict(X_test_bert_pretrained)

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fixed seed: scikit-learn permutes the features at each split, so without random_state
# the fitted tree (and the reported scores) can change between runs.
classifier = DecisionTreeClassifier(random_state=1)
classifier.fit(X_train_bert_pretrained, y_train)

y_test_pred = classifier.predict(X_test_bert_pretrained)

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

**Here, we can see that different algorithms give different results, and the model is also overfitting. Therefore, we can create a pipeline in which we try different models with hyperparameter tuning.**

### Let's extract the clean reviews for both x_train and x_test

In [None]:
X_train_clean = X_train['clean_review']
X_test_clean = X_test['clean_review']

In [None]:
## Define a class in which BertTransformer and GloveVectorizer are initialized.
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import KeyedVectors

class BertTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that converts texts into sentence embeddings.

    Parameters
    ----------
    model_name : str, default='all-MiniLM-L6-v2'
        Name passed to `SentenceTransformer` to load the encoder.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        # NOTE(review): the encoder is loaded at construction time, so a later
        # set_params(model_name=...) does not reload it.
        self.model = SentenceTransformer(self.model_name)

    def fit(self, X, y=None):
        """Nothing to learn for a pretrained encoder; return self."""
        return self

    def transform(self, X):
        """Encode every text in X and return one embedding row per text.

        All texts are passed to `encode` in a single call so the library can batch
        them, instead of invoking the encoder once per text.
        """
        return np.asarray(self.model.encode(list(X)))


class GloVeVectorizer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that averages pretrained word vectors per document.

    Parameters
    ----------
    model : KeyedVectors-like object, default=None
        Pretrained embeddings supporting `word in model`, `model[list_of_words]`
        and `model.vector_size`.
    """

    def __init__(self, model=None):
        self.model = model

    def fit(self, X, y=None):
        """No fitting necessary for pre-trained embeddings; return self."""
        return self

    def transform(self, X):
        """Return a 2-D array with one averaged word vector per document in X."""
        return np.vstack([self.document_vector(doc) for doc in tqdm(X)])

    def document_vector(self, doc):
        """Remove out-of-vocabulary words. Create document vectors by averaging word vectors.

        `doc` may be a whitespace-separated string or an iterable of tokens.
        """
        # The pipelines feed plain strings (the cleaned reviews). Iterating over a
        # string yields single characters, so split it into words first; token
        # lists are used unchanged.
        tokens = doc.split() if isinstance(doc, str) else doc

        # Filter out-of-vocabulary words
        vocab_tokens = [word for word in tokens if word in self.model]

        if not vocab_tokens:
            # If there are no tokens in the vocabulary, return a zero vector
            return np.zeros(self.model.vector_size)

        # Compute the mean vector of the tokens
        return np.mean(self.model[vocab_tokens], axis=0)


## Creating an Optimal Workflow

Pipeline allows you to sequentially apply a list of transformers to preprocess the data and, if desired, conclude the sequence with a final predictor for predictive modeling.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib
from joblib import Memory

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

## Caching to optimize the runtime
Caching is a technique used in software engineering to improve performance by storing frequently accessed or computed data in a temporary storage location, known as a cache. The purpose of caching is to reduce the time and resources required to access or compute the same data repeatedly.

We can create the **Memory** object to cache the intermediate results of `vectorization`.

This will avoid redundant computations of vectorization during the grid search, thereby optimizing the runtime. Adjust the cachedir parameter as needed to specify the location for caching the results.


As a software engineer, besides caching, you can employ several techniques to improve the time complexity of your code. 
Implement **memoization** to store the results of expensive function calls and reuse them when the same inputs occur again. This is particularly useful for dynamic programming problems.

### RandomForest Classifier Implementation with hyper-parameters

In [None]:
# Define a memory object to cache intermediate results
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

# Extend the pipelines dictionary
pipelines = {
    'random_forest': Pipeline([
        ('vectorization', TfidfVectorizer()),
        # Fixed seed so the search result (and the model saved below) is repeatable
        ('classifier', RandomForestClassifier(random_state=1))
    ], memory=memory),
}

# Define parameter grid for each algorithm.
# The whole 'vectorization' step is swapped between TF-IDF and averaged GloVe vectors.
param_grids = [
    {
        'vectorization': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
        'classifier__n_estimators': [50, 100, 200, 500],
        'classifier__max_depth': [5, 7, 10]
    }
]

clf = GridSearchCV(
    estimator=pipelines['random_forest'],
    param_grid=param_grids,
    scoring='f1_weighted',
    cv=5,
    return_train_score=True,
    verbose=1,
)

# Fit on the cleaned training reviews
clf.fit(X_train_clean, y_train)

print("Best estimator found on train set")
print(clf.best_estimator_)
print()
# best_score_ is the mean cross-validated score of the best candidate,
# not a score computed on the full training data.
print('Best CV Score (f1_weighted): ', clf.best_score_)
print('Score on Test Data: ', clf.score(X_test_clean, y_test))

#### Save the trained model in the best_models folder

In [None]:
# Serialization
best_model = clf.best_estimator_
# joblib.dump does not create missing directories, so make sure the target folder exists
os.makedirs('best_models', exist_ok=True)
joblib.dump(best_model, 'best_models/demo_model_rfc_hpy.pkl')

### Check the prediction

In [None]:
model = joblib.load('best_models/demo_model_rfc_hpy.pkl')

In [None]:
new_data = [
    """Inflation in the 20-nation euro zone eased to 2.6% in February,
    flash figures showed on Friday, but both the headline and core
    figures were higher than expected.
    Economists polled by Reuters had forecast a headline reading of 2.5%.
    Core inflation, stripping out volatile components of energy, food,
    alcohol and tobacco, was 3.1% — above the 2.9% expected.
    The European Union statistics agency said food, alcohol and tobacco
    had the highest inflation rate in February at 4%, followed by services
    at 3.9%."""]

# The pipeline was fitted on cleaned text (X_train_clean), so new text must go through
# the same preprocessing before prediction. preprocess returns a one-element Series.
new_data_clean = [preprocess(text, 'lemma').iloc[0] for text in new_data]

prediction = model.predict(new_data_clean)

print("Prediction:", prediction)

### KNN Implementation with hyper-parameters

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN

# Define a memory object to cache intermediate results
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

# Extend the pipelines dictionary
pipelines = {
    'knn': Pipeline([
        ('vectorization', TfidfVectorizer()),
        ('classifier', KNN())
    ], memory=memory),
}

# Define parameter grid for each algorithm.
# The whole 'vectorization' step is swapped between TF-IDF and averaged GloVe vectors.
# NOTE(review): p=3 (general Minkowski distance) may not be supported for the sparse
# TF-IDF matrix; failed fits would only show up as NaN scores in cv_results_ — confirm.
param_grids = [
    {
        'vectorization': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
        'classifier__n_neighbors': list(range(3, 21, 2)),
        'classifier__p': [1, 2, 3]
    }
]

clf = GridSearchCV(
    estimator=pipelines['knn'],
    param_grid=param_grids,
    scoring='f1_weighted',
    cv=5,
    return_train_score=True,
    verbose=1,
)

# Fit on the cleaned training reviews
clf.fit(X_train_clean, y_train)

print("Best estimator found on train set")
print(clf.best_estimator_)
print()
# best_score_ is the mean cross-validated score of the best candidate,
# not a score computed on the full training data.
print('Best CV Score (f1_weighted): ', clf.best_score_)
print('Score on Test Data: ', clf.score(X_test_clean, y_test))

In [None]:
# Serialization
best_model = clf.best_estimator_
joblib.dump(best_model,'best_models/demo_model_knn_hpy.pkl')

## Putting it all together: Implementing various Algorithms to find the Best Model

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Define a memory object to cache intermediate results
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization',TfidfVectorize() ),
        ('classifier', MultinomialNB())
    ], memory=memory),
    'decision_tree': Pipeline([
        ('vectorization', TfidfVectorize()),
        ('classifier', DecisionTreeClassifier())
    ], memory=memory),
    'logistic_regression': Pipeline([
        ('vectorization', TfidfVectorize()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    'random_forest':Pipeline([
        ('vectorization',TfidfVectorize()),
        ('classifier',RandomForestClassifier())
    ],memory=memory),
    'svc': Pipeline([
        ('vectorization', TfidfVectorize()),
        ('classifier', SVC())
    ],memory = memory),
    'knn': Pipeline([
        ('vectorization', TfidfVectorize()),
        ('classifier', KNN)
    ],memory = memory)

}
# Define parameter grid for each algorithm
param_grids = {
    'naive_bayes': [
        {
            'vectorization': [TfidfVectorize(),GloVeVectorizer(model=wv)],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__alpha' : [1, 10]
        }
    ],
    'decision_tree': [
        {
            'vectorization': [TfidfVectorizer(),GloVeVectorizer(model=wv)],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'logistic_regression': [
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l2']
        },
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l1'],
            'classifier__solver': ['liblinear']
        },
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['elasticnet'],
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga']
        }
    ],
        'random_forest': [
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__n_estimators': [50, 100, 200]
        }
    ],
    'svc': [
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__kernel' : ['rbf'],
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        },
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__kernel' : ['poly'],
            'classifier__degree' : [2, 3, 4, 5],
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        },
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__kernel' : ['linear'],
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }
    ],
    'knn': [
        {
            'scaler': [TfidfVectorizer(), GloVeVectorizer(model=wv)],
            'classifier__n_neighbors' : [i for i in range(3, 21, 2)],
            'classifier__p' : [1, 2, 3]
        }
    ]
}

# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo],
                               param_grid=param_grids[algo],
                               cv=5,
                               scoring='f1_weighted',
                               return_train_score=True,
                               verbose=1
                              )

    %time grid_search.fit(X_train_clean, y_train)

    best_models[algo] = grid_search.best_estimator_

    print('Score on Test Data: ', grid_search.score(X_test_clean, y_test))

In [None]:
# Use a dedicated loop variable: a top-level loop variable named `model` would
# overwrite the global `model` used by other cells.
for name, fitted_pipeline in best_models.items():
    print(f"{name}")
    print(f"{fitted_pipeline}")
    print()

In [None]:
## saved all the trained model in best_models folder
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    
    joblib.dump(model, f'best_models/{name}.pkl')
    model = joblib.load(f'best_models/{name}.pkl')
    
    %time y_test_pred = model.predict(X_test_clean)
    print("Test Score (F1)", metrics.f1_score(y_test, y_test_pred))
    
    print("Model Size:", os.path.getsize(f'best_models/{name}.pkl'), "Bytes")

## Deep Learning architecture in which we use a bidirectional LSTM model

In [None]:
# Encode the sentences using BERT
# The global name `model` is rebound several times earlier in the notebook (Word2Vec,
# SentenceTransformer, joblib-loaded pipelines, loop variables), so load the sentence
# encoder explicitly instead of relying on whatever `model` currently refers to.
bert_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Passing the full list lets the encoder batch the texts; the result has one row per text
X_train_bert_embeddings = np.asarray(bert_encoder.encode(list(X_train_clean)))
X_test_bert_embeddings = np.asarray(bert_encoder.encode(list(X_test_clean)))

In [None]:
X_train_bert_embeddings

In [None]:
# Deep Learning
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Stop training when the validation accuracy has not improved for 5 epochs and roll
# back to the best weights. The validation metric is monitored (training uses
# validation_split); training accuracy keeps rising while a model overfits.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=5,
    mode="max",
    verbose=2,
    restore_best_weights=True
)

In [None]:
# Reshape the BERT embeddings to (samples, 1, features) to match the 3-D input expected
# by the Bidirectional LSTM layer. The ndim guard keeps the cell idempotent: re-running
# it does not keep adding axes.
if X_train_bert_embeddings.ndim == 2:
    X_train_bert_embeddings = X_train_bert_embeddings[:, np.newaxis, :]
if X_test_bert_embeddings.ndim == 2:
    X_test_bert_embeddings = X_test_bert_embeddings[:, np.newaxis, :]


In [None]:
# The model is compiled with sparse_categorical_crossentropy, which requires every
# integer label to be a valid output index (label < number of output units). Size the
# softmax layer from the labels instead of hard-coding 3.
# Assumes y_train holds non-negative integer ratings — TODO confirm.
num_classes = int(y_train.max()) + 1

lstm_model = Sequential([
    tf.keras.layers.Bidirectional(LSTM(units=128, return_sequences=True),  input_shape=(X_train_bert_embeddings.shape[1], X_train_bert_embeddings.shape[2])),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Bidirectional(LSTM(units=128, return_sequences=True)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Bidirectional(LSTM(units=128)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

In [None]:
lstm_model.summary()

In [None]:
tf.keras.utils.plot_model(lstm_model, show_shapes=True)

In [None]:
lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    run_eagerly=True
)

In [None]:
# Train with a 20% validation split. The early-stopping callback defined above is passed
# in here; without it training always runs for the full 100 epochs.
history = lstm_model.fit(
    X_train_bert_embeddings,
    y_train,
    epochs=100,
    batch_size=50,
    validation_split=0.2,
    callbacks=[stop_early],
)

In [None]:
# Save the model in HDF5 format (make sure the target folder exists first)
os.makedirs("best_models", exist_ok=True)
lstm_model.save("best_models/lstm_model.h5")

# Load the model
loaded_model = tf.keras.models.load_model("best_models/lstm_model.h5")
