In [None]:
import torch
import torch.nn as nn

In [None]:
# Upper bound on the number of passes over the training data
# (the training loop may stop earlier).
EPOCHS = 10

# Length passed as `maxlen` when padding the tokenised reviews.
MAX_LEN = 128

# Mini-batch sizes used by the training and validation data loaders.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8

In [None]:
class AmazonDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each review row with its target value.

    Args:
        reviews: 2-D array-like indexed as ``reviews[i, :]`` (one row per
            review, e.g. a padded NumPy matrix of token ids).
        targets: 1-D array-like indexed as ``targets[i]``.
    """

    def __init__(self, reviews, targets):
        self.reviews = reviews
        self.target = targets

    def __len__(self):
        """Return the number of review rows."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors.

        Returns:
            ``{"review": LongTensor, "target": FloatTensor}``.
        """
        token_ids = torch.tensor(self.reviews[item, :], dtype=torch.long)
        label = torch.tensor(self.target[item], dtype=torch.float)
        return {"review": token_ids, "target": label}

In [None]:
class LSTM(nn.Module):
    """Bidirectional LSTM classifier on top of frozen pre-trained embeddings.

    The sequence of LSTM outputs is summarised by mean- and max-pooling over
    the sequence dimension; the two summaries are concatenated and projected
    to a single raw score (no sigmoid is applied).

    Args:
        embedding_matrix: 2-D array of shape ``(num_words, embed_dim)`` holding
            one pre-trained vector per vocabulary index.
        hidden_size: Hidden size of each LSTM direction. Defaults to 128,
            which gives the original 512-wide classifier input.
    """

    def __init__(self, embedding_matrix, hidden_size=128):
        super().__init__()

        embedding_weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        embed_dim = embedding_weights.shape[1]

        # Build the layer directly from the pre-trained vectors instead of
        # allocating a randomly initialised table that is then thrown away.
        # freeze=True keeps the vectors fixed during training.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_weights,
            freeze=True,
        )

        self.lstm = nn.LSTM(
            embed_dim,
            hidden_size,
            bidirectional=True,
            batch_first=True,
        )

        # 2 directions x 2 pooled summaries (mean + max) per hidden unit.
        self.out = nn.Linear(hidden_size * 4, 1)

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of vocabulary indices, batch dimension first
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor with one raw score per batch row, shape ``(batch, 1)``.
        """
        embedded = self.embedding(x)
        hidden_states, _ = self.lstm(embedded)

        # Pool over dimension 1 (the sequence axis for batch-first input).
        avg_pool = torch.mean(hidden_states, 1)
        max_pool, _ = torch.max(hidden_states, 1)

        pooled = torch.cat((avg_pool, max_pool), 1)
        return self.out(pooled)


In [None]:
def train(data_loader, model, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: Iterable of batches, each a dict with ``"review"`` and
            ``"target"`` tensors.
        model: Module mapping a batch of reviews to one raw score per row.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        None. The model is left in training mode with updated parameters.
    """
    criterion = nn.BCEWithLogitsLoss()
    model.train()

    for batch in data_loader:
        reviews = batch["review"].to(device, dtype=torch.long)
        targets = batch["target"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(reviews)
        # Targets are reshaped to a column so they line up with the
        # (batch, 1) model output.
        loss = criterion(logits, targets.view(-1, 1))
        loss.backward()
        optimizer.step()

def evaluate(data_loader, model, device):
    """Collect model outputs and targets for every batch in ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    The predictions are the model's raw outputs; no sigmoid or threshold is
    applied here.

    Args:
        data_loader: Iterable of batches, each a dict with ``"review"`` and
            ``"target"`` tensors.
        model: Module mapping a batch of reviews to per-row outputs.
        device: Device the reviews are moved to before the forward pass.

    Returns:
        ``(final_predictions, final_targets)``: two Python lists holding, in
        loader order, one model-output row per sample (itself a list) and one
        target value per sample.
    """
    final_predictions = []
    final_targets = []

    model.eval()
    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"].to(device, dtype=torch.long)
            predictions = model(reviews)
            final_predictions.extend(predictions.cpu().numpy().tolist())

            # Targets are only collected for scoring, so they are read
            # straight from the batch without a round trip to the device.
            final_targets.extend(data["target"].cpu().numpy().tolist())

    return final_predictions, final_targets


# main.py

In [None]:
import io
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
def load_vectors(fname):
    """Load word vectors from a text file in ``.vec`` format.

    The first line must be a header of two integers (parsed only to validate
    and skip it). Every following line is ``<word> <v1> <v2> ...`` separated
    by single spaces.

    Args:
        fname: Path to the vector file.

    Returns:
        dict mapping each word to its vector as a list of floats.

    Raises:
        ValueError: If the header is not two integers or a component is not
            a valid float.
    """
    data = {}
    # The context manager closes the handle even if parsing fails part-way.
    with io.open(
        fname,
        'r',
        encoding='utf-8',
        newline='\n',
        errors='ignore'
    ) as fin:
        _num_vectors, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [None]:
def create_embedding_matrix(word_index, embedding_dict, embedding_dim=300):
    """Build a matrix of pre-trained vectors aligned with ``word_index``.

    Args:
        word_index: dict mapping each word to its integer row index.
        embedding_dict: dict mapping words to their vectors.
        embedding_dim: Length of every vector. Defaults to 300, the value the
            function previously hard-coded.

    Returns:
        NumPy array of shape ``(len(word_index) + 1, embedding_dim)``. Row
        ``i`` holds the vector of the word whose index is ``i``; rows for
        words missing from ``embedding_dict`` (and any index no word maps to)
        stay all-zero.
    """
    # One extra row so that indices up to len(word_index) are addressable.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]

    return embedding_matrix

In [None]:
# Load the reviews and hold out 30% for validation, stratified on the label.
df = pd.read_csv('../input/amazon-dataset-csv-generator/PolynomialInternshipDrive2022.csv')
df_train, df_valid = train_test_split(df, test_size=0.3, random_state=42, stratify=df.overall.values)
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
# NOTE(review): train() uses BCEWithLogitsLoss, which expects targets in
# [0, 1] -- confirm that the `overall` column is already binary in this CSV.

# Fit the vocabulary on every review, then turn each split into id sequences.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df.review.values.tolist())

xtrain = tokenizer.texts_to_sequences(df_train.review.values)
xtest = tokenizer.texts_to_sequences(df_valid.review.values)

# Pad the sequences so every row can be stacked into one matrix.
xtrain = tf.keras.preprocessing.sequence.pad_sequences(
    xtrain, maxlen=MAX_LEN
)

xtest = tf.keras.preprocessing.sequence.pad_sequences(
    xtest, maxlen=MAX_LEN
)

train_dataset = AmazonDataset(
    reviews=xtrain,
    targets=df_train.overall.values
)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=0
)
valid_dataset = AmazonDataset(
    reviews=xtest,
    targets=df_valid.overall.values
)
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=0
)
print("Loading embeddings")

embedding_dict = load_vectors("../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec")
embedding_matrix = create_embedding_matrix(
    tokenizer.word_index, embedding_dict
)
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs end to end.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM(embedding_matrix)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
print("Training Model")

best_accuracy = 0
early_stopping_counter = 0

for epoch in range(EPOCHS):

    train(train_data_loader, model, optimizer, device)

    outputs, targets = evaluate(
        valid_data_loader, model, device
    )
    # evaluate() returns raw logits (the model has no sigmoid and is trained
    # with BCEWithLogitsLoss). sigmoid(z) >= 0.5 is equivalent to z >= 0, so
    # threshold the logits at zero; ravel() turns the (N, 1) rows into (N,).
    outputs = np.array(outputs).ravel() >= 0.0
    accuracy = metrics.accuracy_score(targets, outputs)
    print(
        f"Epoch: {epoch}, Accuracy Score = {accuracy}"
    )
    # simple early stopping: stop once more than two epochs in total have
    # failed to beat the best accuracy seen so far
    if accuracy > best_accuracy:
        best_accuracy = accuracy
    else:
        early_stopping_counter += 1
    if early_stopping_counter > 2:
        break