In [None]:
import torch.nn as nn
from transformers import RobertaModel
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
import pytorch_lightning as pl
from transformers import RobertaTokenizerFast
from torch.utils.data import DataLoader
import random
import transformers
import os, sys, re, uuid, time, warnings, pandas as pd, numpy as np, sklearn, nltk, logging, functools, time
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer, PorterStemmer
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from string import punctuation

import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)

In [None]:
class TextPreprocessor:
    """
    Namespace of stateless text-cleaning helpers for tweets.

    All helpers are static methods, so they can be called either on the class
    (``TextPreprocessor.clean_text(s)``) or on an instance.
    """

    # Contraction patterns, applied in order. The specific forms ("won't",
    # "can't") must come before the generic "n't" rule, otherwise they would be
    # expanded to "wo not" / "ca not".
    _CONTRACTIONS = (
        (re.compile(r"won't", re.IGNORECASE), "will not"),
        (re.compile(r"can't", re.IGNORECASE), "can not"),
        (re.compile(r"n't", re.IGNORECASE), " not"),
        (re.compile(r"'re", re.IGNORECASE), " are"),
        (re.compile(r"'s", re.IGNORECASE), " is"),
        (re.compile(r"'d", re.IGNORECASE), " would"),
        (re.compile(r"'ll", re.IGNORECASE), " will"),
        (re.compile(r"'t", re.IGNORECASE), " not"),
        (re.compile(r"'ve", re.IGNORECASE), " have"),
        (re.compile(r"'m", re.IGNORECASE), " am"),
    )

    # Lazily built stop-word set shared by every remove_stopwords() call.
    _stopword_cache = None

    def __init__(self):
        pass

    @staticmethod
    def decontract(text):
        """
        Expands English contractions ("can't" -> "can not", "it's" -> "it is").

        Must run BEFORE punctuation is stripped: the rules key on the
        apostrophe, so they cannot match once it has been removed.
        """
        for pattern, replacement in TextPreprocessor._CONTRACTIONS:
            text = pattern.sub(replacement, text)
        return text

    @staticmethod
    def clean_text(text):
        """
        Removes links, @mentions, #hashtags, a range of emoji code points and
        all ASCII punctuation, then lower-cases and trims the text.
        """
        # Remove links
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)

        # Remove mentions and hashtags
        text = re.sub(r'@\S+', '', text)
        text = re.sub(r'#\S+', '', text)

        # Remove emojis (only the U+1F600..U+1F650 range is covered)
        text = re.sub(r'[\U0001f600-\U0001f650]', '', text)

        # Remove punctuation and convert to lowercase
        text = ''.join([c for c in text if c not in punctuation])
        return text.lower().strip()

    @staticmethod
    def text_cleaning2(text: str) -> str:
        """
        Cleans the text: drops mentions and URLs, then collapses every run of
        non-alphanumeric characters into one space and lower-cases the result.
        """
        # Two passes on purpose: in a single alternation the catch-all
        # non-alphanumeric branch matches " @" first and leaves the user name
        # (and URL fragments) behind.
        without_refs = re.sub(r"@\S+|https?:\S+|http?:\S", " ", text)
        return re.sub(r"[^A-Za-z0-9]+", " ", without_refs).strip().lower()

    @staticmethod
    def text_cleaning(text: str) -> str:
        """
        Cleans the text: removes links, "[link]" placeholders, user mentions,
        punctuation, digits and hashtags, then lower-cases the result.
        """
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'bit\.ly/\S+', '', text)
        # str.strip('[link]') would strip the *characters* [, l, i, n, k, ]
        # from both ends (turning "kill" into ""); remove the literal token.
        text = text.replace('[link]', '')
        # remove users
        text = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
        text = re.sub(r'(@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
        # remove punctuation
        my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@â'
        text = re.sub('[' + my_punctuation + ']+', ' ', text)
        # remove number
        text = re.sub(r'([0-9]+)', '', text)
        # remove hashtag
        text = re.sub(r'(#[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
        return text.lower()

    @staticmethod
    def _custom_stopwords():
        """Returns the English stop words minus the negations, built once."""
        if TextPreprocessor._stopword_cache is None:
            TextPreprocessor._stopword_cache = (
                frozenset(stopwords.words('english')) - {'not', 'no', 'never'}
            )
        return TextPreprocessor._stopword_cache

    @staticmethod
    def remove_stopwords(text: str) -> str:
        """
        Removes the stopwords from the text.

        The negations "not", "no" and "never" are kept.
        """
        custom_stopwords = TextPreprocessor._custom_stopwords()
        return ' '.join([word for word in text.split() if word not in custom_stopwords])

    @staticmethod
    def stemming(text: str) -> str:
        """
        Stems the text with the Porter stemmer after NLTK word tokenization.
        """
        tokenized = nltk.word_tokenize(text)
        stemmer = PorterStemmer()
        return ' '.join([stemmer.stem(word) for word in tokenized])

    @staticmethod
    def lemmatization(text: str) -> str:
        """
        Lemmatizes every whitespace-separated word, treating it as a verb.
        Parameters:
            text: str
        """
        lemmatizer = WordNetLemmatizer()
        return ' '.join([lemmatizer.lemmatize(word, 'v') for word in text.split()])

    @staticmethod
    def preprocess_text(text: str, stem=True) -> str:
        """
        Preprocesses the text.
        Parameters:
            text: the text to preprocess.
            stem: if True, stems the text.
            else, lemmatizes the text.
        """
        # Expand contractions first: clean_text() removes the apostrophes that
        # decontract() relies on, so the reverse order makes it a no-op.
        expanded = TextPreprocessor.decontract(text)
        cleaned = TextPreprocessor.clean_text(expanded)
        filtered = TextPreprocessor.remove_stopwords(cleaned)
        if stem:
            return TextPreprocessor.stemming(filtered)
        return TextPreprocessor.lemmatization(filtered)

In [None]:
class Config:
    """Single home for the notebook's tunable constants."""

    # ---- data ----
    # CSV read by the dataset class.
    PATH = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'
    # Share of samples held out for validation; train_ratio is its complement.
    test_ratio = 0.1
    train_ratio = 0.9
    num_workers = 8

    # ---- reproducibility ----
    SEED = 42

    # ---- optimisation ----
    BATCH_SIZE = 64
    LR = 1e-5

    # ---- backbone / tokenisation ----
    # Every encoded sample is padded or truncated to MAX_LEN tokens.
    MAX_LEN = 64
    roberta_model = "roberta-base"
    # Loaded once, when the class body executes, and shared by all users.
    tokenizer = RobertaTokenizerFast.from_pretrained(roberta_model)

def log_execution(func):
    """
    Decorator that emits an INFO log record before and after every call to
    ``func`` and passes the wrapped function's return value through.
    """
    @functools.wraps(func)
    def logged_call(*call_args, **call_kwargs):
        logging.info(f"Executing {func.__name__}")
        outcome = func(*call_args, **call_kwargs)
        logging.info(f"Finished executing {func.__name__}")
        return outcome

    return logged_call

def timing_decorator(func):
    """
    Decorator that prints how long each call to ``func`` took.

    The wrapped function's return value is passed through unchanged. Metadata
    (``__name__``, ``__doc__``) is preserved with ``functools.wraps`` so the
    decorated function still identifies itself correctly, matching
    ``log_execution``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Function {func.__name__} took {elapsed} seconds to run.")
        return result
    return wrapper

def seed_everything(seed=Config.SEED):
    """
    Seeds every random number generator the notebook touches: Python's hash
    seed and ``random`` module, NumPy, TensorFlow and PyTorch (CPU and CUDA).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)

In [None]:
class Sentiment140Dataset(Dataset):
    """
    Torch dataset over the Sentiment140 CSV at ``Config.PATH``.

    The CSV is read eagerly in ``__init__``; the text preprocessing is deferred
    until the first item is requested. Each item is a dict with the cleaned
    text, its token ids and attention mask (padded/truncated to
    ``Config.MAX_LEN``) and a float label (0.0 or 1.0).
    """

    def __init__(self):
        self.path = Config.PATH
        self.tokenizer = Config.tokenizer
        self.cleaned = False  # flipped by apply_cleaning() after the one-off clean
        self.load_data()

    @log_execution
    def load_data(self):
        """
        Loads the data.

        Reads the CSV, maps target value 4 to 1, drops rows with duplicated
        text and stores texts and targets as plain lists in ``self.X`` /
        ``self.y``.
        """
        self.data = pd.read_csv(
            self.path,
            header=None,
            names=['targets', 'ids', 'date', 'flag', 'user', 'text'],
            encoding='latin-1',
        )
        self.data.targets = self.data.targets.replace({4: 1})
        self.check_for_dups()
        self.X, self.y = self.data.text.tolist(), self.data.targets.tolist()

    @timing_decorator
    def deep_clean(self):
        """
        Runs ``TextPreprocessor.preprocess_text`` over every text in ``self.X``.

        Author's earlier measurements on the full dataset: about 370 s, with no
        meaningful difference between list, Series and NumPy containers.
        """
        self.X = list(map(TextPreprocessor.preprocess_text, self.X))

    def apply_cleaning(self):
        """Cleans the texts once; later calls are no-ops."""
        if not self.cleaned:  # check if data has been cleaned
            self.deep_clean()
            print("Done cleaning data")
            self.cleaned = True

    def find_max_len(self):
        """Stores and prints the character length of the longest raw text."""
        self.max_len = self.data['text'].str.len().max()
        print("Maximum Length: ", self.max_len)

    def check_targets(self):
        """Prints how many rows carry each target value."""
        print("Target value counts:", self.data.targets.value_counts())

    def check_for_dups(self):
        """Drops rows whose text duplicates an earlier row, keeping the first."""
        if self.data.text.duplicated().any():
            # Reassign instead of mutating in place so the frame that was read
            # is never modified behind another reference.
            self.data = self.data.drop_duplicates(subset='text')

    def __len__(self):
        # Items are served from self.X, so its length is the one that counts.
        return len(self.X)

    def __getitem__(self, i):
        """
        Returns sample ``i`` as a dict with keys ``text``, ``input_ids``,
        ``attention_mask`` and ``labels``.
        """
        self.apply_cleaning()
        X, y = self.X[i], self.y[i]
        # `padding="max_length"` + `truncation=True` replace the deprecated
        # `pad_to_max_length=True`; calling the tokenizer directly is the
        # current API and is equivalent to `encode_plus` for a single text.
        encoding = self.tokenizer(
            X,
            add_special_tokens=True,
            max_length=Config.MAX_LEN,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack samples itself.
        input_ids = encoding["input_ids"][0]
        attention_mask = encoding["attention_mask"][0]
        # Float labels, as required by BCEWithLogitsLoss.
        labels = torch.tensor(y, dtype=torch.float)
        return {'text': X,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
                }

In [None]:
class RoBERTaModel(pl.LightningModule):
    """
    Binary sentiment classifier: a pretrained RoBERTa encoder followed by a
    single-logit linear head, trained with ``BCEWithLogitsLoss``.

    The module owns its data: the train/validation loaders are built in
    ``__init__`` from ``Sentiment140Dataset``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.prepare_loaders()
        self.roberta = RobertaModel.from_pretrained(Config.roberta_model)
        # NOTE(review): this layer is constructed but not applied in forward().
        self.dropout = nn.Dropout(p=0.2)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 1)
        self.loss_fn = nn.BCEWithLogitsLoss()

    def prepare_loaders(self):
        """
        Builds the dataset and splits it into train and validation loaders.

        Indices are shuffled with a fixed seed; the first
        ``Config.test_ratio`` share becomes the validation subset.
        """
        ds = Sentiment140Dataset()
        dataset_size = len(ds)
        indices = list(range(dataset_size))
        split = int(np.floor(Config.test_ratio * dataset_size))
        # Seed right before shuffling so the split is reproducible.
        seed_everything()
        np.random.shuffle(indices)
        train_indices, test_indices = indices[split:], indices[:split]

        # create samplers for train and test sets
        train_sampler = SubsetRandomSampler(train_indices)
        test_sampler = SubsetRandomSampler(test_indices)

        # create data loaders for train and test sets
        self.train_loader = DataLoader(ds, batch_size=Config.BATCH_SIZE, sampler=train_sampler)
        self.val_loader = DataLoader(ds, batch_size=Config.BATCH_SIZE, sampler=test_sampler)

    def train_dataloader(self):
        """Returns the training loader built in prepare_loaders()."""
        return self.train_loader

    def val_dataloader(self):
        """Returns the validation loader built in prepare_loaders()."""
        return self.val_loader

    def forward(self, input_ids, attention_mask) -> torch.Tensor:
        """
        Encodes a batch and returns one raw logit per sample (no sigmoid).
        """
        output = self.roberta(input_ids=input_ids,
                              attention_mask=attention_mask)
        pooled_output = output.pooler_output
        return self.classifier(pooled_output)

    def accuracy(self, preds, labels):
        """
        Computes accuracy for binary classification task.

        ``preds`` are raw logits; they are passed through a sigmoid and rounded
        to 0/1 before being compared with ``labels``.
        """
        rounded_preds = torch.round(torch.sigmoid(preds))
        acc = (rounded_preds == labels).float().mean()
        return acc

    def training_step(self, batch, batch_idx):
        """Computes and logs loss and accuracy for one training batch."""
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        outputs = self(input_ids, attention_mask)
        loss = self.loss_fn(outputs.view(-1), labels.view(-1))
        acc = self.accuracy(outputs.view(-1), labels.view(-1))
        self.log("train_loss", loss, prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)
        return {"loss": loss,
                "acc": acc}

    def validation_step(self, batch, batch_idx):
        """Computes and logs loss and accuracy for one validation batch."""
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        outputs = self(input_ids, attention_mask)
        loss = self.loss_fn(outputs.view(-1), labels.view(-1))
        acc = self.accuracy(outputs.view(-1), labels.view(-1))
        self.log("valid_loss", loss)
        self.log('valid_acc', acc, prog_bar=True)
        return {"loss": loss,
                "acc": acc}

    def configure_optimizers(self):
        """
        Returns AdamW with weight decay on every parameter except biases and
        LayerNorm parameters.
        """
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        # transformers.AdamW is deprecated (and removed from recent releases);
        # torch.optim.AdamW is the documented replacement.
        return torch.optim.AdamW(optimizer_parameters, lr=Config.LR)

    def predict(self, text):
        """
        Classifies a single raw text.

        Parameters:
            text: the raw input string.
        Returns:
            0 or 1, the rounded sigmoid of the model's logit.
        """
        # Mirror the training pipeline: the dataset feeds preprocessed text.
        cleaned = TextPreprocessor.preprocess_text(text)
        encoded_text = Config.tokenizer(
            cleaned,
            add_special_tokens=True,
            max_length=Config.MAX_LEN,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Keep the batch axis added by return_tensors="pt" (the encoder expects
        # batched input) and move the tensors to wherever the model lives.
        input_ids = encoded_text['input_ids'].to(self.device)
        attention_mask = encoded_text['attention_mask'].to(self.device)

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logit = self(input_ids, attention_mask)
        finally:
            if was_training:
                self.train()
        # forward() returns a plain tensor holding a single logit, so the class
        # comes from thresholding the sigmoid, as in accuracy(); an argmax over
        # one logit would always be 0.
        return int(torch.round(torch.sigmoid(logit)).item())

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Both callbacks must watch a metric that is actually logged: validation_step
# logs "valid_loss" (there is no "val_loss").
early_stop_callback = EarlyStopping(
   monitor='valid_loss',
   min_delta=0.00,
   patience=2,
   verbose=False,
   mode='min'
)

checkpoint_callback = ModelCheckpoint(
    monitor='valid_loss',
    dirpath='checkpoints',
    # NOTE(review): the `{val_loss}` placeholder does not match the logged
    # metric name ("valid_loss"). The template is left as-is because changing
    # it would rename the checkpoint files written to disk.
    filename='model-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,
    mode='min',
)

torch.set_float32_matmul_precision('medium')

model = RoBERTaModel()
trainer = pl.Trainer(
    # Fall back to the CPU instead of failing when no GPU is visible.
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    max_epochs=1,
    callbacks=[checkpoint_callback, early_stop_callback],
)
trainer.fit(model)

# Reload the best checkpoint and export its weights as a plain state dict.
# load_from_checkpoint is a classmethod; recent Lightning releases refuse to
# run it on an instance, so call it on the class.
best_model_path = checkpoint_callback.best_model_path
best_model = RoBERTaModel.load_from_checkpoint(best_model_path)
torch.save(best_model.state_dict(), 'best_model.pt')

In [None]:
# Load the best checkpoint recorded by the checkpoint callback rather than a
# hard-coded file name, and call the classmethod on the class (recent Lightning
# releases reject calling it on an instance).
pred_model = RoBERTaModel.load_from_checkpoint(checkpoint_callback.best_model_path)

def replace_sentiment(pred):
    """
    Translates a numeric class into its label: 0 -> "Negative",
    1 -> "Positive", anything else -> "Unknown".
    """
    for class_id, label in ((0, "Negative"), (1, "Positive")):
        if pred == class_id:
            return label
    return "Unknown"

def predict_single_sample(model, text):
    """
    Classifies one raw text with ``model`` and returns its sentiment label.

    Parameters:
        model: a module that maps (input_ids, attention_mask) to one logit
            per sample.
        text: the raw input string.
    Returns:
        "Negative" or "Positive" (via ``replace_sentiment``).
    """
    # Mirror the training pipeline: the dataset feeds preprocessed text.
    cleaned = TextPreprocessor.preprocess_text(text)
    encoded_text = Config.tokenizer(
        cleaned,
        add_special_tokens=True,
        max_length=Config.MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Send the inputs to wherever the model's weights live instead of assuming
    # a CUDA device is present.
    target_device = next(model.parameters()).device
    input_ids = encoded_text["input_ids"].to(target_device)
    attention_mask = encoded_text["attention_mask"].to(target_device)

    model.eval()
    with torch.no_grad():
        logit = model(input_ids, attention_mask)
    # The model emits a single logit, so the class is the rounded sigmoid;
    # argmax over one value is always 0 and would always yield "Negative".
    pred = int(torch.round(torch.sigmoid(logit)).item())
    return replace_sentiment(pred)

# Smoke test: classify one hand-written sentence with the restored model.
sentiment = predict_single_sample(pred_model, "I dont hate you")
print(sentiment)