In [1]:
import re
import pandas as pd

In [2]:
df_train = pd.read_excel('./NLP test data for assignment 0324 (2).xlsb', sheet_name = 'training', engine = 'pyxlsb')
df_validation = pd.read_excel('./NLP test data for assignment 0324 (2).xlsb', sheet_name = 'validation', engine = 'pyxlsb')

In [3]:
def clean_data(text):
    text = re.sub(r'https\S+','',text)
    text = re.sub(r'@\w+','',text)
    text = re.sub(r'#\w+','',text)
    text = re.sub(r'\n','',text)
    text = re.sub(r'[^A-Za-z0-9]','',text)
    return text.strip()

df_train['CleanTweet'] = df_train['OriginalTweet'].apply(clean_data)
df_validation['CleanTweet'] = df_validation.iloc[:,-1].apply(clean_data)

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df_train['SentimentEncoded'] = label_encoder.fit_transform(df_train['Sentiment'])

df_train.head(n = 2), df_validation.head(n = 2)

(   UserName  ScreenName     Location TweetAt  \
 0         1       44953          NYC   43864   
 1         2       44954  Seattle, WA   43864   
 
                                        OriginalTweet           Sentiment  \
 0  TRENDING: New Yorkers encounter empty supermar...  Extremely Negative   
 1  When I couldn't find hand sanitizer at Fred Me...            Positive   
 
                                           CleanTweet  SentimentEncoded  
 0  TRENDINGNewYorkersencounteremptysupermarketshe...                 0  
 1  WhenIcouldntfindhandsanitizeratFredMeyerIturne...                 4  ,
    UserName  ScreenName         Location     TweetAt  \
 0      1470       46422  Ontario, Canada  13-03-2020   
 1      1471       46423  Los Angeles, CA  13-03-2020   
 
                                           Unnamed: 4  \
 0  Hey idiots... \n\n\n\n\n\nWhen demand is great...   
 1  Which is your favorite Mad Max? The original, ...   
 
                                           CleanT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

##Initializing a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TweetDataset(Dataset):
    def __init__(self,tweets,labels,tokenizer,max_length=128):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.tweets)
    def __getitem__(self,idx):
        tweet = str(self.tweets[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            tweet,
            max_length = self.max_length,
            truncation = True,
            padding = 'max_length',
            return_token_type_ids = False,
            return_attention_mask = True,
            return_tensors = 'pt',
            add_special_tokens = True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label,dtype=torch.long)
        }

train_dataset = TweetDataset(
    tweets=df_train['CleanTweet'].values,
    labels=df_train['SentimentEncoded'].values,
    tokenizer=tokenizer
)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
    

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train the model
trainer.train()