<a href="https://colab.research.google.com/github/fgs2/f20aa-2024/blob/main/cw2/transformers/BERT_lemmatized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# This is so I don't have to keep uploading on Colab.
import requests
from requests.auth import HTTPBasicAuth

def downloadFileFromRepo(username, repository, branch, filepath, token, timeout=30):
    """Download one file from a GitHub repository into the local ``data`` directory.

    Args:
        username: Repository owner; also used as the HTTP basic-auth user.
        repository: Repository name.
        branch: Branch (or ref) to read the file from.
        filepath: Path of the file inside the repository, using ``/`` separators.
        token: Access token sent as the HTTP basic-auth password.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The local path of the saved file on success, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # The local file is named after the last path segment of the remote file.
    fileName = filepath.split('/')[-1]
    if not fileName:
        # An empty path (or one ending in '/') names no file; bail out before
        # any network traffic instead of later trying to open 'data/' itself.
        print("Failed to download file. No file name in the given path.")
        return None

    url = f"https://raw.githubusercontent.com/{username}/{repository}/{branch}/{filepath}"

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, auth=HTTPBasicAuth(username, token), timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    # exist_ok avoids the check-then-create race and is a no-op on re-runs.
    os.makedirs('data', exist_ok=True)
    localFilepath = os.path.join('data', fileName)

    with open(localFilepath, 'wb') as f:
        f.write(response.content)
    print(f"File '{fileName}' downloaded successfully.")
    return localFilepath

# Repository coordinates are left blank on purpose; fill them in before running.
username = ""
repository = ""
branch = ""
# Read the access token from the environment so it is never saved in the notebook.
repoToken = os.environ.get("GITHUB_TOKEN", "")

# Training split. The recorded cell output and the later read of
# 'data/train.csv' show this download is expected to produce train.csv.
# NOTE(review): the 'cw2/data/' prefix is taken from the test.csv path below - confirm.
path_to_file = "cw2/data/train.csv"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

# path_to_file = "cw2/data/trainStemmed.csv"
# downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

# Test split used for the Kaggle submission.
path_to_file = "cw2/data/test.csv"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

# path_to_file = "cw2/data/testStemmed.csv"
# downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

# path_to_file = "cw2/lemmaTokenizer.json"
# downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

# path_to_file = "cw2/stemTokenizer.json"
# downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

File 'train.csv' downloaded successfully.
File 'test.csv' downloaded successfully.


In [15]:
# Load the training split that the download helper saved under data/.
df = pd.read_csv('data/train.csv')

In [16]:
df.keys()

Index(['overall', 'Review'], dtype='object')

In [17]:
# 'Review' holds the input text and 'overall' the target rating; both are
# converted to plain Python lists so they can be indexed by position later.
texts = df['Review'].tolist()
labels = df['overall'].tolist()

In [18]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and size the head to ``num_classes`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head's input width follows the encoder's configured hidden size.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits) for a batch of encoded texts."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output, regularised by dropout.
        return self.fc(self.dropout(encoded.pooler_output))

In [19]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with a 0-based label."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Keep references to the raw texts/labels and the tokenizer settings."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the shifted label."""
        # str() guards against non-string entries in the text collection.
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            # The tokenizer output carries a leading batch axis; flatten drops it.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            # Stored labels are shifted down by one to give 0-based class ids.
            'label': torch.tensor(raw_label - 1),
        }

In [20]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The average cross-entropy loss over all batches as a float, or
        ``nan`` when ``data_loader`` yields no batches.
    """
    model.train()
    # Build the loss module once instead of on every batch.
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    num_batches = 0
    for i, batch in enumerate(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        if i % 100 == 0:
            print(f"Batch: {i}")
        loss.backward()
        optimizer.step()
        # The schedule is defined per optimisation step, so advance it every batch.
        scheduler.step()
        running_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        return float("nan")
    return running_loss / num_batches

In [21]:
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader``.

    Returns a tuple ``(accuracy, report)`` where ``accuracy`` comes from
    ``accuracy_score`` and ``report`` from ``classification_report``, both
    computed over every example the loader yields.
    """
    model.eval()
    predicted, expected = [], []
    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # The predicted class is the index of the largest logit in each row.
            predicted += logits.argmax(dim=1).cpu().tolist()
            expected += batch['label'].cpu().tolist()
    return accuracy_score(expected, predicted), classification_report(expected, predicted)

In [22]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Return the index of the highest-scoring class for a single ``text``.

    The model is put into eval mode, the text is tokenized with padding and
    truncation to ``max_length``, and the arg-max over the logits is
    returned as a plain Python int.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
    # .item() requires exactly one prediction, i.e. a single input text.
    return logits.argmax(dim=1).item()

In [23]:
# Set up parameters (everything a reader might want to tune lives here)
bert_model_name = 'bert-base-uncased'  # pretrained checkpoint used for both tokenizer and encoder
num_classes = 5  # width of the classifier's output layer
max_length = 128  # token length each review is padded/truncated to
batch_size = 128  # examples per batch in the train and validation loaders
num_epochs = 10  # full passes over the training split; also sizes the LR schedule
learning_rate = 2e-5  # initial learning rate handed to the optimizer

In [24]:
# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [25]:
# The tokenizer is loaded from the same checkpoint name as the encoder.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [26]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)
# Uncomment to resume from the state dict written by the 'BERTEpoch1.bin' save cell.
# model.load_state_dict(torch.load('BERTEpoch1.bin'))

In [27]:
# Use PyTorch's AdamW: the `AdamW` exported by `transformers` is deprecated.
# weight_decay=0.0 reproduces the default of the transformers implementation
# (torch's own default is 0.01), so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0)
# The scheduler is stepped once per batch, so the horizon is batches x epochs.
total_steps = len(train_dataloader) * num_epochs
# Linear decay of the learning rate over the whole run, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [28]:
# !pip install numba

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()

In [29]:
%%time
# !pip install notify
# from notify import notify
from IPython.display import clear_output

df_test = pd.read_csv('data/test.csv')

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)
    # One checkpoint per epoch.
    # NOTE(review): the path is at the filesystem root - confirm this is intended.
    torch.save(model.state_dict(), f"/bert{epoch}e.pt")

    # predict_sentiment handles one text at a time, so score the test set row
    # by row. str() mirrors the coercion done in TextClassificationDataset and
    # protects against non-string cells (e.g. missing reviews).
    class_indices = [
        predict_sentiment(str(review), model, tokenizer, device)
        for review in df_test['Review']
    ]
    # df_submission keeps the raw 0-based class indices because a later cell
    # of this notebook adds 1 to its 'overall' column in place.
    df_submission = pd.DataFrame({'id': df_test['id'], 'overall': class_indices})

    # The dataset subtracts 1 from every label before training, so the file
    # handed to Kaggle must shift predictions back to the original scale.
    kaggle_submission = df_submission.assign(overall=df_submission['overall'] + 1)
    print(kaggle_submission)
    kaggle_submission.to_csv(f"submitKaggle{epoch}E.csv", index=False)
    print("An epoch has completed! Check your results.")

Collecting notify
  Downloading notify-0.3.1.tar.gz (10 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


ModuleNotFoundError: No module named 'notify'

In [30]:
import base64

# predict_sentiment handles one text at a time, so score the test set row by
# row. str() mirrors the coercion done in TextClassificationDataset.
class_indices = [
    predict_sentiment(str(review), model, tokenizer, device)
    for review in df_test['Review']
]
# df_submission keeps the raw 0-based class indices because the next cell of
# this notebook adds 1 to its 'overall' column in place.
df_submission = pd.DataFrame({'id': df_test['id'], 'overall': class_indices})
print(df_submission)

# The dataset subtracts 1 from every label before training, so the file that
# is saved and uploaded shifts predictions back to the original scale.
submission_file = f"submitKaggle{epoch}E.csv"
df_submission.assign(overall=df_submission['overall'] + 1).to_csv(submission_file, index=False)
with open(submission_file, 'rb') as file:
    content = file.read()

# GitHub's "create or update file contents" endpoint lives under /contents/
# and expects the file body as base64 text.
url = f'https://api.github.com/repos/{username}/{repository}/contents/cw2/results/bertKaggle{epoch}E.csv'
headers = {
    'Authorization': f'token {repoToken}',
    'Content-Type': 'application/json'
}
payload = {
    'message': 'Upload file',
    'content': base64.b64encode(content).decode('ascii')
}
# A timeout keeps a stalled upload from hanging the notebook.
response = requests.put(url, headers=headers, json=payload, timeout=30)
print(response.status_code)

NameError: name 'df_test' is not defined

In [None]:
# Shift predictions up by one: the dataset class subtracts 1 from every label
# before training, so predicted class indices are one below the original scale.
# NOTE: this mutates the column in place and is not idempotent - running the
# cell twice adds 2.
df_submission['overall'] = df_submission['overall'] + 1
df_submission

In [None]:
# Write the submission frame to disk without the pandas index column.
df_submission.to_csv("submit1Epoch.csv", index = False)

In [None]:
# Save only the weights (state dict); this is the file name that the
# commented-out load_state_dict line next to the model construction refers to.
torch.save(model.state_dict(), "BERTEpoch1.bin")

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# torch.save needs a file path, not a directory. Use the same checkpoint file
# that the following cell loads from.
# NOTE(review): assumes Google Drive is already mounted at /content/drive - confirm.
checkpoint_path = "/content/drive/MyDrive/data/bert_classifier.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)


In [None]:
# Build the architecture first, then copy the saved weights into it. The
# previous order loaded the checkpoint and immediately replaced it with a
# freshly initialised model, discarding the trained weights.
# NOTE(review): assumes the file holds a state dict (as the save cells in this
# notebook write) rather than a whole pickled model - confirm.
model = BERTClassifier(bert_model_name, num_classes).to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/data/bert_classifier.pth", map_location=device))

In [None]:
# Smoke-test the model on one hand-written review. The printed value is the
# raw class index returned by predict_sentiment.
test_text = "The movie was great and I really enjoyed the performances of the actors."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted sentiment: {sentiment}")

In [None]:
# Read the test split from Google Drive instead of the local data/ directory
# used earlier in the notebook.
# NOTE(review): presumably requires Drive to be mounted at /content/drive - confirm.
df_test = pd.read_csv('/content/drive/MyDrive/data/test.csv')

In [None]:
df_test.head()

In [None]:
df_submission = pd.DataFrame()

In [None]:
# predict_sentiment handles exactly one text per call, so score the test set
# row by row. The former trailing call that passed the whole 'Review' column
# would have replaced these per-row results (or failed in the tokenizer).
# str() mirrors the coercion done in TextClassificationDataset.
df_submission = pd.DataFrame({
    'id': df_test['id'],
    'overall': [
        predict_sentiment(str(review), model, tokenizer, device)
        for review in df_test['Review']
    ],
})

In [None]:
for index, row in df_test.iterrows():
    # Predict the class index for this row's review text
    value = predict_sentiment(row['Review'], model, tokenizer, device)
    # Store it in df_submission under the same index label
    df_submission.at[index, 'overall'] = value

In [None]:
# predict_sentiment handles exactly one text per call; passing the whole
# 'Review' column cannot yield one prediction per row, so score each review.
# str() mirrors the coercion done in TextClassificationDataset.
df_submission['overall'] = [
    predict_sentiment(str(review), model, tokenizer, device)
    for review in df_test['Review']
]

In [None]:
# Release the Colab runtime once everything above has finished.
# NOTE(review): presumably this ends the session and drops unsaved local files - confirm.
from google.colab import runtime
runtime.unassign()