In [37]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

In [2]:
df = pd.read_csv('/content/Train.csv')
pd.set_option('display.max_columns', None)

In [3]:
df

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.000000
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.000000
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.000000
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.000000
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.000000
...,...,...,...,...
9996,IU0TIJDI,Living in a time where the sperm I used to was...,1.0,1.000000
9997,WKKPCJY6,<user> <user> In spite of all measles outbrea...,1.0,0.666667
9998,ST3A265H,Interesting trends in child immunization in Ok...,0.0,1.000000
9999,6Z27IJGD,CDC Says Measles Are At Highest Levels In Deca...,0.0,1.000000


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [8]:
df.loc[df['label'].isna(), 'label'] = 0

In [19]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
def preprocess_text(text):
    pattern = r'http\S+|www.\S+|[@#]\w+|[^\w\s]|_|\d+|[^\x00-\x7F]'
    text = re.sub(pattern, '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    return text

df['clean_text'] = df['safe_text'].apply(preprocess_text)

In [20]:
df

Unnamed: 0,tweet_id,safe_text,label,agreement,clean_text
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.000000,amp big homie meanboy stegman st url
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.000000,im thinking devoting career proving autism isn...
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.000000,vaccines vaccinate child
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.000000,mean immunize kid wont secretly kill years lin...
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.000000,thanks user catch performing la nuit nyc st av...
...,...,...,...,...,...
9996,IU0TIJDI,Living in a time where the sperm I used to was...,1.0,1.000000,living time sperm used waste jenny mccarthy do...
9997,WKKPCJY6,<user> <user> In spite of all measles outbrea...,1.0,0.666667,user user spite measles outbreaks judge mi thr...
9998,ST3A265H,Interesting trends in child immunization in Ok...,0.0,1.000000,interesting trends child immunization oklahoma...
9999,6Z27IJGD,CDC Says Measles Are At Highest Levels In Deca...,0.0,1.000000,cdc says measles highest levels decades url re...


In [23]:
texts = df['clean_text'].values
labels = df['label'].values

In [25]:
labels

array([ 0.,  1., -1., ...,  0.,  0.,  1.])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [27]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [28]:
def tokenize_data(texts, labels, tokenizer, max_length=128):

  input_ids = []
  attention_masks = []

  for text in texts:
      encoded = tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=max_length,
          padding='max_length',
          truncation=True,
          return_attention_mask=True,
          return_tensors='pt'
      )

      input_ids.append(encoded['input_ids'])
      attention_masks.append(encoded['attention_mask'])

  # Convert lists to tensors
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)

  return input_ids, attention_masks, labels

In [29]:
train_input_ids, train_attention_masks, train_labels = tokenize_data(X_train, y_train, tokenizer)
test_input_ids, test_attention_masks, test_labels = tokenize_data(X_test, y_test, tokenizer)

In [31]:
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

In [32]:
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)

In [33]:
# Create DataLoader for test set
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)

In [36]:
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
from transformers import BertModel
class BertForRegression(torch.nn.Module):
  def __init__(self):
      super(BertForRegression, self).__init__()
      self.bert = BertModel.from_pretrained('bert-base-uncased')
      self.regressor = torch.nn.Linear(self.bert.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
      outputs = self.bert(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids)
      pooled_output = outputs.pooler_output
      regression_output = self.regressor(pooled_output)
      return regression_output

model = BertForRegression()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForRegression(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [40]:
# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Use Mean Squared Error Loss for regression
loss_fn = torch.nn.MSELoss()



In [43]:
# Training loop
epochs = 3

for epoch in range(epochs):
  print(f"Epoch {epoch + 1}/{epochs}")

  # Set model to training mode
  model.train()

  total_loss = 0
  for step, batch in enumerate(train_dataloader):
      # Move batch to GPU
      batch_input_ids, batch_attention_mask, batch_labels = tuple(b.to(device) for b in batch)

      # Zero gradients
      model.zero_grad()

      # Perform forward pass
      outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
      loss = loss_fn(outputs.squeeze(), batch_labels.float())  # Use float labels for regression
      total_loss += loss.item()

      # Perform backward pass
      loss.backward()

      # Clip the norm of the gradients to 1.0
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update parameters and take a step using the computed gradient
      optimizer.step()

  avg_train_loss = total_loss / len(train_dataloader)
  print(f"Average training loss: {avg_train_loss:.2f}")

# Evaluation loop
model.eval()


Epoch 1/3
Average training loss: 0.32
Epoch 2/3
Average training loss: 0.23
Epoch 3/3
Average training loss: 0.16


TypeError: iteration over a 0-d array

In [66]:

# df_testing.info()
df_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5177 entries, 0 to 5176
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   5177 non-null   object
 1   safe_text  5177 non-null   object
dtypes: object(2)
memory usage: 81.0+ KB


In [73]:
# Test dataset prediction (optional, replace with your own test dataset)
def tokenize_data2(texts, tokenizer, max_length=128):
  input_ids = []
  attention_masks = []

  for text in texts:
      encoded_dict = tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=max_length,
          padding='max_length',
          truncation=True,
          return_attention_mask=True,
          return_tensors='pt'
      )
      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])

  # Convert lists to tensors
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)

  return input_ids, attention_masks

df_testing = pd.read_csv('/content/Test.csv')
df_testing.loc[df_testing['safe_text'].isna(), 'safe_text'] = 'pula mea'
df_testing['clean_text'] = df_testing['safe_text'].apply(preprocess_text)
test_texts = df_testing['clean_text'].values
test_ids, test_masks = tokenize_data2(test_texts ,tokenizer)

test_data = TensorDataset(test_ids, test_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)

submission_predictions = []

with torch.no_grad():
  for batch in test_dataloader:
      batch_input_ids, batch_attention_mask = tuple(b.to(device) for b in batch)

      # Perform forward pass
      outputs = model(batch_input_ids, attention_mask=batch_attention_mask)

      # Get the predicted continuous scores
      batch_predictions = outputs.squeeze().cpu().numpy()

      # Append the predictions
      submission_predictions.extend(batch_predictions)

# Create a DataFrame with the ids and predicted labels
submission_df = pd.DataFrame({'id': df_testing['tweet_id'], 'label': submission_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('submission.csv', index=False)

print("Submission file saved as 'submission.csv'.")

Submission file saved as 'submission.csv'.


In [74]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>