In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

In [2]:
import pandas as pd
dataframe = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None)

In [3]:
dataframe = dataframe.drop(columns=[2, 3, 4])
dataframe

Unnamed: 0,0,1,5
0,0,1467810672,is upset that he can't update his Facebook by ...
1,0,1467810917,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,my whole body feels itchy and like its on fire
3,0,1467811193,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,@Kwesidei not the whole crew
...,...,...,...
1599994,4,2193601966,Just woke up. Having no school is the best fee...
1599995,4,2193601969,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
dataframe.columns = ["sentiment", "id", "text"]

In [5]:
dataframe = dataframe.sample(n=10000)
dataframe

Unnamed: 0,sentiment,id,text
388956,0,2054349109,Open laps of race car driving school - I am 3r...
1162062,4,1979559733,@DwightHoward Represent the East! Beat the Lak...
1511620,4,2175119636,@_anoushka_ But that's the whole point of Twit...
311957,0,2001414232,i miss those moments when all i ever thought o...
1155487,4,1979015630,@davetran :O where did he get his drivers li...
...,...,...,...
1283220,4,2001917395,@evliving OMG...you are so funny...I've said ...
47693,0,1677694405,"@Tygatyga uh, i think you should have been pla..."
1337788,4,2017818673,Morning pumpkin doodles hope everyone has a s...
1569888,4,2188487053,"@imogenheap I have to say, tonight, I am lovin..."


In [6]:
dataframe["sentiment"].value_counts()

4    5015
0    4985
Name: sentiment, dtype: int64

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
dataframe = dataframe.to_numpy()

In [9]:
dataframe[1]

array([4, 1979559733,
       '@DwightHoward Represent the East! Beat the Lakers!!! '],
      dtype=object)

In [10]:
sentiment = np.array([x[0] for x in dataframe])
sentiment

array([0, 4, 4, ..., 4, 4, 4])

In [11]:
label_map = {0: 0, 4: 1}
labels = [label_map[label] for label in sentiment]
labels = torch.tensor(labels, dtype=torch.long)
labels

tensor([0, 1, 1,  ..., 1, 1, 1])

In [None]:
embedding = [tokenizer(x[2], return_tensors='pt', padding='max_length', max_length=128) for x in dataframe]
embedding

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embedding, labels, test_size=0.2, random_state=42)

In [14]:
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
input_ids_train = torch.stack([item['input_ids'][0] for item in X_train])
attention_mask_train  = torch.stack([item['attention_mask'][0] for item in X_train])
token_type_ids_train  = torch.stack([item['token_type_ids'][0] for item in X_train])

In [16]:
input_ids_test = torch.stack([item['input_ids'][0] for item in X_test])
attention_mask_test  = torch.stack([item['attention_mask'][0] for item in X_test])
token_type_ids_test  = torch.stack([item['token_type_ids'][0] for item in X_test])

training bert for baseline accuracy

In [17]:
from torch.utils.data import DataLoader, TensorDataset

In [18]:
dataset = TensorDataset(input_ids_train, attention_mask_train, y_train)
train_dataloader = DataLoader(dataset, batch_size=100)

In [19]:
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=.001)
loss_fn = torch.nn.CrossEntropyLoss()


In [20]:
num_epochs = 5

bert_model.train()

for epoch in range(num_epochs):
    total_loss = 0
    
    for input_ids, attention_mask, label in train_dataloader:

        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/5, Loss: 0.7298
Epoch 2/5, Loss: 0.7082
Epoch 3/5, Loss: 0.7088
Epoch 4/5, Loss: 0.7085
Epoch 5/5, Loss: 0.7059


In [21]:
test_dataset = TensorDataset(input_ids_test, attention_mask_test, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=100)

In [43]:
from sklearn.metrics import f1_score

In [44]:
bert_model.eval()
total_eval_accuracy = 0
predictions = []
true_labels = []

with torch.no_grad():
    for input_ids, attention_mask, label in test_dataloader:
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        preds = torch.argmax(logits, dim=1)
        accuracy = (preds == label).float().mean()
        
        total_eval_accuracy += accuracy.item()
        predictions.extend(preds.numpy())
        true_labels.extend(label.numpy())


avg_accuracy = total_eval_accuracy / len(test_dataloader)
print(f"Average Accuracy: {avg_accuracy:.4f}")

f1 = f1_score(true_labels, predictions, average='weighted')
print(f"F1 Score: {f1:.4f}")

Average Accuracy: 0.5075
F1 Score: 0.3417


bert with cnn

In [23]:
import torch.nn as nn
import torch.nn.functional as F

In [24]:
class CNN(nn.Module):
    def __init__(self, embedding_dim):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(embedding_dim, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(256, 512, kernel_size=3, padding=1)
        
        self.flatten = nn.Flatten()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.fc1 = nn.Linear(8192, 512)
        self.fc2 = nn.Linear(512, 2)
        
        self.dropout = nn.Dropout(0.5)


    def forward(self, x):
        x = self.conv1(x)
        x = nn.ReLU()(x)
        x = self.pool(x)
        
        x = self.conv2(x)
        x = nn.ReLU()(x)
        x = self.pool(x)
        
        x = self.conv3(x)
        x = nn.ReLU()(x)
        x = self.pool(x)
        
        x = self.flatten(x)
        
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.dropout(x)
        
        x = self.fc2(x)
        
        return x



cnn_model = CNN(768)

In [25]:
num_epochs = 5

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001)
cnn_model.train()

for epoch in range(num_epochs):
    total_loss = 0
    
    for input_ids, attention_mask, label in train_dataloader:

        with torch.no_grad():
            bert_outputs = bert_model.bert(input_ids=input_ids, attention_mask=attention_mask)
            bert_embeddings = bert_outputs.last_hidden_state
            bert_embeddings = bert_embeddings.permute(0, 2, 1)

        optimizer.zero_grad()

        outputs = cnn_model(bert_embeddings)
        loss = criterion(outputs, label)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print("epoch done")
    
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

epoch done
Epoch 1/5, Loss: 0.6955
epoch done
Epoch 2/5, Loss: 0.6932
epoch done
Epoch 3/5, Loss: 0.6932
epoch done
Epoch 4/5, Loss: 0.6932
epoch done
Epoch 5/5, Loss: 0.6932


In [45]:
total_eval_accuracy = 0
cnn_model.eval()
predictions = []
true_labels = []


with torch.no_grad():
    for input_ids, attention_mask, label in test_dataloader:
        bert_outputs = bert_model.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state
        bert_embeddings = bert_embeddings.permute(0, 2, 1)
        cnn_logits = cnn_model(bert_embeddings)

        preds = torch.argmax(cnn_logits, dim=1)
        
        accuracy = (preds == label).float().mean()        
        total_eval_accuracy += accuracy.item()

        predictions.extend(preds.numpy())
        true_labels.extend(label.numpy())

avg_accuracy = total_eval_accuracy / len(test_dataloader)
print(f"Average Accuracy: {avg_accuracy:.4f}")

f1 = f1_score(true_labels, predictions, average='weighted')
print(f"F1 Score: {f1:.4f}")

Average Accuracy: 0.4925
F1 Score: 0.3250


bert with fc

In [36]:
class FCN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(FCN, self).__init__()
        
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size // 2, 2)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)
        
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        
        return x

fcn_model = FCN(98304, 1000)

In [39]:
num_epochs = 5

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001)
cnn_model.train()

for epoch in range(num_epochs):
    total_loss = 0
    
    for input_ids, attention_mask, label in train_dataloader:

        with torch.no_grad():
            bert_outputs = bert_model.bert(input_ids=input_ids, attention_mask=attention_mask)
            bert_embeddings = bert_outputs.last_hidden_state
            bert_embeddings = bert_embeddings.permute(0, 2, 1).reshape(bert_embeddings.size(0), -1)

        optimizer.zero_grad()

        outputs = fcn_model(bert_embeddings)
        loss = criterion(outputs, label)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print("epoch done")
    
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

epoch done
Epoch 1/5, Loss: 0.6932
epoch done
Epoch 2/5, Loss: 0.6932
epoch done
Epoch 3/5, Loss: 0.6932
epoch done
Epoch 4/5, Loss: 0.6932
epoch done
Epoch 5/5, Loss: 0.6932


In [42]:
total_eval_accuracy = 0
fcn_model.eval()
predictions = []
true_labels = []


with torch.no_grad():
    for input_ids, attention_mask, label in test_dataloader:
        bert_outputs = bert_model.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state
        
        bert_embeddings = bert_embeddings.view(bert_embeddings.size(0), -1)
        
        fcn_logits = fcn_model(bert_embeddings)

        preds = torch.argmax(fcn_logits, dim=1)
        
        accuracy = (preds == label).float().mean()        
        total_eval_accuracy += accuracy.item()

        predictions.extend(preds.numpy())
        true_labels.extend(label.numpy())


avg_accuracy = total_eval_accuracy / len(test_dataloader)
print(f"Average Accuracy: {avg_accuracy:.4f}")

f1 = f1_score(true_labels, predictions, average='weighted')
print(f"F1 Score: {f1:.4f}")

Average Accuracy: 0.5075
F1 Score: 0.3417
