# BERT with TFIDF

In [None]:
!pip install transformers



In [None]:
import pandas as pd

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer

# requests is used in a later cell to download the vocabulary word list
import requests

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:

def tokenize(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``."""
    return nltk.word_tokenize(text)


In [None]:
# Location of the dataset CSV (raw file URL on GitHub)
file_path = 'https://raw.githubusercontent.com/vaamps/cyberbullying-detection/main/datasets/output_sentiment.csv'

# Read the CSV file into a DataFrame (pandas downloads the URL directly)
df = pd.read_csv(file_path)
# Text column that the TF-IDF vectorizer is fitted on in a later cell
text_column = df['Text']
# vocab = pd.read_csv('filtered_vocab.txt', header=None)[0].tolist()

In [None]:
# URL of the vocabulary (word list) file
vocab_file_path = 'https://raw.githubusercontent.com/vaamps/cyberbullying-detection/main/datasets/negative_words.txt'

# Download the word list. Fail loudly on an HTTP error so that a failed
# download cannot surface later as an undefined `vocab_words`.
response = requests.get(vocab_file_path, timeout=30)
response.raise_for_status()

# One vocabulary entry per whitespace-separated token; split() never yields
# empty or whitespace-padded tokens, so no extra stripping is required.
vocab_words = set(response.text.split())
print(f"Loaded {len(vocab_words)} vocabulary words")

# Lower-case the vocabulary and drop English stopwords. The stopword list is
# materialised once as a set instead of being re-read for every word.
english_stopwords = set(stopwords.words('english'))
filtered_vocabulary = {word.lower() for word in vocab_words} - english_stopwords

# Create a TfidfVectorizer restricted to the filtered vocabulary.
# NOTE: scikit-learn documents max_features, min_df and max_df as ignored when
# `vocabulary` is supplied, so those three arguments have no effect here.
#tfidfvectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.85, ngram_range=(1, 2))
tfidfvectorizer = TfidfVectorizer(vocabulary = filtered_vocabulary, max_features=511, min_df=5, max_df=0.85, ngram_range=(1, 2))

# Fit on the text column and compute the sparse TF-IDF matrix
tfidf_features = tfidfvectorizer.fit_transform(text_column)
feature_names = tfidfvectorizer.get_feature_names_out()

# Wrap the sparse matrix in a DataFrame with one column per vocabulary term
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_features, columns=feature_names)





In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Number of highest-scoring features to keep (adjust to your needs)
k = 511

# Target labels used to score each TF-IDF feature
y = df['oh_label']

# Score every feature with the chi-squared statistic, then keep the top k columns
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_kbest = chi2_selector.fit(tfidf_df, y).transform(tfidf_df)

# X_kbest now holds only the k selected columns of the original TF-IDF matrix

In [None]:
feature_names = chi2_selector.get_feature_names_out()

In [None]:
# Rebuild tfidf_df from the selected columns only.
# NOTE(review): this rebinds the name that the selector was fitted on, so
# re-running the selection cell afterwards would operate on the reduced frame.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(X_kbest, columns=feature_names)

In [None]:
tfidf_df.shape


In [None]:
tfidf_df

Unnamed: 0,182,187,86,abandon,abandoned,abandoning,abandonment,abandons,abducted,abduction,...,worsens,worst,worthless,wreck,wrong,wronged,wtf,wth,zealot,zealots
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df

Unnamed: 0,id,Text,Annotation,oh_label,source,Sentiment_Score
0,0,good idea im going to walk away from wp now be...,none,0.0,wikipedia,9.0
1,1,will this tart be the deconstructing of the sa...,sexism,1.0,twitter,0.0
2,2,hey there something tells me that my poor wi...,none,0.0,wikipedia,-2.0
3,3,making the majority into the minority thou...,none,0.0,wikipedia,2.0
4,4,the level of experience of an editor does no...,none,0.0,wikipedia,-3.0
...,...,...,...,...,...,...
99995,99995,the falling man weve got a lot of pictures ...,none,0.0,wikipedia,-3.0
99996,99996,rt no am not sexist but is diezani alison tha...,none,0.0,twitter,0.0
99997,99997,cant win hahahahahahahahahah did you thin...,toxicity,1.0,wikipedia,3.0
99998,99998,conflict of interest section i didnt see ...,none,0.0,wikipedia,-3.0


In [None]:

# Append the TF-IDF feature columns to the original DataFrame (column-wise concat).
# NOTE(review): `df` is rebound here, so running this cell a second time
# appends the feature columns again.
df = pd.concat([df, tfidf_df], axis=1)
df


Unnamed: 0,id,Text,Annotation,oh_label,source,Sentiment_Score,182,187,86,abandon,...,worsens,worst,worthless,wreck,wrong,wronged,wtf,wth,zealot,zealots
0,0,good idea im going to walk away from wp now be...,none,0.0,wikipedia,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,will this tart be the deconstructing of the sa...,sexism,1.0,twitter,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,hey there something tells me that my poor wi...,none,0.0,wikipedia,-2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,making the majority into the minority thou...,none,0.0,wikipedia,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,the level of experience of an editor does no...,none,0.0,wikipedia,-3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,the falling man weve got a lot of pictures ...,none,0.0,wikipedia,-3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,99996,rt no am not sexist but is diezani alison tha...,none,0.0,twitter,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,99997,cant win hahahahahahahahahah did you thin...,toxicity,1.0,wikipedia,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,99998,conflict of interest section i didnt see ...,none,0.0,wikipedia,-3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# df.to_csv('output_bow.csv')
# Keep every column from position 5 onward as the model's feature columns.
# NOTE(review): this is a positional slice — presumably it drops the id / text /
# label metadata columns; verify against the actual column order of df.
new_df = df.iloc[:, 5:]
new_df.head()


Unnamed: 0,Sentiment_Score,182,187,86,abandon,abandoned,abandoning,abandonment,abandons,abducted,...,worsens,worst,worthless,wreck,wrong,wronged,wtf,wth,zealot,zealots
0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
new_df

Unnamed: 0,Sentiment_Score,182,187,86,abandon,abandoned,abandoning,abandonment,abandons,abducted,...,worsens,worst,worthless,wreck,wrong,wronged,wtf,wth,zealot,zealots
0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# All imports for task 2
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix and target vector for the downstream model
X, y = new_df, df['oh_label']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# Sanity check: both lines should report the same number of rows
print(len(X))
print(len(y))


100000
100000


In [None]:
X.shape


(100000, 2433)

In [None]:
from transformers import BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score


In [None]:
# NOTE(review): X holds float feature values; the training loop later passes
# these to BERT as `input_ids`, which are looked up as integer token ids —
# TODO confirm the intended model input.
X_tensor = torch.tensor(X.values).float()
# Class labels must be an integer (long) tensor for the classification loss;
# `oh_label` appears as floats (0.0 / 1.0) in the loaded frame, so cast explicitly.
y_tensor = torch.tensor(y.values, dtype=torch.long)
dataset = TensorDataset(X_tensor, y_tensor)


In [None]:
# 80/20 train/validation split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same rows land in train/validation on every run
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)


In [None]:
batch_size = 32  # You can adjust this based on your GPU memory

# Training batches are reshuffled every epoch; the validation loader keeps the
# DataLoader default (no shuffling).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=batch_size)


In [None]:
# Pre-trained BERT encoder with a 2-way sequence-classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',  # Use any BERT variant
    num_labels=2,  # Number of output labels, e.g., 2 for binary classification
)

# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of it. eps and weight_decay are set explicitly to the
# defaults of the deprecated class so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)  # You can tune the learning rate


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Device that batches are moved to in the training loop.
# NOTE(review): no `model.to(device)` call is visible in this section, so
# changing this value alone would leave model and batches on different devices.
device = "cpu"

In [None]:
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F


# Fine-tuning loop for the TF-IDF section: trains `model` on `train_loader` for
# `epochs` passes and reports validation accuracy after every epoch.

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs]
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Training loop
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0
    model.train()

    for step, batch in enumerate(train_loader):
        # batch[0] is a slice of X_tensor, batch[1] the matching slice of y_tensor
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        model.zero_grad()

        # NOTE(review): b_input_ids is passed positionally as BERT's `input_ids`,
        # but it holds float feature values rather than integer token ids. The
        # recorded run of this cell stopped with a RuntimeError right after
        # "Training..." — this call is the likely cause; TODO confirm and rework
        # how the TF-IDF features are fed to the model.
        outputs = model(b_input_ids, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # The learning-rate schedule advances once per batch
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    print("")
    print("Running Validation...")

    model.eval()

    total_eval_accuracy = 0

    for batch in validation_loader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(b_input_ids)

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Arguments are given as (predictions, labels); plain accuracy is the
        # same in either order. Per-batch accuracies are summed here and
        # averaged over the number of batches below.
        total_eval_accuracy += accuracy_score(np.argmax(logits, axis=1).flatten(), label_ids.flatten())

    avg_val_accuracy = total_eval_accuracy / len(validation_loader)
    print("  Validation Accuracy: {0:.2f}".format(avg_val_accuracy))

print("")
print("Training complete!")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.



Training...


RuntimeError: ignored

# BERT on its own (BERTForSequenceClassification)

In [1]:
!pip install transformers



In [2]:
import pandas as pd

In [3]:
# Location of the dataset CSV (raw file URL on GitHub)
file_path = 'https://raw.githubusercontent.com/vaamps/cyberbullying-detection/main/datasets/output_sentiment.csv'

# Read the CSV file into a DataFrame (pandas downloads the URL directly)
df = pd.read_csv(file_path)
# Convenience handle on the text column of the frame
text_column = df['Text']
# vocab = pd.read_csv('filtered_vocab.txt', header=None)[0].tolist()

In [4]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# requests is used below to download the vocabulary word list
import requests

# URL of the vocabulary (word list) file
vocab_file_path = 'https://raw.githubusercontent.com/vaamps/cyberbullying-detection/main/datasets/negative_words.txt'

# Download the word list. Fail loudly on an HTTP error so that a failed
# download cannot surface later as an undefined `vocab_words`.
response = requests.get(vocab_file_path, timeout=30)
response.raise_for_status()

# One vocabulary entry per whitespace-separated token; split() never yields
# empty or whitespace-padded tokens, so no extra stripping is required.
vocab_words = set(response.text.split())
print(f"Loaded {len(vocab_words)} vocabulary words")

# Lower-case the vocabulary and drop English stopwords. The stopword list is
# materialised once as a set instead of being re-read for every word.
english_stopwords = set(stopwords.words('english'))
filtered_vocabulary = {word.lower() for word in vocab_words} - english_stopwords



In [6]:
import numpy as np

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import numpy as np
from sklearn.metrics import accuracy_score

In [8]:
print(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

cuda


In [9]:
df['oh_label'] = df['oh_label'].astype(int)

In [10]:


# Assuming df is your DataFrame and 'Text' and 'oh_label' are columns in it
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every text, truncating at 512 tokens and padding all sequences to a
# common length so they can be stacked into a single tensor
encodings = tokenizer(df['Text'].tolist(), truncation=True, padding=True, max_length=512)

# Prepare the dataset tensors: token ids, attention masks and class labels
inputs = torch.tensor(encodings['input_ids'])
masks = torch.tensor(encodings['attention_mask'])
labels = torch.tensor(df['oh_label'].values)

# Create the dataset
dataset = TensorDataset(inputs, masks, labels)

# Split the data into training and validation sets (80/20). The generator is
# seeded so the same rows land in each split on every run.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Create DataLoaders (only the training batches are shuffled)
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize the model and move it to the GPU when one is available
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and scheduler.
# PyTorch's own AdamW replaces the deprecated AdamW class from transformers;
# eps and weight_decay are set explicitly to the defaults of the deprecated
# class so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 4
# The linear schedule is stepped once per batch, so it spans batches x epochs steps
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Function to calculate accuracy
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max class equals ``labels``.

    ``preds`` is an array of per-class scores with one row per example;
    ``labels`` is an array of the true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return accuracy_score(labels.flatten(), predicted_classes)

# Running sums of the per-epoch accuracies, used for the overall averages below
total_train_accuracy = 0
total_val_accuracy = 0

# Training loop: one training pass and one validation pass per epoch
for epoch_i in range(epochs):
    # ---- Training ----
    model.train()
    train_correct = 0.0  # correctly classified training examples this epoch
    train_examples = 0   # training examples seen this epoch

    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)
        b_labels = b_labels.to(device)

        model.zero_grad()

        # Passing labels makes the model return the classification loss
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch by its size so that a short final batch does not
        # skew the epoch accuracy (a plain mean of per-batch accuracies would).
        train_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        train_examples += len(label_ids)

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch

    avg_epoch_train_accuracy = train_correct / train_examples
    total_train_accuracy += avg_epoch_train_accuracy
    print(f"Epoch {epoch_i + 1}")
    print(f"  Training Accuracy: {avg_epoch_train_accuracy}")

    # ---- Validation ----
    model.eval()
    val_correct = 0.0
    val_examples = 0
    for batch in validation_dataloader:
        b_input_ids, b_attention_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)

        # Only the logits are needed here, so no labels are passed and no
        # gradients are tracked.
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        val_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        val_examples += len(label_ids)

    avg_epoch_val_accuracy = val_correct / val_examples
    total_val_accuracy += avg_epoch_val_accuracy
    print(f"  Validation Accuracy: {avg_epoch_val_accuracy}")

# Calculate overall average accuracy across all epochs
overall_avg_train_accuracy = total_train_accuracy / epochs
overall_avg_val_accuracy = total_val_accuracy / epochs
print(f"Overall Average Training Accuracy: {overall_avg_train_accuracy}")
print(f"Overall Average Validation Accuracy: {overall_avg_val_accuracy}")

print("Training complete!")


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
  Training Accuracy: 0.9240625
  Validation Accuracy: 0.92915
Epoch 2
  Training Accuracy: 0.95375
  Validation Accuracy: 0.932
Epoch 3
  Training Accuracy: 0.9764
  Validation Accuracy: 0.93225
Epoch 4
  Training Accuracy: 0.9884875
  Validation Accuracy: 0.93415
Overall Average Training Accuracy: 0.960675
Overall Average Validation Accuracy: 0.9318875
Training complete!


In [11]:
df['oh_label'].unique()

array([0, 1])

In [12]:
torch.save(model.state_dict(), 'bert_sequence_classification_model.pth')


In [13]:
tokenizer.save_pretrained('bert_tokenizer')


('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [14]:
import shutil
# Zip the saved tokenizer directory. Relative paths match the directory written
# by tokenizer.save_pretrained('bert_tokenizer') instead of assuming that the
# working directory is /content.
shutil.make_archive('bert_tokenizer', 'zip', 'bert_tokenizer')

'/content/bert_tokenizer.zip'

Load Model back:

In [None]:
# Rebuild the same architecture, then restore the fine-tuned weights saved earlier
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run also loads on a CPU-only machine
model.load_state_dict(torch.load('bert_sequence_classification_model.pth', map_location=device))
model.to(device)  # Make sure to use the same device as before
# Using V100 GPU no High RAM

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert_tokenizer')
