Data Preprocessing and BERT Embeddings

In [1]:
import re
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

# Load dataset
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where this exact file exists — consider a configurable, relative data directory.
file_path = 'C:/Users/gangi/OneDrive/Desktop/DL/HateSpeechData.csv'
# Read the CSV into the DataFrame that the rest of this cell extends with new columns.
data = pd.read_csv(file_path)

# Preprocessing function to clean tweet text
def preprocess_tweet(tweet):
    """Return a lower-cased, letters-only version of a tweet.

    Steps, in order: strip URLs, strip @mentions and '#' signs (the hashtag
    word itself is kept), drop every character that is not an ASCII letter or
    whitespace, then lower-case and trim leading/trailing whitespace.

    Args:
        tweet: Raw tweet text. Missing values (``None`` or float NaN, which
            pandas uses for empty CSV cells) are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned tweet as a ``str`` (possibly empty). Interior runs of
        whitespace left behind by removed tokens are not collapsed.
    """
    # Missing cells would otherwise make re.sub raise TypeError mid-`apply`.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ''
    if not isinstance(tweet, str):
        tweet = str(tweet)

    # Remove URLs ('http\S+' already covers 'https...')
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    # Remove @mentions entirely and drop the '#' of hashtags
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    # Remove special characters and numbers (anything but ASCII letters/whitespace)
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)
    # Convert to lowercase and trim surrounding whitespace
    return tweet.lower().strip()

# Apply preprocessing to the tweet column
# Adds a 'clean_tweet' column holding preprocess_tweet's result for each row of 'tweet'.
data['clean_tweet'] = data['tweet'].apply(preprocess_tweet)

# Load BERT model and tokenizer
# Both are loaded from the same 'bert-base-uncased' checkpoint and bound as
# module-level globals; get_bert_embeddings reads them by these exact names.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(tweet):
    """Embed text with BERT using attention-mask-weighted mean pooling.

    Relies on the module-level ``tokenizer`` and ``model``. The input is
    tokenised (truncated to 128 tokens), run through the model without
    gradient tracking, and the last-layer token states are averaged over the
    real (non-padding) tokens only.

    Args:
        tweet: A single text string. A list of strings is also handled: since
            ``padding=True`` pads every sequence to the longest one, the
            mask-weighted mean keeps padding positions out of the average.

    Returns:
        A NumPy array with the pooled embedding; singleton dimensions are
        squeezed away, so a single string yields a 1-D vector.
    """
    inputs = tokenizer(tweet, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    token_states = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; the trailing axis lets it broadcast
    # across the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).type_as(token_states)
    summed = (token_states * mask).sum(dim=1)
    # clamp guards against division by zero should a row have no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().numpy()

# Get embeddings for all tweets
# Calls get_bert_embeddings once per row, i.e. one BERT forward pass per tweet;
# each cell of the new 'embedding' column holds that call's NumPy array.
data['embedding'] = data['clean_tweet'].apply(get_bert_embeddings)

# Save the preprocessed data with embeddings to avoid recomputation
# Written relative to the current working directory; the graph-construction
# cell reads this same file name back with pd.read_pickle.
data.to_pickle('preprocessed_hate_speech_data.pkl')

# Last expression of the cell: show the first rows as the cell output.
data.head()


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,clean_tweet,embedding
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,rt as a woman you shouldnt complain about cle...,"[-0.15854223, 0.34848896, -0.01928288, -0.0436..."
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt boy dats coldtyga dwn bad for cuffin dat h...,"[-0.23984918, 0.060871646, 0.46534136, -0.0900..."
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt dawg rt you ever fuck a bitch and she sta...,"[0.22387727, 0.5524405, 0.38878042, -0.2973610..."
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt she look like a tranny,"[0.18744457, -0.19207066, -0.1701945, -0.05402..."
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt the shit you hear about me might be true o...,"[0.37754634, 0.17796738, 0.2811793, 0.08011711..."


Graph Construction and GNN Implementation

In [20]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Load preprocessed data
# NOTE(review): unpickling can execute arbitrary code — only load a pickle
# that this notebook itself produced.
data = pd.read_pickle('preprocessed_hate_speech_data.pkl')

# Extract embeddings (one row per tweet) and labels
embeddings = np.vstack(data['embedding'].values)
labels = data['class'].values

# Normalize embeddings to unit length so that a plain dot product between two
# rows equals their cosine similarity.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0  # keep all-zero rows as zeros instead of producing NaN
embeddings = embeddings / norms

# Build the thresholded similarity graph.
# Similarities are computed one block of rows at a time with a matrix product
# rather than one Python-level call per pair; chunking bounds peak memory to
# chunk_size * num_nodes floats.
threshold = 0.7  # Adjust this threshold as needed
chunk_size = 1024
num_nodes = len(embeddings)
row_parts, col_parts, value_parts = [], [], []

for start in range(0, num_nodes, chunk_size):
    stop = min(start + chunk_size, num_nodes)
    block = embeddings[start:stop] @ embeddings.T  # shape (stop - start, num_nodes)
    local_rows, block_cols = np.nonzero(block > threshold)
    block_rows = local_rows + start
    # Keep only pairs with i < j: each unordered pair once, no self-loops.
    upper = block_cols > block_rows
    row_parts.append(block_rows[upper])
    col_parts.append(block_cols[upper])
    value_parts.append(block[local_rows[upper], block_cols[upper]])

rows = np.concatenate(row_parts)
cols = np.concatenate(col_parts)
values = np.concatenate(value_parts)

# Convert similarity data to sparse matrix format
adjacency = sparse.coo_matrix((values, (rows, cols)), shape=(num_nodes, num_nodes))

# Convert adjacency matrix to edge index for PyTorch
# adjacency.nonzero() returns (row_indices, col_indices); stacking them gives
# the (2, num_edges) layout directly, so no transpose is applied.
# NOTE(review): only the i < j direction of each edge is stored; add the
# reverse edges if the downstream model expects an undirected graph.
edges = adjacency.nonzero()
edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous()

# Prepare data for PyTorch
x = torch.tensor(embeddings, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)

# GNN Model
class GNN(torch.nn.Module):
    """Two-layer classifier over node features that returns log-probabilities.

    NOTE(review): despite the name, ``forward`` never reads ``edge_index`` —
    the network is a per-node MLP and no neighbourhood aggregation happens.
    The argument is kept so existing ``model(x, edge_index)`` calls still work.

    Args:
        in_channels: Size of each input feature vector.
        out_channels: Number of output classes.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_channels, 128)
        self.fc2 = torch.nn.Linear(128, out_channels)

    def forward(self, x, edge_index=None):
        """Compute per-node class log-probabilities.

        Args:
            x: Float tensor of node features; the last dimension must equal
                ``in_channels``. ``log_softmax`` runs over ``dim=1``, so a 2-D
                ``(num_nodes, in_channels)`` input is expected.
            edge_index: Accepted for interface compatibility; currently unused.

        Returns:
            Tensor of log-probabilities over ``out_channels`` classes per row.
        """
        hidden = F.relu(self.fc1(x))
        # No activation on the output layer: a ReLU here would clamp every
        # negative logit to 0 before the softmax and limit what it can express.
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

# Model Initialization
# in_channels is taken from the feature matrix's second dimension (x.size(1)).
# out_channels=3 presumably matches the three label categories in the 'class'
# column — TODO confirm against the dataset.
# NOTE(review): `model` is also the name bound to the BERT encoder in the
# embedding cell; running both cells in one kernel rebinds it to this classifier.
model = GNN(in_channels=x.size(1), out_channels=3)
# Adam over all model parameters with learning rate 0.01 and weight decay 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training and Evaluation functions (same as before)
# NOTE(review): train() and test() are not defined anywhere in the visible
# notebook, so on a fresh kernel the loop below raises NameError unless an
# earlier cell supplies them. Their return values are formatted with ':.4f'
# below, so they are expected to be numbers.

# Training loop
# Runs 200 epochs, calling train() then test() once per epoch and printing both results.
for epoch in range(200):
    loss = train()
    acc = test()
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Accuracy: {acc:.4f}')
