Step 1: Load and Explore the Dataset

In [2]:
import pandas as pd

# Load the dataset.
# The CSV location can be overridden with the HATE_SPEECH_CSV environment
# variable so the notebook also runs on machines where the original absolute
# Windows path does not exist. When the variable is unset the original path
# is used, so existing behaviour is unchanged.
import os

file_path = os.environ.get(
    'HATE_SPEECH_CSV',
    'C:/Users/gangi/OneDrive/Desktop/DL/HateSpeechData.csv',
)
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(df.head())


   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


Step 2: Preprocess the Text Data

In [3]:
# Adjusting the column names accordingly
import re

def preprocess_text(text):
    """Normalize a raw tweet into lowercase alphanumeric words.

    Steps, in order: lowercase the text, drop URLs, drop @mentions and the
    '#' marker of hashtags (the hashtag word itself is kept), then replace
    every run of non-alphanumeric characters with a single space.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned string. Separators at the edges are not stripped, so the
        result may begin or end with a single space.
    """
    text = text.lower()
    # Remove links first; otherwise the punctuation pass below would leave
    # URL fragments behind as loose words.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # '@\w+' removes the whole user handle. The earlier pattern '\@w+' only
    # matched '@' followed by literal 'w' characters, so user names leaked
    # into the cleaned text.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    return text

# Clean every raw tweet with preprocess_text and store the result in a new
# 'clean_text' column; the original 'tweet' column is left as it is.
df['clean_text'] = df['tweet'].apply(preprocess_text)



Step 3: Generate BERT Embeddings

In [5]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pre-trained uncased BERT: the tokenizer prepares the model inputs and the
# encoder, placed on the chosen device, produces the hidden states.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

# Function to get BERT embeddings in batches
def get_bert_embeddings(text_list, batch_size=32):
    """Encode texts with BERT and return one vector per text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. Each text
    is truncated to at most 128 tokens and represented by the last-layer
    hidden state at sequence position 0 (the [CLS] position for the BERT
    tokenizer's default settings).

    Args:
        text_list: Sequence of strings (list, tuple, pandas Series, ...).
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input an
        array of shape (0, model.config.hidden_size) is returned instead of
        failing inside ``np.concatenate``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialise as a plain list so slicing always yields a list of strings,
    # whatever sequence type the caller passed in.
    texts = list(text_list)
    if not texts:
        # Nothing to encode: return an empty matrix with the model's width.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        # The input tensors must live on the same device as the model.
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():  # inference only, no autograd bookkeeping
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.concatenate(embeddings, axis=0)

# Get the list of clean texts
text_list = df['clean_text'].tolist()

# Generate BERT embeddings in batches (one forward pass per batch of texts)
bert_embeddings = get_bert_embeddings(text_list)

# Add embeddings to the dataframe.
# list(...) splits the 2-D array into one 1-D vector per row, so each cell of
# the new column holds the embedding of the tweet in that row.
df['bert_embeddings'] = list(bert_embeddings)


In [7]:
import networkx as nx
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Example data for demonstration purposes
# Replace this with your actual data
# Stored under its own name so the DataFrame loaded into `df` earlier in the
# notebook is not overwritten by random numbers.
demo_data = {
    'bert_embeddings': np.random.rand(10, 768)
}

# No try/except around this cell: nothing here reads a file, and printing only
# the exception message would hide the traceback and let later cells run
# against a missing or stale `graph`.

# Create a similarity matrix (pairwise cosine similarity between embeddings)
embeddings = np.stack(demo_data['bert_embeddings'])
similarity_matrix = cosine_similarity(embeddings)

# Create a NetworkX graph with one node per embedding
threshold = 0.7  # Similarity threshold
graph = nx.Graph()
num_nodes = embeddings.shape[0]
graph.add_nodes_from(range(num_nodes))

# Add edges based on the similarity matrix.
# The graph is undirected, so each pair is inserted once: OR the mask with its
# transpose (an edge exists if either direction passes the threshold), then
# keep the strict upper triangle, which also excludes self-loops.
above_threshold = similarity_matrix > threshold
edges = np.where(np.triu(above_threshold | above_threshold.T, k=1))
graph.add_edges_from(zip(edges[0].tolist(), edges[1].tolist()))

# Add node features
for i in range(num_nodes):
    graph.nodes[i]['feat'] = torch.tensor(embeddings[i], dtype=torch.float32)

print(graph)
for node, data in graph.nodes(data=True):
    print(f"Node {node}: {data['feat']}")


Graph with 10 nodes and 45 edges
Node 0: tensor([4.5314e-01, 9.0097e-01, 1.1766e-01, 1.1970e-01, 2.1521e-01, 8.5322e-01,
        8.4628e-01, 1.8332e-01, 3.0837e-01, 1.1184e-01, 6.4425e-01, 1.2530e-01,
        4.9781e-01, 8.3098e-01, 2.9342e-01, 4.0047e-01, 9.2095e-01, 3.6572e-01,
        7.0703e-01, 3.2878e-01, 8.5478e-01, 2.0203e-01, 4.5773e-01, 4.8626e-01,
        7.7082e-01, 5.9925e-01, 5.7031e-01, 6.7019e-01, 2.8277e-01, 4.2137e-01,
        4.4938e-01, 2.8594e-01, 7.9121e-01, 4.1153e-01, 3.7438e-01, 3.7093e-01,
        9.0856e-01, 6.4203e-02, 6.7155e-01, 8.8084e-01, 5.3231e-01, 7.0032e-01,
        9.8155e-01, 3.0909e-01, 3.2522e-01, 4.2105e-01, 7.5137e-01, 8.3201e-02,
        4.9735e-02, 5.9094e-02, 7.5642e-01, 8.5949e-01, 5.8348e-01, 6.8330e-01,
        3.5010e-01, 7.4253e-01, 2.6020e-01, 1.3432e-01, 7.1354e-01, 3.2121e-01,
        4.4568e-02, 5.4809e-01, 4.7916e-01, 6.3259e-01, 3.6562e-01, 4.5954e-01,
        7.2247e-01, 4.4104e-01, 5.5648e-01, 1.1716e-01, 1.1597e-01, 1.1848e-01,

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx
from torch.optim import Adam

class SimpleGCNLayer(nn.Module):
    """Graph layer that aggregates over the adjacency matrix, then projects.

    The forward pass computes ``linear(adj @ x)``. The adjacency matrix is
    used exactly as given; the layer applies no normalization and adds no
    self-loops.
    """

    def __init__(self, in_feats, out_feats):
        """Create the linear projection from ``in_feats`` to ``out_feats``."""
        super().__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, x, adj):
        """Multiply ``adj`` by ``x`` (both 2-D) and apply the linear map."""
        aggregated = adj.mm(x)
        return self.linear(aggregated)

class GNNClassifier(nn.Module):
    """Two stacked SimpleGCNLayer modules with a ReLU in between.

    The second layer outputs ``num_classes`` values per node; they are
    returned as raw scores with no softmax applied.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        """Build layers mapping in_feats -> h_feats -> num_classes."""
        super().__init__()
        self.conv1 = SimpleGCNLayer(in_feats, h_feats)
        self.conv2 = SimpleGCNLayer(h_feats, num_classes)

    def forward(self, x, adj):
        """Run both layers over the same adjacency matrix and return scores."""
        hidden = F.relu(self.conv1(x, adj))
        return self.conv2(hidden, adj)

# Example data for demonstration purposes
# Replace this with your actual data
# Stored under its own name so the DataFrame loaded into `df` earlier in the
# notebook is not overwritten by random embeddings and random labels.
demo_data = {
    'bert_embeddings': np.random.rand(10, 768),
    'label': np.random.randint(2, size=10)
}

# Create a similarity matrix
embeddings = np.stack(demo_data['bert_embeddings'])
similarity_matrix = cosine_similarity(embeddings)

# Create a NetworkX graph
threshold = 0.7  # Similarity threshold
graph = nx.Graph()
num_nodes = embeddings.shape[0]
graph.add_nodes_from(range(num_nodes))

# Add edges based on the similarity matrix.
# The graph is undirected, so each pair is inserted once: OR the mask with its
# transpose, then keep the strict upper triangle (which excludes self-loops).
above_threshold = similarity_matrix > threshold
edges = np.where(np.triu(above_threshold | above_threshold.T, k=1))
graph.add_edges_from(zip(edges[0].tolist(), edges[1].tolist()))

# Convert NetworkX graph to adjacency matrix.
# NOTE(review): the matrix has no self-loops and is not degree-normalized, so
# each layer sums neighbour features only; confirm this is intended.
adj = nx.adjacency_matrix(graph).todense()
adj = torch.tensor(adj, dtype=torch.float32)

# Node features and labels
x = torch.tensor(embeddings, dtype=torch.float32)
labels = torch.tensor(demo_data['label'], dtype=torch.long)

# Assume binary classification (abusive or not)
num_classes = 2
in_feats = embeddings.shape[1]
h_feats = 16
# NOTE: this rebinds the notebook-level name `model`, which
# get_bert_embeddings also reads. Re-run the BERT setup cell before
# generating embeddings again.
model = GNNClassifier(in_feats, h_feats, num_classes)

# Define loss function and optimizer
optimizer = Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

# Training loop: full-batch gradient descent over the whole graph each epoch
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(x, adj)
    loss = loss_fn(out, labels)
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch} | Loss: {loss.item()}')


Epoch 0 | Loss: 0.510538637638092
Epoch 1 | Loss: 148.2358856201172
Epoch 2 | Loss: 39.42931365966797
Epoch 3 | Loss: 0.7733098268508911
Epoch 4 | Loss: 0.7673842310905457
Epoch 5 | Loss: 0.761530876159668
Epoch 6 | Loss: 0.7557340860366821
Epoch 7 | Loss: 0.7499896287918091
Epoch 8 | Loss: 0.7442975640296936
Epoch 9 | Loss: 0.7386592626571655
Epoch 10 | Loss: 0.7330777049064636
Epoch 11 | Loss: 0.7275556921958923
Epoch 12 | Loss: 0.7220960855484009
Epoch 13 | Loss: 0.7167016863822937
Epoch 14 | Loss: 0.7113752365112305
Epoch 15 | Loss: 0.706119179725647
Epoch 16 | Loss: 0.7009358406066895
Epoch 17 | Loss: 0.6958271265029907
Epoch 18 | Loss: 0.690794825553894
Epoch 19 | Loss: 0.685840904712677
Epoch 20 | Loss: 0.6809664964675903
Epoch 21 | Loss: 0.6761729121208191
Epoch 22 | Loss: 0.6714615821838379
Epoch 23 | Loss: 0.6668331027030945
Epoch 24 | Loss: 0.6622886657714844
Epoch 25 | Loss: 0.6578283905982971
Epoch 26 | Loss: 0.6534531712532043
Epoch 27 | Loss: 0.6491633057594299
Epoch 28 

In [19]:
def evaluate(model, x, adj, labels):
    """Return the fraction of rows whose top-scoring class equals its label.

    Puts ``model`` in eval mode, runs one forward pass without gradient
    tracking, and compares the per-row argmax with ``labels``.
    """
    model.eval()
    with torch.no_grad():
        predicted = model(x, adj).max(dim=1).indices
        hits = (predicted == labels).sum().item()
    return hits / labels.size(0)

# Accuracy on the same x / adj / labels the model was trained on. No held-out
# split is used here, so this number does not measure generalization.
train_accuracy = evaluate(model, x, adj, labels)
print(f'Train Accuracy: {train_accuracy:.4f}')

Train Accuracy: 0.8000


In [20]:
def predict(model, x, adj):
    """Return the index of the highest-scoring class for every row.

    Puts ``model`` in eval mode and runs the forward pass without gradient
    tracking.
    """
    model.eval()
    with torch.no_grad():
        predicted = model(x, adj).max(dim=1).indices
    return predicted
# Stand-in for unseen inputs: five random 768-dimensional vectors.
new_data = {'bert_embeddings': np.random.rand(5, 768)}  # Example new data

new_embeddings = np.stack(new_data['bert_embeddings'])
new_num_nodes = new_embeddings.shape[0]
new_x = torch.tensor(new_embeddings, dtype=torch.float32)

# Pairwise cosine similarity among the new samples only
new_similarity_matrix = cosine_similarity(new_embeddings)

# Separate graph for the new samples, one node per sample
new_graph = nx.Graph()
new_graph.add_nodes_from(range(new_num_nodes))

# Connect every pair whose similarity exceeds the threshold set earlier
new_edges = np.where(new_similarity_matrix > threshold)
for src, dst in zip(*new_edges):
    if src == dst:
        continue  # skip the diagonal so no self-loops are created
    new_graph.add_edge(src, dst)

# Dense float32 adjacency tensor for the classifier
new_adj = torch.tensor(
    nx.adjacency_matrix(new_graph).todense(),
    dtype=torch.float32,
)

# Class index per new sample
predictions = predict(model, new_x, new_adj)
print("Predicted labels for new data:")
print(predictions.numpy())

Predicted labels for new data:
[1 1 1 1 1]


In [22]:
def predict(model, x, adj):
    """Return ``(predicted, probabilities)`` for every row.

    ``predicted`` holds the index of the highest-scoring class per row and
    ``probabilities`` is the softmax of the scores along dimension 1. The
    model is put in eval mode and run without gradient tracking.
    """
    model.eval()
    with torch.no_grad():
        scores = model(x, adj)
        predicted = scores.max(dim=1).indices
        probabilities = F.softmax(scores, dim=1)
    return predicted, probabilities

# Stand-in for unseen inputs: five random 768-dimensional vectors.
new_data = {'bert_embeddings': np.random.rand(5, 768)}  # Example new data

new_embeddings = np.stack(new_data['bert_embeddings'])
new_num_nodes = new_embeddings.shape[0]
new_x = torch.tensor(new_embeddings, dtype=torch.float32)

# Pairwise cosine similarity among the new samples only
new_similarity_matrix = cosine_similarity(new_embeddings)

# Separate graph for the new samples, one node per sample
new_graph = nx.Graph()
new_graph.add_nodes_from(range(new_num_nodes))

# Connect every pair whose similarity exceeds the threshold set earlier
new_edges = np.where(new_similarity_matrix > threshold)
for src, dst in zip(*new_edges):
    if src == dst:
        continue  # skip the diagonal so no self-loops are created
    new_graph.add_edge(src, dst)

# Dense float32 adjacency tensor for the classifier
new_adj = torch.tensor(
    nx.adjacency_matrix(new_graph).todense(),
    dtype=torch.float32,
)

# Class index and class probabilities per new sample
predictions, probabilities = predict(model, new_x, new_adj)

# Report both results
print("Predicted labels for new data:")
print(predictions.numpy())

print("Predicted probabilities for new data:")
print(probabilities.numpy())

Predicted labels for new data:
[1 1 1 1 1]
Predicted probabilities for new data:
[[0.22208202 0.777918  ]
 [0.22604989 0.77395016]
 [0.22712915 0.77287084]
 [0.23390765 0.7660923 ]
 [0.2153957  0.7846043 ]]
