# Setup

In [None]:
import numpy as np
import pandas as pd
import pickle
import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [None]:
# Notebook shell command: unpack the SBIC v2 archive into /content so the CSV
# read in the next cell exists.
!tar -xzvf "/content/SBIC.v2.tgz" -C "/content/"

In [None]:
# Load the SBIC v2 training split; every later cell works from this frame.
df = pd.read_csv('/content/SBIC.v2.trn.csv')

# Exploratory Data Analysis

In [None]:
# Distribution of the intentYN annotation, keeping NaN as its own bar so
# missing annotations are visible.
counts = df['intentYN'].value_counts(dropna=False)
plt.figure(figsize=(8, 5))
# Plot the series computed above (the original referenced an undefined
# `category_counts`, which raised NameError).
counts.plot(kind='bar', color='skyblue')

plt.title('Intent to Offend Counts', fontsize=18)
plt.xlabel('Intent', fontsize=18)
plt.ylabel('Count', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# bbox_inches='tight' keeps the rotated tick labels inside the saved image.
plt.savefig('/content/target_category.png', bbox_inches='tight')
plt.show()

# Data Preprocessing

In [None]:
# Agglomerative clustering cannot work with NaN, so every missing value is
# replaced by a sentinel: -1 in the columns below that receive a number,
# and an empty string in the columns that receive text.
for sentinel_column in ('whoTarget', 'offensiveYN', 'speakerMinorityYN'):
    df[sentinel_column] = df[sentinel_column].fillna(-1)

for blank_column in ('sexReason', 'sexPhrase', 'targetMinority',
                     'targetCategory', 'targetStereotype'):
    df[blank_column] = df[blank_column].fillna('')

In [None]:
# (SentenceTransformer is already imported in the setup cell; the re-import
# only keeps this cell runnable on its own.)
from sentence_transformers import SentenceTransformer

# Sentence encoder used to embed the text columns.
# NOTE: the name `model` is rebound to the classifier in the Modeling section,
# so re-run this cell before re-running the embedding cell.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Text columns that get a sentence embedding each.
to_encode = ['post', 'sexReason',  'sexPhrase', 'targetMinority', 'targetStereotype']

for column in to_encode:
    print(f'Embedding {column}')
    # encode() is documented for str / list[str]. A pandas Series only works
    # by accident while its index is the default 0..n-1, so pass a plain list
    # of strings, with any remaining NaN turned into ''.
    text = df[column].fillna('').astype(str).tolist()
    embeddings = model.encode(text, show_progress_bar=True)
    # One .npy per column so the expensive encoding step need not be repeated.
    np.save(f'/content/embeddings_{column}_train.npy', embeddings)

In [None]:
# Scalar annotation columns that form the first five features.
# .copy() makes X an independent frame: the embedding columns are assigned
# onto X in the next cell, and doing that on a slice of df triggers pandas'
# SettingWithCopyWarning (and may not write where expected).
X = df[['whoTarget', 'intentYN', 'sexYN', 'offensiveYN', 'speakerMinorityYN']].copy()

In [None]:
# Attach every saved embedding matrix to X as a column whose cells are
# per-row Python lists.
for embedding_column in to_encode:
    print(f'Loading in embeddings for column {embedding_column}')
    saved_path = f'/content/embeddings_{embedding_column}_train.npy'
    X[embedding_column] = np.load(saved_path).tolist()

In [None]:
# X mixes scalar columns with list-valued embedding columns, so this is an
# object array; np.save pickles it, which is why the later np.load passes
# allow_pickle=True.
array = X.to_numpy()
# Save to the same absolute path that the Modeling section loads from
# ('/content/data.npy'); the original relative path only matched when the
# working directory happened to be /content.
np.save('/content/data.npy', array)

# Clustering

In [None]:
# Build the dense feature matrix for clustering.
# X is a DataFrame at this point, so it must be converted first: iterating a
# DataFrame yields column names and X[:, :5] is not valid DataFrame indexing.
feature_table = np.asarray(X, dtype=object)
n_scalar = 5  # whoTarget, intentYN, sexYN, offensiveYN, speakerMinorityYN

# Columns 5 onwards hold one embedding list per cell; concatenate them per row.
unpacked_columns = np.array(
    [
        np.concatenate([np.asarray(cell, dtype=np.float64) for cell in row[n_scalar:]])
        for row in feature_table
    ],
    dtype=np.float64,
)
# Combine the first 5 (scalar) columns with the flattened embeddings; the
# explicit float cast keeps the result a numeric array rather than object.
result = np.hstack((feature_table[:, :n_scalar].astype(np.float64), unpacked_columns))

In [None]:
# Fit hierarchical clustering on the flattened feature matrix.
# Edit the arguments below to try other combinations (e.g. euclidean metric
# with ward linkage) or a different number of clusters.
clustering = AgglomerativeClustering(
    n_clusters=10,
    linkage='average',
    metric='cosine',
).fit(result)

In [None]:
# Persist the fitted clustering object; the file name encodes the cluster
# count used for this particular run (10 here).
with open('cluster_10.pkl', mode='wb') as pkl_file:
    pkl_file.write(pickle.dumps(clustering))

In [None]:
# Store each row's cluster id next to the original annotations.
# Assumes `result` (what the clustering was fit on) has one row per df row,
# in the same order.
df['label'] = clustering.labels_

In [None]:
import numpy as np
import matplotlib.pyplot as plt

n_clusters = 10  # set to the number of clusters that was fit

# `label_counts` was never defined in this notebook. Reconstructed from how it
# is used below: rows are targetCategory values, columns are cluster ids, and
# cells are post counts. reindex() guarantees a column for every cluster id.
label_counts = (
    df.groupby(['targetCategory', 'label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=range(n_clusters), fill_value=0)
)

categories = label_counts.index
x = np.arange(len(categories))
# Derive bar geometry from the cluster count so each group stays centred on
# its tick (the original offset of 3.5 widths only centres 8 bars).
width = 0.8 / n_clusters
cmap = plt.get_cmap('tab10')  # plt.cm.get_cmap was removed in recent matplotlib

fig, ax = plt.subplots(figsize=(12, 6))

bars = []
for i in range(n_clusters):
    offset = (i - (n_clusters - 1) / 2) * width
    bars.append(ax.bar(x + offset, label_counts[i], width,
                       label=f'{i}', color=cmap(i % cmap.N), edgecolor='black'))

# NOTE(review): the title says "Euclidean" while the clustering cell shown in
# this notebook uses metric='cosine'; update it to match the run being plotted.
ax.set_xlabel('Category', fontsize=18, labelpad=15)
ax.set_ylabel('Count', fontsize=18)
ax.set_title('Ten Clusters -- Euclidean', fontsize=18)
ax.set_xticks(x)
ax.set_xticklabels(categories)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Legend sits below the axes; tight_layout() makes room for it, so the
# earlier subplots_adjust call (which tight_layout overrode) is dropped.
ax.legend(ncol=4, loc='upper center', bbox_to_anchor=(0.5, -0.3))

plt.tight_layout()
plt.savefig('/content/ten_clusters_consistent.png')
plt.show()

# Modeling

In [None]:
# Reload the saved feature table (an object array, hence allow_pickle) and
# carve out the splits: rows 0-4999 for training, rows 5000-7999 for testing.
X_all = np.load('/content/data.npy', allow_pickle=True)
X, X_test = X_all[:5000], X_all[5000:8000]

In [None]:
# Model inputs keyed by feature name. Column 5 is the first embedding column
# added after the five scalar columns, i.e. the `post` embedding.
INPUT = dict(post=X[:, 5])
TEST_INPUT = dict(post=X_test[:, 5])

In [None]:
# Encode the intentYN column (column 1) as integer class ids.
# The encoder is fit ONCE, on the training labels, and then reused for the
# test labels. Re-fitting it on each split (as before) can assign different
# ids to the same value whenever one split is missing a class.
lbe = LabelEncoder()
OUTPUT = lbe.fit_transform(X[:, 1].astype(np.float32))

# transform() raises ValueError if the test split contains a value never seen
# in training, which is preferable to silently mis-numbering the classes.
TEST_OUTPUT = lbe.transform(X_test[:, 1].astype(np.float32))

In [None]:
# Features stored as per-row Python lists come back from the object array as
# lists; stack each such feature into one dense float32 matrix.
for key, column in list(INPUT.items()):
    if not isinstance(column[0], list):
        continue
    INPUT[key] = np.array(
        [np.array(vector, dtype=np.float32) for vector in column],
        dtype=np.float32,
    )

In [None]:
# Same conversion for the test features: list-valued columns become dense
# float32 matrices.
for key, column in list(TEST_INPUT.items()):
    if not isinstance(column[0], list):
        continue
    TEST_INPUT[key] = np.array(
        [np.array(vector, dtype=np.float32) for vector in column],
        dtype=np.float32,
    )

In [None]:
# Wrap the training features and labels as tensors. Labels get a trailing
# axis of size 1 here; the training loop squeezes it away again.
X_train_gender = torch.from_numpy(INPUT['post'])
y_train_gender = torch.from_numpy(OUTPUT[:, np.newaxis].astype(np.float32))

# Shuffled mini-batches of 32; the final incomplete batch is dropped.
train_dataset_gender = TensorDataset(X_train_gender, y_train_gender)
train_loader_gender = DataLoader(
    dataset=train_dataset_gender,
    batch_size=32,
    shuffle=True,
    drop_last=True,
)

In [None]:
# Loss over the raw class logits produced by ClassifierNN.
crit = nn.CrossEntropyLoss()


class ClassifierNN(nn.Module):
    """Linear projection -> Transformer encoder -> linear classification head.

    Each input row is treated as a sequence of length one.

    Args:
        input_dim: Size of each input feature vector.
        num_classes: Number of output logits.
        seq_length: Unused; kept so existing callers keep working.
        hidden_dim: Width of the projection and of the encoder (d_model).
        nhead: Number of attention heads; must divide ``hidden_dim``.
        num_encoder_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim, num_classes, seq_length, hidden_dim=128, nhead=4, num_encoder_layers=2):
        # Zero-argument super(): the original named a non-existent `SimpleNN`
        # class here, which raised NameError on construction.
        super().__init__()
        # Project to hidden_dim (was hard-coded to 128, which broke any
        # non-default hidden_dim).
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # batch_first=True so the (batch, 1, hidden) tensor built in forward()
        # is read as batch x sequence. With the default layout the batch axis
        # is treated as the sequence and samples attend to each other.
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=nhead, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.transformer_encoder_layer, num_layers=num_encoder_layers
        )
        self.classify = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, input_dim)."""
        # Add a sequence axis of length 1: (batch, hidden) -> (batch, 1, hidden).
        x = self.transformer_encoder(torch.relu(self.fc1(x)).unsqueeze(1))
        # Keep the last (here: only) sequence position.
        x = x[:, -1, :]
        x = self.classify(x)
        return x

In [None]:
# Build the classifier and its optimizer. The feature width is read from the
# training tensor; each row is a length-1 sequence with 5 output classes.
input_dim = X_train_gender.size(1)
num_classes, seq_length = 5, 1
model = ClassifierNN(
    input_dim=input_dim,
    num_classes=num_classes,
    seq_length=seq_length,
)
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train the classifier, printing the mean batch loss after every epoch.
epochs = 200
for epoch in range(epochs):
    model.train()
    loss_total = 0.0
    for inputs, labels in train_loader_gender:
        outputs = model(inputs)
        # Labels arrive as float (batch, 1); CrossEntropyLoss needs int64
        # (batch,). reshape(-1) also stays 1-D for a batch of one, unlike
        # squeeze().
        labels = labels.reshape(-1).long()
        loss = crit(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_total += loss.item()
    # Report with the names actually defined above (the original referenced
    # undefined `num_epochs` and `running_loss`, raising NameError).
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_total/len(train_loader_gender)}")

In [None]:
# Re-read the raw CSV (without the earlier NaN filling) and take the same
# rows 5000-7999 that were used for the test split of the feature array.
df = pd.read_csv('/content/SBIC.v2.trn.csv')
df_test = df.iloc[5000:8000]

In [None]:
# Keep only the test rows whose targetCategory is 'gender'. Subtracting 5000
# turns the frame's row labels into positions within the test arrays.
is_gender = (df_test['targetCategory'] == 'gender').to_numpy()
gender_indices_test = df_test.index[is_gender] - 5000

X_test_gender = torch.from_numpy(TEST_INPUT['post'][gender_indices_test])
y_test_gender = torch.from_numpy(TEST_OUTPUT[gender_indices_test]).squeeze()

test_dataset = TensorDataset(X_test_gender, y_test_gender)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)

In [None]:
def _percentage(numerator, denominator):
    """Return numerator / denominator as a percentage, or 0.0 when undefined."""
    return numerator / denominator * 100 if denominator else 0.0


def evaluate_model(model, dataloader, criterion, device='cpu'):
    """Evaluate a classifier and report loss, accuracy and binary P/R/F1.

    Accuracy is computed over the raw class ids. Precision, recall and F1
    collapse the classes into two groups: ids 2 and 3 count as positive
    (intent to offend probably/definitely present), ids 0 and 1 as negative.
    Any other id counts towards accuracy only.

    Args:
        model: Module mapping a batch of inputs to per-class logits.
        dataloader: Sized iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and each batch are moved to.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)``. The loss is averaged
        per batch and the other four are percentages. A metric whose
        denominator is zero is reported as 0.0 instead of raising.
    """
    # Keep the model on the same device as the batches moved below.
    model.to(device)
    model.eval()
    loss_total = 0.0
    correct_preds = 0
    total_preds = 0
    tp = tn = fp = fn = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # reshape(-1) keeps labels 1-D even for a batch of one, where
            # squeeze() would collapse them to a 0-d tensor.
            labels = labels.reshape(-1).long()
            # Use the criterion that was passed in, not a module-level global.
            loss = criterion(outputs, labels)
            loss_total += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

            positive_pred = (predicted == 2) | (predicted == 3)  # intent to offend is either probably or definitely
            negative_pred = (predicted == 0) | (predicted == 1)  # intent to offend is probably not or definitely not

            positive_labels = (labels == 2) | (labels == 3)
            negative_labels = (labels == 0) | (labels == 1)

            fn += (positive_labels & negative_pred).sum().item()
            tn += (negative_labels & negative_pred).sum().item()
            tp += (positive_labels & positive_pred).sum().item()
            fp += (negative_labels & positive_pred).sum().item()

    num_batches = len(dataloader)
    avg_loss = loss_total / num_batches if num_batches else 0.0
    accuracy = _percentage(correct_preds, total_preds)
    precision = _percentage(tp, tp + fp)
    recall = _percentage(tp, tp + fn)
    # Precision and recall are already percentages, so F1 is one as well.
    if precision + recall:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    return avg_loss, accuracy, precision, recall, f1

In [None]:
# `device` was never defined in this notebook. Evaluate on whichever device
# the model's parameters already live on, so batches and weights always match.
device = next(model.parameters()).device
avg_loss, accuracy, precision, recall, f1 = evaluate_model(model, test_loader, crit, device)

# Stray ", ," separators removed from the report line.
print(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.2f}%, Precision: {precision:.2f}%, Recall: {recall:.2f}%, F1: {f1:.2f}%")