In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset
from tqdm.notebook import tqdm

In [308]:
# Labelled training comments. The `id` column is dropped because the visible
# cells only read the comment text and the six label columns.
data = pd.read_csv('data/train.csv').drop(columns=['id'])

# Sentence encoder used to turn comment text into embeddings. It is only needed
# when one of the commented-out encoding cells is re-enabled; otherwise the
# notebook reads cached embeddings from disk.
tokenizer = SentenceTransformer('msmarco-distilbert-base-v4', device="cpu")
# Backward-compatible alias for the original (misspelled) variable name.
tokenzier = tokenizer

## Uncomment the following cell to re-encode the training inputs

In [None]:
# Dormant cell: re-computes the embeddings for every comment in `data`, encoding
# the text in batches of `batch_size` and concatenating the per-batch arrays.
# NOTE(review): the encoder must be bound to the name `tokenizer` before this is
# uncommented -- check the spelling of the name assigned in the data-loading cell.
# vectors = []
# batch_size = 128
# batch = []
# for row in tqdm(data.itertuples()):
#     comment_text = row.comment_text
#     batch.append(comment_text)
#     if len(batch) >= batch_size:
#         vectors.append(tokenizer.encode(batch))  # Text -> vector encoding happens here
#         batch = []

# Flush the final, partially filled batch.
# if len(batch) > 0:
#     vectors.append(tokenizer.encode(batch))
#     batch = []

# vectors = np.concatenate(vectors)

In [309]:
# Cached sentence embeddings for the training comments (presumably one row per
# row of `data`, in the same order).
vectors = np.load('sentence-embedding/input_embed.npy', allow_pickle=False)

# Embeddings and labels are paired purely by row position further down, so a
# stale or truncated cache would silently attach the wrong labels.
assert len(vectors) == len(data), (
    f"embedding cache has {len(vectors)} rows but the training frame has {len(data)}"
)

In [310]:
# Prefer the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Single source of truth for the split point, so features and labels can never
# be cut at different rows.
TRAIN_SIZE = 120000

# The first TRAIN_SIZE rows are used for training, the remainder for evaluation.
Xtrain, Xtest = vectors[:TRAIN_SIZE], vectors[TRAIN_SIZE:]

y = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
# `.iloc` makes the positional nature of the split explicit.
ytrain, ytest = y.iloc[:TRAIN_SIZE], y.iloc[TRAIN_SIZE:]

In [311]:
class ToxicDataset(Dataset):
    """Pairs pre-computed comment embeddings with their six toxicity labels.

    Item ``i`` is built from ``input[i]`` and the ``i``-th row (by position) of
    ``target``.
    """

    # Label columns read from ``target`` and exposed as keys of every item.
    LABELS = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

    def __init__(self, input, target: pd.DataFrame):
        """Store the features and labels after checking that they line up.

        Args:
            input: Embeddings; any object supporting ``len`` and integer indexing.
            target: Label frame with one row per embedding and every column
                named in ``LABELS``.

        Raises:
            ValueError: If ``input`` and ``target`` differ in length, or if
                ``target`` lacks one of the label columns.
        """
        if len(input) != len(target):
            raise ValueError(
                f"input has {len(input)} rows but target has {len(target)}"
            )
        missing = [label for label in self.LABELS if label not in target.columns]
        if missing:
            raise ValueError(f"target is missing label columns: {missing}")

        self.input = input
        self.target = target

    def __len__(self) -> int:
        """Number of (embedding, labels) pairs."""
        return len(self.input)

    def __getitem__(self, index: int):
        """Return a dict with the embedding under ``'input'`` plus one entry per label."""
        # Positional lookup: the frame's index labels are ignored on purpose so
        # that a sliced frame (whose index does not start at 0) still works.
        target_row = self.target.iloc[index]

        item = dict(input=self.input[index])
        for label in self.LABELS:
            item[label] = target_row[label]
        return item

In [312]:
# Wrap each split so that an embedding and its labels are served together.
train_dataset = ToxicDataset(input=Xtrain, target=ytrain)
test_dataset = ToxicDataset(input=Xtest, target=ytest)

In [7]:
class MainClassifier(nn.Module):
    """Feed-forward classification head applied to fixed-size embeddings.

    The module owns ``num_layers`` square ``Linear`` layers
    (``embedding_dim`` -> ``embedding_dim``) and a final ``Linear`` projection
    to ``num_classes`` logits.

    Legacy behaviour (default, ``chain_layers=False``): only the LAST hidden
    layer is applied, directly to the raw inputs, followed by dropout. Earlier
    hidden layers exist in the state dict but do not influence the logits.
    Checkpoints trained with this forward pass rely on it, which is why it
    stays the default.

    With ``chain_layers=True`` the hidden layers are genuinely stacked
    (Linear -> dropout -> ReLU for every layer but the last, which omits the
    ReLU). Weights trained in legacy mode are not meaningful in this mode;
    retrain before using it.
    """

    def __init__(self,
                 device: str = 'cpu',
                 embedding_dim: int = 768,
                 num_layers: int = 2,
                 num_classes: int = 2,
                 chain_layers: bool = False) -> None:
        """Build the layers and move the whole module to ``device``.

        Args:
            device: Target device (a string or ``torch.device``).
            embedding_dim: Width of the input embeddings and of every hidden layer.
            num_layers: Number of hidden ``Linear`` layers (0 is allowed).
            num_classes: Number of output logits.
            chain_layers: Feed each hidden layer the previous layer's output
                instead of using the legacy last-layer-only forward pass.
        """
        super().__init__()
        self.device = device
        self.num_layers = num_layers
        self.chain_layers = chain_layers

        # Fixed seed so weight initialisation is reproducible.
        # NOTE: this reseeds torch's GLOBAL random generator as a side effect.
        torch.manual_seed(42)

        # Registration order (fc, relu, dropout, classifier) is kept so the
        # state-dict keys and the initialisation order stay unchanged.
        self.fc = nn.ModuleList(
            [nn.Linear(embedding_dim, embedding_dim) for _ in range(num_layers)]
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self._classification_layer = nn.Linear(embedding_dim, num_classes)

        # Move every sub-module, not just the classification layer, so the
        # module is usable on `device` without an extra `.to(device)` call.
        self.to(device)

    def forward(self,
             inputs,
             training=False):
        """Compute class logits for a batch of embeddings.

        Args:
            inputs: Tensor whose last dimension is ``embedding_dim``.
            training: Unused; kept for interface compatibility. Dropout is
                controlled by the module's ``train()`` / ``eval()`` mode.

        Returns:
            Tensor of logits whose last dimension is ``num_classes``.
        """
        if len(self.fc) == 0:
            # No hidden layers: classify the embeddings directly.
            z = inputs
        elif self.chain_layers:
            z = inputs
            last = len(self.fc) - 1
            for i, layer in enumerate(self.fc):
                z = self.dropout(layer(z))
                if i != last:
                    z = self.relu(z)
        else:
            # Legacy path: equivalent to running every hidden layer on the raw
            # inputs and keeping only the last result, minus the discarded
            # work. Eval-mode outputs are identical; in train mode the dropout
            # mask differs because no random numbers are spent on discarded
            # activations.
            z = self.dropout(self.fc[-1](inputs))

        logits = self._classification_layer(z)
        return logits

## Uncomment the following cell to train the models

In [None]:
# Dormant cell: builds one binary classifier per label and trains each with its
# own AdamW optimizer (lr=0.001, 5 epochs, batch size 128).
# NOTE(review): `train` is not defined in the visible part of this notebook; it
# must be defined or imported before this cell is uncommented.
# toxic_model = MainClassifier(device=device).to(device)
# severe_toxic_model = MainClassifier(device=device).to(device)
# obscene_model = MainClassifier(device=device).to(device)
# insult_model = MainClassifier(device=device).to(device)
# identity_hate_model = MainClassifier(device=device).to(device)
# threat_model = MainClassifier(device=device).to(device)

# optimizer = torch.optim.AdamW(toxic_model.parameters(), lr=0.001)
# model_toxic_dict = train(toxic_model, 'toxic', train_dataset, test_dataset, optimizer=optimizer, num_epochs=5, batch_size=128)

# optimizer = torch.optim.AdamW(severe_toxic_model.parameters(), lr=0.001)
# model_severe_dict = train(severe_toxic_model, 'severe_toxic', train_dataset, test_dataset, optimizer=optimizer, num_epochs=5, batch_size=128)

# optimizer = torch.optim.AdamW(obscene_model.parameters(), lr=0.001)
# model_obscene_dict = train(obscene_model, 'obscene', train_dataset, test_dataset, optimizer=optimizer, num_epochs=5, batch_size=128)

# optimizer = torch.optim.AdamW(threat_model.parameters(), lr=0.001)
# model_threat_dict = train(threat_model, 'threat', train_dataset, test_dataset, optimizer=optimizer, num_epochs=5, batch_size=128)

# optimizer = torch.optim.AdamW(insult_model.parameters(), lr=0.001)
# model_insult_dict = train(insult_model, 'insult', train_dataset, test_dataset, optimizer=optimizer, num_epochs=5, batch_size=128)

# optimizer = torch.optim.AdamW(identity_hate_model.parameters(), lr=0.001)
# model_identity_hate_dict = train(identity_hate_model, 'identity_hate', train_dataset, test_dataset, optimizer=optimizer, num_epochs=5, batch_size=128)

In [8]:
# Restore the trained `toxic` head on CPU.
# NOTE: torch.load relies on pickle unless weights-only loading is in effect,
# so only load checkpoint files from a trusted source.
state = torch.load('models/toxic_model.pkg', map_location='cpu')
model_toxic = MainClassifier(device='cpu')
model_toxic.load_state_dict(state['model'])
# eval() switches dropout off; as the last expression it also displays the module.
model_toxic.eval()

MainClassifier(
  (fc): ModuleList(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): Linear(in_features=768, out_features=768, bias=True)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (_classification_layer): Linear(in_features=768, out_features=2, bias=True)
)

In [9]:
# Restore the trained `severe_toxic` head on CPU.
# NOTE: torch.load relies on pickle unless weights-only loading is in effect,
# so only load checkpoint files from a trusted source.
state = torch.load('models/severe_toxic_model.pkg', map_location='cpu')
model_severe_toxic = MainClassifier(device='cpu')
model_severe_toxic.load_state_dict(state['model'])
# eval() switches dropout off; as the last expression it also displays the module.
model_severe_toxic.eval()

MainClassifier(
  (fc): ModuleList(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): Linear(in_features=768, out_features=768, bias=True)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (_classification_layer): Linear(in_features=768, out_features=2, bias=True)
)

In [10]:
# Restore the trained `insult` head on CPU.
# NOTE: torch.load relies on pickle unless weights-only loading is in effect,
# so only load checkpoint files from a trusted source.
state = torch.load('models/insult_model.pkg', map_location='cpu')
model_insult = MainClassifier(device='cpu')
model_insult.load_state_dict(state['model'])
# eval() switches dropout off; as the last expression it also displays the module.
model_insult.eval()

MainClassifier(
  (fc): ModuleList(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): Linear(in_features=768, out_features=768, bias=True)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (_classification_layer): Linear(in_features=768, out_features=2, bias=True)
)

In [11]:
# Restore the trained `threat` head on CPU.
# NOTE: torch.load relies on pickle unless weights-only loading is in effect,
# so only load checkpoint files from a trusted source.
state = torch.load('models/threat_model.pkg', map_location='cpu')
model_threat = MainClassifier(device='cpu')
model_threat.load_state_dict(state['model'])
# eval() switches dropout off; as the last expression it also displays the module.
model_threat.eval()

MainClassifier(
  (fc): ModuleList(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): Linear(in_features=768, out_features=768, bias=True)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (_classification_layer): Linear(in_features=768, out_features=2, bias=True)
)

In [12]:
# Restore the trained `obscene` head on CPU.
# NOTE: torch.load relies on pickle unless weights-only loading is in effect,
# so only load checkpoint files from a trusted source.
state = torch.load('models/obscene_model.pkg', map_location='cpu')
model_obscene = MainClassifier(device='cpu')
model_obscene.load_state_dict(state['model'])
# eval() switches dropout off; as the last expression it also displays the module.
model_obscene.eval()

MainClassifier(
  (fc): ModuleList(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): Linear(in_features=768, out_features=768, bias=True)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (_classification_layer): Linear(in_features=768, out_features=2, bias=True)
)

In [13]:
# Restore the trained `identity_hate` head on CPU.
# NOTE: torch.load relies on pickle unless weights-only loading is in effect,
# so only load checkpoint files from a trusted source.
state = torch.load('models/identity_hate_model.pkg', map_location='cpu')
model_identity_hate = MainClassifier(device='cpu')
model_identity_hate.load_state_dict(state['model'])
# eval() switches dropout off; as the last expression it also displays the module.
model_identity_hate.eval()

MainClassifier(
  (fc): ModuleList(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): Linear(in_features=768, out_features=768, bias=True)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (_classification_layer): Linear(in_features=768, out_features=2, bias=True)
)

In [14]:
# Comments to score; the per-label prediction columns are added to this frame
# further down, so its row order must match the test embeddings.
test_df = pd.read_csv('data/test.csv')

## Uncomment the following to encode the test-input

In [None]:
# Dormant cell: re-computes the embeddings for the test comments in batches of
# `batch_size` and concatenates the per-batch arrays.
# It iterates over `test_df` because the predictions are written back to that
# frame row-for-row, so the embeddings must follow its row order.
# NOTE(review): the encoder must be bound to the name `tokenizer` before this is
# uncommented -- check the spelling of the name assigned in the data-loading cell.
# test_vectors = []
# batch_size = 512
# batch = []
# for row in tqdm(test_df.itertuples()):
#     comment_text = row.comment_text
#     batch.append(comment_text)
#     if len(batch) >= batch_size:
#         test_vectors.append(tokenizer.encode(batch))  # Text -> vector encoding happens here
#         batch = []

# Flush the final, partially filled batch.
# if len(batch) > 0:
#     test_vectors.append(tokenizer.encode(batch))
#     batch = []

# test_vectors = np.concatenate(test_vectors)

In [22]:
# Read the cached test embeddings, then expose them as a tensor.
# `from_numpy` shares memory with the array rather than copying it.
test_embeddings = np.load('sentence-embedding/test_input_embed.npy', allow_pickle=False)
test_vectors = torch.from_numpy(test_embeddings)

In [23]:
# Score the whole test set with each per-label head. Gradients are never needed
# for inference, so the forward passes run under no_grad; otherwise autograd
# would record a graph (and keep intermediate activations) for every test row.
with torch.no_grad():
    logits_toxic = model_toxic(test_vectors)
    logits_s_toxic = model_severe_toxic(test_vectors)
    logits_threat = model_threat(test_vectors)
    logits_obscene = model_obscene(test_vectors)
    logits_insult = model_insult(test_vectors)
    logits_i_hate = model_identity_hate(test_vectors)

# argmax over the class dimension gives the predicted class index for each row.
# The logits carry no autograd history here, so no detach() is required.
toxic_predictions = logits_toxic.argmax(dim=1).cpu().numpy()
s_toxic_predictions = logits_s_toxic.argmax(dim=1).cpu().numpy()
threat_predictions = logits_threat.argmax(dim=1).cpu().numpy()
obscene_predictions = logits_obscene.argmax(dim=1).cpu().numpy()
insult_predictions = logits_insult.argmax(dim=1).cpu().numpy()
hate_predictions = logits_i_hate.argmax(dim=1).cpu().numpy()

In [24]:
# Attach one prediction column per label to the test frame, in this order.
predictions_by_label = {
    'toxic': toxic_predictions,
    'severe_toxic': s_toxic_predictions,
    'threat': threat_predictions,
    'obscene': obscene_predictions,
    'insult': insult_predictions,
    'identity_hate': hate_predictions,
}
for label_name, label_predictions in predictions_by_label.items():
    test_df[label_name] = label_predictions

In [25]:
# Drop the raw comment text; every other column of the test frame is kept.
result = test_df.drop(columns=['comment_text'])

In [27]:
# Write the predictions to disk; index=False keeps the frame's row index out of the file.
result.to_csv('prediction_submission.csv', index=False)