# Load pretrained word embedding

In [2]:
import pickle
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/Colab Notebooks'
embeddings_dict = pickle.load(open(f'{path}/embeddings_dict.pkl', 'rb'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define LSTM

In [3]:
import torch.nn as nn
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def get_weights(target_vocab, embeddings_dict, embedding_dim=100):
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, embedding_dim))
    words_found = 0

    for i, word in enumerate(target_vocab):
        try: 
            weights_matrix[i] = embeddings_dict[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))  

    print("Fraction of vocab words found in word embedding: ", words_found/matrix_len)
    return torch.tensor(weights_matrix)

def create_emb_layer(weights_matrix, trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if not trainable:
        emb_layer.weight.requires_grad = False
      
    return emb_layer, num_embeddings, embedding_dim  

class SentimentNet(nn.Module):
    def __init__(self, target_vocab, embeddings_dict, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Learn new word embedding
        # vocab_size = len(target_vocab)
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Use pretrained word embedding
        weights_matrix = get_weights(target_vocab, embeddings_dict, embedding_dim)
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, False)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

# Preprocessing

In [8]:
# Pad/truncate sentence to fixed length
# https://arxiv.org/abs/1903.07288 pad zeros in front
def pad_input(sentences, seq_len=200):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for i, tokens in enumerate(sentences):
        if len(tokens) > seq_len:
            features[i] = np.array(tokens)[:seq_len]
        elif len(tokens) > 0:
            features[i, -len(tokens):] = np.array(tokens)
    return features

def preprocess_input(X, y, seq_len=100):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
    X_test, X_val, y_test, y_val = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

    words = Counter() 

    for i, sentence in enumerate(X_train):
        # The sentences will be stored as a list of words/tokens
        X_train[i] = []
        for word in nltk.word_tokenize(sentence):  # Tokenizing the words
            words.update([word.lower()])  # Converting all the words to lowercase
            X_train[i].append(word)
    
    # # Removing the words that only appear once
    words = {k:v for k,v in words.items() if v>1}
    # # Sorting the words according to the number of appearances, with the most common word being first
    words = sorted(words, key=words.get, reverse=True)
    # Adding padding and unknown to our vocabulary so that they will be assigned an index
    words = ['_PAD'] + words

    # Dictionaries to store the word to index mappings and vice versa
    word2idx = {o:i for i,o in enumerate(words)}
    idx2word = {i:o for i,o in enumerate(words)}

    for i, sentence in enumerate(X_train):
        X_train[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]

    for i, sentence in enumerate(X_val):
        X_val[i] = [word2idx[word.lower()] if word.lower() in word2idx else 0 for word in nltk.word_tokenize(sentence)]

    for i, sentence in enumerate(X_test):
        X_test[i] = [word2idx[word.lower()] if word.lower() in word2idx else 0 for word in nltk.word_tokenize(sentence)]

    train_sentences = pad_input(X_train, seq_len)
    val_sentences = pad_input(X_val, seq_len)
    test_sentences = pad_input(X_test, seq_len)

    return train_sentences, val_sentences, test_sentences, y_train, y_val, y_test, words, idx2word

# Main

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import math
from collections import Counter
import nltk
nltk.download('punkt')
from nltk import word_tokenize  


def main():
    url = "https://raw.githubusercontent.com/calvincxz/CS4248_Project/main/train2.csv"
    train = pd.read_csv(url)
    X = np.array(train['comment_text'])
    y = np.array(train['toxic_label'])

    size = 5000
    X = X[:size]
    y = y[:size]

    train_sentences, val_sentences, test_sentences, \
    y_train, y_val, y_test, vocab, idx2word = preprocess_input(X, y)

    train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(y_train))
    val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(y_val))
    test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(y_test))

    batch_size = 200
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
    val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size, drop_last=True)
    test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)

    output_size = 1
    embedding_dim = 100
    hidden_dim = 256
    n_layers = 2
    vocab_size = len(vocab)

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(device)

    

    model = SentimentNet(vocab, embeddings_dict, output_size, embedding_dim, hidden_dim, n_layers)
    model.to(device)

    lr=0.01
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    epochs = 10
    counter = 0
    print_every = int(0.5 * len(y_train) / batch_size)
    clip = 5
    valid_loss_min = np.Inf

    # Set seed
    torch.manual_seed(1)

    model.train()
    for i in range(epochs):
        h = model.init_hidden(batch_size)
        
        for inputs, labels in train_loader:
            counter += 1
            h = tuple([e.data for e in h])
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            
            if counter % print_every == 0:
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                for val_input, val_label in val_loader:
                    val_h = tuple([each.data for each in val_h])
                    val_input, val_label = val_input.to(device), val_label.to(device)
                    out, val_h = model(val_input, val_h)
                    val_loss = criterion(out.squeeze(), val_label.float())
                    val_losses.append(val_loss.item())
                    
                model.train()
                print("Epoch: {}/{}...".format(i+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))
                if np.mean(val_losses) <= valid_loss_min:
                    # torch.save(model.state_dict(), './state_dict.pt')
                    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                    valid_loss_min = np.mean(val_losses)

    test_losses = []
    h = model.init_hidden(batch_size)
    test_inputs = np.array([])
    outputs = np.array([])
    test_labels = np.array([])

    model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            h = tuple([each.data for each in h])
            inputs, labels = inputs.to(device), labels.to(device)
            output, h = model(inputs, h)
            test_loss = criterion(output.squeeze(), labels.float())
            test_losses.append(test_loss.item())
            pred = torch.round(output.squeeze())

            if len(test_inputs) == 0:
                test_inputs = inputs.cpu().numpy()
            else:
                test_inputs = np.concatenate([test_inputs, inputs.cpu().numpy()])
            outputs = np.concatenate([outputs, pred.cpu().numpy()])
            test_labels = np.concatenate([test_labels, labels.cpu().numpy()])
        
        f1score = f1_score(np.array(test_labels), outputs)
        recall = recall_score(np.array(test_labels), outputs)
        precision = precision_score(np.array(test_labels), outputs)
        accuracy = accuracy_score(np.array(test_labels), outputs)
        print(f"Test F1 score: {f1score}")
        print(f"Test recall: {recall}")
        print(f"Test precision: {precision}")
        print(f"Test accuracy: {accuracy}")
        print("##############################################################")
        # Print some wrong predictions
        for i, input in enumerate(test_inputs[:100]):
            if outputs[i] != test_labels[i]:
              input_no_pad = input[input != 0]
              print(f"Label: {test_labels[i]}, Prediction: {outputs[i]}")
              print(" ".join([idx2word[i] for i in input_no_pad]))
              print()
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
cuda
Fraction of vocab words found in word embedding:  0.8813559322033898
Epoch: 1/10... Step: 10... Loss: 0.337643... Val Loss: 0.308498
Validation loss decreased (inf --> 0.308498).  Saving model ...
Epoch: 1/10... Step: 20... Loss: 0.353720... Val Loss: 0.308544
Epoch: 2/10... Step: 30... Loss: 0.344862... Val Loss: 0.296293
Validation loss decreased (0.308498 --> 0.296293).  Saving model ...
Epoch: 2/10... Step: 40... Loss: 0.254464... Val Loss: 0.283745
Validation loss decreased (0.296293 --> 0.283745).  Saving model ...
Epoch: 3/10... Step: 50... Loss: 0.265329... Val Loss: 0.242124
Validation loss decreased (0.283745 --> 0.242124).  Saving model ...
Epoch: 3/10... Step: 60... Loss: 0.232867... Val Loss: 0.262773
Epoch: 4/10... Step: 70... Loss: 0.230754... Val Loss: 0.212570
Validation loss decreased (0.242124 --> 0.212570).  Saving model ...
Epoch: 4/10... Step: 80... 

# Train Logistic Regression

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import math
from collections import Counter
import nltk
nltk.download('punkt')
from nltk import word_tokenize  


def main():
    url = "https://raw.githubusercontent.com/calvincxz/CS4248_Project/main/train2.csv"
    train = pd.read_csv(url)
    X = np.array(train['comment_text'])
    y = np.array(train['toxic_label'])

    size = 10000
    X = X[:size]
    y = y[:size]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
    X_test, X_val, y_test, y_val = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

    model = LogisticRegression(random_state=0, solver='sag', max_iter=200)
    vectorizer = TfidfVectorizer()
    X_train_tf_idf_matrix = vectorizer.fit_transform(X_train)
    model.fit( X_train_tf_idf_matrix, y_train)

    # test your model
    vectorizer_val = TfidfVectorizer(vocabulary=vectorizer.get_feature_names())
    # vectorizer_val = TfidfVectorizer(vocabulary=vocab)
    X_val_tf_idf_matrix = vectorizer_val.fit_transform(X_val)

    y_pred = model.predict(X_val_tf_idf_matrix)
    score = f1_score(y_val, y_pred, average='macro')
    acc = accuracy_score(y_val, y_pred)
    print('F1 score on validation = {}'.format(score))
    print('accuracy = {}'.format(acc))
 
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
F1 score on validation = 0.7162126068376068
accuracy = 0.932
