Load data from the data set into test, training and validation sets

In [None]:
import pandas as pd
import numpy as np
import torch
from datetime import datetime

# Read each pre-split CSV file into its own DataFrame.
training = pd.read_csv("datasets/training.csv")
validation = pd.read_csv("datasets/validation.csv")
test = pd.read_csv("datasets/test.csv")

# Report the (rows, columns) shape of every split as a quick sanity check.
print("Dataset information: ")
print(
    f'Training data: {training.shape}',
    f'Validation data: {validation.shape}',
    f'Test data: {test.shape}',
    sep="\n",
)

Load tokenizer, model and create functions needed to process text

In [None]:
import string
import re
from transformers import DistilBertTokenizer, DistilBertModel
import contractions

# The encoder and its tokenizer are both loaded from the same cased
# DistilBERT checkpoint so that token ids match the model's vocabulary.
model = DistilBertModel.from_pretrained('distilbert-base-cased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


def preprocess_text(text):
    """Clean a raw text and tokenize it for DistilBERT.

    Cleaning steps, applied in order:
      1. strip URLs,
      2. drop every whitespace-delimited token that contains one of the
         HTML/URL fragments href, img, www, http, width, height or src,
      3. expand contractions via ``contractions.fix``,
      4. delete all ASCII punctuation characters.

    Args:
        text: The raw input string.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of PyTorch tensors for a
        single sequence, padded or truncated to 512 tokens.
    """
    # remove links
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # remove words like href, img, www, http, width, height, src
    text = re.sub(r"\S*(href|img|www|http|width|height|src)\S*", "", text)
    # expand contractions into their full form
    text = contractions.fix(text)
    # remove special characters
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Special tokens must be added: extract_features() uses the hidden state
    # at position 0 as the sentence representation, and that position only
    # holds the [CLS] token when the tokenizer inserts it. With
    # add_special_tokens=False position 0 was just the first word piece.
    # NOTE: features/models cached on disk from runs without [CLS] are not
    # comparable with the output of this function and must be regenerated.
    encoded_input = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return encoded_input['input_ids'], encoded_input['attention_mask']

def extract_features(inputs, masks):
    """Encode a batch with the DistilBERT model and return first-token features.

    Args:
        inputs: Token-id tensor passed to the model as its first argument.
        masks: Attention-mask tensor passed as ``attention_mask``.

    Returns:
        A NumPy array holding, for every sequence in the batch, the model's
        first output taken at sequence position 0.
    """
    print(f'Extract features - {datetime.now()}')
    # Inference only: no gradients are needed, which also allows .numpy() below.
    with torch.no_grad():
        hidden_states = model(inputs, attention_mask=masks)[0]
        first_token_state = hidden_states[:, 0, :]
    print(f'Finish extract features - {datetime.now()}')
    return first_token_state.numpy()

Perform batch text preprocessing

In [None]:
# Preprocess training dataset
batch_size = 500
# Ceiling division so the final, partially filled batch is processed as well;
# floor division silently dropped up to batch_size - 1 trailing rows.
num_batches = (len(training) + batch_size - 1) // batch_size
training_features = []
for i in range(num_batches):
    print('Begin preprocess batch ' + str(i) + ' - ' + str(datetime.now()))
    # Positional row window of the current batch (the last one may be shorter)
    batch_data = training[i * batch_size : (i + 1) * batch_size]['text']

    # Tokenize every text of the current batch
    batch_inputs = []
    batch_masks = []
    print('Begin preprocess text in batch ' + str(i) + ' - ' + str(datetime.now()))
    for text in batch_data:
        input_ids, attention_mask = preprocess_text(text)
        batch_inputs.append(input_ids)
        batch_masks.append(attention_mask)

    # Stack the per-text tensors along the batch dimension
    batch_inputs = torch.cat(batch_inputs, dim=0)
    batch_masks = torch.cat(batch_masks, dim=0)

    pooled_output = extract_features(batch_inputs, batch_masks)
    training_features.append(pooled_output)
    print('Finish preprocess batch ' + str(i) + ' - ' + str(datetime.now()))
# obtain final features and labels
training_features = np.concatenate(training_features, axis=0)
training_labels = training['label'].values
# Every row is processed now, so features and labels must line up one-to-one.
assert training_features.shape[0] == len(training_labels), \
    'number of feature rows does not match number of labels'

# save the features and labels
np.save('models/training_features-3.npy', training_features)
np.save('models/training_labels-3.npy', training_labels)

Choose a suitable subset of the training data to find the best parameters for the SVM model

In [None]:
from sklearn.svm import SVC

# load the training features and labels
training_features = np.load('models/training_features-3.npy')
training_labels = np.load('models/training_labels-3.npy')

subset_size = 0.3
# Draw the subset from a seeded generator so the same rows (and therefore the
# same grid-search result) are selected on every run of the notebook.
subset_rng = np.random.default_rng(42)
subset_indices = subset_rng.choice(
    len(training_features),
    int(subset_size * len(training_features)),
    replace=False,
)
subset_features = training_features[subset_indices]
subset_labels = training_labels[subset_indices]

# compute the class distribution of the entire training set
training_class_distribution = np.bincount(training_labels)

# compute the class distribution of the subset; minlength keeps both arrays
# the same length even if the subset happens to miss the highest class
subset_class_distribution = np.bincount(
    subset_labels, minlength=len(training_class_distribution)
)

# compare the class distributions
print('Class distribution of the entire training set: ', training_class_distribution)
print('Class distribution of the subset: ', subset_class_distribution)

# visualize plots of the class distributions (subset bars are drawn on top of
# the training-set bars at the same class positions)
import matplotlib.pyplot as plt
plt.bar(np.arange(len(training_class_distribution)), training_class_distribution, label='Training set')
plt.bar(np.arange(len(subset_class_distribution)), subset_class_distribution, label='Subset')
plt.xlabel('Class label')
plt.ylabel('Number of samples')
plt.title('Class distribution: training set vs. subset')
plt.legend()
plt.show()


Perform grid search to find the best parameters for the SVM model

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values explored exhaustively by the grid search
parameters = {
    'C': [ 1, 50, 75, 0.1],
    'gamma': [1, 0.01, 'auto' , 'scale'],
    'kernel': ['rbf', 'poly', 'linear'],
}

# Unconfigured SVM that the search clones and fits for every combination
final_svm_classifier = SVC()

# 5-fold cross-validated search, fitted on the sampled subset only
print(f'Begin grid search - {datetime.now()}')
grid_search = GridSearchCV(estimator=final_svm_classifier, param_grid=parameters, cv=5)
print(f'Training SVM to find the best hyperparameters - {datetime.now()}')
grid_search.fit(subset_features, subset_labels)
print(f'Finished looking for best hyperparameters - {datetime.now()}')
print('Best hyperparameters: ', grid_search.best_params_)


Train the final SVM model with the best parameters

In [None]:
import joblib
# Start from the estimator the grid search selected as best ...
final_svm_classifier = grid_search.best_estimator_
# ... and refit that same estimator object on the complete training set
print(f'Begin training final SVM model - {datetime.now()}')
final_svm_classifier.fit(X=training_features, y=training_labels)
print(f'Finished training final SVM model - {datetime.now()}')

Save the generated model for later use

In [None]:
# Persist the fitted SVM classifier to disk so later cells can reload it
import joblib
joblib.dump(value=final_svm_classifier, filename='models/model-3.pkl')

Process test data

In [None]:
test_labels = []
test_features = []
# Extract features for test dataset
test_batch_size = 200
# Ceiling division so the final, partially filled batch is evaluated as well;
# floor division silently excluded up to test_batch_size - 1 test rows.
num_batches = (len(test) + test_batch_size - 1) // test_batch_size

for i in range(num_batches):
    print('Begin processing for test batch ' + str(i) + ' - ' + str(datetime.now()))
    # Positional row window of the current batch (the last one may be shorter)
    test_batch_data = test[i * test_batch_size : (i + 1) * test_batch_size]['text']
    test_batch_labels = test[i * test_batch_size : (i + 1) * test_batch_size]['label']
    # Collect labels batch by batch so they stay aligned with the features
    test_labels.extend(test_batch_labels)
    test_batch_inputs = []
    test_batch_masks = []
    for text in test_batch_data:
        input_ids, attention_mask = preprocess_text(text)
        test_batch_inputs.append(input_ids)
        test_batch_masks.append(attention_mask)
    # Stack the per-text tensors along the batch dimension
    test_batch_inputs = torch.cat(test_batch_inputs, dim=0)
    test_batch_masks = torch.cat(test_batch_masks, dim=0)
    pooled_output = extract_features(test_batch_inputs, test_batch_masks)
    test_features.append(pooled_output)
    print('End processing for test batch ' + str(i) + ' - ' + str(datetime.now()))

test_labels = torch.tensor(test_labels)
all_test_features = np.concatenate(test_features, axis=0)
# Every row is processed now, so features and labels must line up one-to-one.
assert all_test_features.shape[0] == len(test_labels), \
    'number of feature rows does not match number of labels'

# save the extracted test features and test labels for later use
np.save('models/test_features-2.npy', all_test_features)
np.save('models/test_labels-2.npy', test_labels)

Predict for the test set

In [None]:
from sklearn.metrics import classification_report
# Reload the cached test-set labels and features from disk
test_labels = np.load('models/test_labels-2.npy')
all_test_features = np.load('models/test_features-2.npy')

# Restore the persisted SVM and classify every test sample
print(f'Begin prediction - {datetime.now()}')
loaded_svm_classifier = joblib.load(filename='models/model-3.pkl')
predictions = loaded_svm_classifier.predict(all_test_features)
print(f'Finish prediction - {datetime.now()}')

# Per-class precision / recall / F1 summary
report = classification_report(y_true=test_labels, y_pred=predictions)
print(report)

In [None]:
# Smoke test: classify one hand-written sentence end to end
# (clean + tokenize -> DistilBERT features -> SVM prediction).
test_case1 = 'i do feel completely isolated'
# NOTE(review): this loads 'models/model-7-over.pkl', while the notebook saves
# its classifier as 'models/model-3.pkl'. Confirm the file exists and that the
# model was trained on features produced by the same preprocessing.
loaded_svm_classifier = joblib.load('models/model-7-over.pkl')
input_ids, attention_mask = preprocess_text(test_case1)
# NOTE: rebinds `test_features`, which the test-processing cell used for its
# list of per-batch feature arrays.
test_features = extract_features(input_ids, attention_mask)
prediction = loaded_svm_classifier.predict(test_features)
print("Prediction: ", prediction)