In [1]:
from transformers import BertModel, BertTokenizer
import torch

# Load pre-trained model tokenizer (vocabulary) and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Set the model to inference model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [2]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from a txt file
df = pd.read_csv('data3.txt', sep='\t', quoting=3, engine='python', on_bad_lines='skip')
print(df.shape)


(2119, 2)


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Stopwords and punctuation setup
stop_words = set(stopwords.words('english'))

# Clean text
def clean_text(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

df['Processed_Text'] = df['Text'].apply(clean_text)

In [4]:
def get_bert_embeddings(text):
    # Encode text using tokenizer
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Get output from BERT model
    with torch.no_grad():
        output = model(**encoded_input)
    # Get embeddings from the last hidden state
    embeddings = output.last_hidden_state[:, 0, :].squeeze().numpy()  # Taking the [CLS] token representation
    return embeddings

# Apply function to text data (may need to batch this operation depending on dataset size)
df['bert_embeddings'] = df['Text'].apply(get_bert_embeddings)

In [5]:
X = np.vstack(df['bert_embeddings'])
y = df['Cause'].values

In [None]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a more extensive parameter grid
parameter_grid = {
    'C': [0.1, 1, 2,3,4,5,6,7,8,9, 10,20,30,40,50, 100],  # Regularization parameter
    'gamma': [0.001, 0.01, 0.1, 1,2,3,4,5,6,7,8,9,10, 'scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Different types of kernels
    'degree': [2, 3, 4,5,6,7],  # Degree of the polynomial kernel function (if 'poly' kernel is used)
    'class_weight': [None, 'balanced']  # Use 'balanced' to adjust weights inversely proportional to class frequencies
}

svm_model = SVC()

# Setup GridSearchCV
grid_search = GridSearchCV(svm_model, parameter_grid, cv=10, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Fitting 10 folds for each of 11520 candidates, totalling 115200 fits


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM with the best parameters
best_svm = SVC(**grid_search.best_params_)
best_svm.fit(X_train, y_train)
y_pred = best_svm.predict(X_test)

# Performance report
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d")
plt.title('Confusion Matrix')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()