Not in venv:
pip install huggingface-hub==0.26.2

In venv:
pip install numpy==1.26.4 scipy==1.13.1 gensim==4.3.3 pandas==2.2.3 nltk==3.9.1 transformers==4.46.2

- PyTorch, CPU version:
pip3 install torch torchvision torchaudio
- PyTorch, GPU version (CUDA 12.4 wheels):
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Model naming convention: 
`<EmbeddingType>_<ArchitectureType>_<LayerConfig>_<Activation>_<Extras>_<Task>`

#### Example Components

- **Embedding Type**: Specify the input representation.  
  Examples: `W2V` for Word2Vec, `BERT`, `GloVe`, `TF-IDF`, etc.

- **Architecture Type**: Include the model type.  
  Examples: `NN`, `LSTM`, `GRU`, `TF` (Transformer), etc.

- **Layer Configuration**: Use layer sizes or count.  
  Examples: `128-64-32` for layer sizes or `3L` for 3 layers.

- **Activation Function**: Specify the activation function.  
  Examples: `ReLU`, `LeakyReLU`, `Tanh`, etc.

- **Extras**: Include regularization, dropout, or batch normalization (if relevant).  
  Examples: `DO30` for 30% dropout, `BN` for BatchNorm.

- **Task**: Add a suffix to describe the task.  
  Examples: `MC` for multi-class classification, `SC` for single-class classification.

# LPD - libraries, packages, data

In [32]:
# venv_lic\Scripts\activate
# venv_lic\Scripts\deactivate

# venv_pc\Scripts\activate
# venv_pc\Scripts\deactivate

import pandas as pd
import numpy as np
import re
import string
import os
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import gensim
import gensim.downloader

import nltk
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Show every column and never truncate cell contents when frames are displayed.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [None]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report the PyTorch / CUDA environment this notebook runs in.
print("PyTorch version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU:", "No GPU")
print("CUDA Version:", torch.version.cuda)

In [None]:
# Print the name of every pretrained model listed in gensim-data, e.g.
# 'word2vec-ruscorpora-300'
# 'word2vec-google-news-300'
print(list(gensim.downloader.info()['models']))

In [2]:
# Fetch (or reuse the locally cached copy of) the 300-dimensional Google News
# Word2Vec vectors from gensim-data.
word2vec = gensim.downloader.load(name='word2vec-google-news-300')


In [36]:
# word2vec checking

# word2vec.most_similar('coin')
# word2vec.most_similar(positive=['king', 'woman'], negative=['man'])
# word2vec.most_similar(positive=['swim', 'basketball'], negative=['pool'])

In [3]:
# Locate the data folder inside the user's OneDrive directory.
# os.path.expanduser("~") resolves to %USERPROFILE% on Windows (the location
# the previous os.environ lookup required) and to $HOME elsewhere, so a
# missing USERPROFILE variable no longer raises KeyError. Building the path
# from separate components lets os.path.join pick the right separator.
data_path = os.path.join(
    os.path.expanduser("~"),
    "OneDrive - SGH",
    "1. SGH",
    "Praca licencjacka",
    "files",
    "data",
)

# Load the Reddit sentiment CSV, using its 'Unnamed: 0' column as the index.
data = pd.read_csv(
    os.path.join(data_path, "reddit_sentiment_august2021.csv"),
    index_col='Unnamed: 0',
)

In [4]:
# test_data = data[['body', 'BERT-Sentiment']].head(5)
# test_sentence = test_data.iloc[0]['body']

# Keep only the comment text and its BERT sentiment score. The explicit
# .copy() makes reddit_data an independent frame, so the columns assigned to
# it later do not raise SettingWithCopyWarning or depend on whether pandas
# handed back a view of `data`.
reddit_data = data[['body', 'BERT-Sentiment']].copy()

In [None]:
# Add a three-class `sentiment_label` column derived from 'BERT-Sentiment',
# which ranges from 1 (negative) to 5 (positive):
# Label 0: Positive (scores 4 and 5)
# Label 1: Neutral  (score 3)
# Label 2: Negative (every other value, i.e. scores 1 and 2)
class_dict = {0: "Positive", 1: "Neutral", 2: "Negative"}

# Check for unmatched values
unmatched_values = reddit_data[~reddit_data['BERT-Sentiment'].isin([1, 2, 3, 4, 5])]
if unmatched_values.empty:
    print("All good")
else:
    # The mapping below sends every unexpected score (including NaN) to
    # label 2, so report such rows instead of letting them pass silently.
    print(
        f"Warning: {len(unmatched_values)} rows have a 'BERT-Sentiment' value "
        "outside 1-5 and will be labelled Negative"
    )

# Assign sentiment_label
reddit_data['sentiment_label'] = reddit_data['BERT-Sentiment'].apply(
    lambda score: 0 if score in [4, 5] else 1 if score == 3 else 2
)

# Create np array with labels
labels = np.array(reddit_data['sentiment_label'])

# I. Tokenization and embeddings

In [6]:
# Tokenization
def tokenize_function(text):
    """Split `text` into tokens with NLTK's `word_tokenize` and return them."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

In [20]:
def bert_tokenize_function(text, bert_tokenizer, max_length=128):
    """
    Encode text with a BERT tokenizer using a fixed set of options.

    Args:
        text (str or List[str]): A single text or a list of texts to encode.
        bert_tokenizer: Callable tokenizer (e.g. transformers.BertTokenizer)
            that accepts the keyword options listed below.
        max_length (int): Upper bound on the encoded sequence length.

    Returns:
        Whatever `bert_tokenizer` returns for these options; the rest of this
        notebook reads its `input_ids` and `attention_mask` entries.
    """
    encoding_options = {
        # NOTE(review): with Hugging Face tokenizers `padding=True` pads to the
        # longest sequence in the call, not to `max_length` - confirm that this
        # is the intended behavior.
        "padding": True,
        "truncation": True,          # cut sequences longer than max_length
        "max_length": max_length,    # maximum sequence length
        "return_tensors": "pt",      # return PyTorch tensors
    }
    return bert_tokenizer(text, **encoding_options)

In [8]:
# One character class covering the emoji / pictograph ranges below; the
# trailing "+" makes each match swallow a whole run of such characters.
emoji_pattern = re.compile(
    "[{}]+".format("".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (regional indicator symbols)
        "\U00002700-\U000027BF",  # dingbats
        "\U00002600-\U000026FF",  # miscellaneous symbols
        "\U0001F900-\U0001F9FF",  # supplemental symbols and pictographs
        "\U0001FA70-\U0001FAFF",  # symbols and pictographs extended-A
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows (also spans the two U+26xx/U+27xx ranges above)
    ))),
    flags=re.UNICODE,
)

In [9]:
def clean_body(body, emoji_pattern=emoji_pattern):
    """Lowercase `body` and delete every match of `emoji_pattern` from it."""
    lowered = body.lower()
    return emoji_pattern.sub('', lowered)

In [10]:
# Matches tokens that contain a URL; the dot after "www" is escaped so that
# ordinary words starting with "www" are not mistaken for links.
_URL_TOKEN_RE = re.compile(r'http\S+|www\.\S+')
_URL_PLACEHOLDER = '<URL>'
# Filled on the first clean_tokens call so the stop-word list is read only once
# instead of once per cleaned text.
_CLEAN_TOKENS_CACHE = {}


def _clean_tokens_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _CLEAN_TOKENS_CACHE:
        _CLEAN_TOKENS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TOKENS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TOKENS_CACHE['stop_words'], _CLEAN_TOKENS_CACHE['lemmatizer']


def clean_tokens(tokens, emoji_pattern=emoji_pattern):
    '''
    Normalize a list of word tokens for the Word2Vec pipeline.

    Each token is lowercased and then, in order:
      * dropped if it is an English stop word or a punctuation mark;
      * replaced by the '<URL>' placeholder if it contains a URL (previously
        the placeholder was always removed again by the alphanumeric filter);
      * lemmatized with WordNet;
      * dropped if the lemma is not purely alphanumeric;
      * stripped of characters matched by `emoji_pattern`, and dropped if
        nothing is left (previously an empty string could remain in the list).

    Args:
        tokens: Iterable of string tokens.
        emoji_pattern: Compiled regex whose matches are removed from tokens.

    Returns:
        list[str]: The cleaned tokens in their original order.
    '''
    stop_words, lemmatizer = _clean_tokens_resources()

    cleaned = []
    for raw_token in tokens:
        token = raw_token.lower()
        if token in stop_words or token in string.punctuation:
            continue
        if _URL_TOKEN_RE.search(token):
            # Keep a single marker instead of the raw link text.
            cleaned.append(_URL_PLACEHOLDER)
            continue
        token = lemmatizer.lemmatize(token)
        if not token.isalnum():
            continue
        token = emoji_pattern.sub('', token)
        if token:
            cleaned.append(token)
    return cleaned

In [None]:
# Apply tokenization
# rule based: NLTK tokens first, then token-level cleaning
reddit_data['clean_tokens'] = reddit_data['body'].apply(tokenize_function).apply(clean_tokens)

# light text cleaning only; the BERT tokenizer is run on this column later
reddit_data['clean_body'] = reddit_data['body'].apply(clean_body)

#### Sentence embeddings from word embeddings

In [14]:
# Vector sentence representation
# output per sentence: 1D np array of shape (embedding_dim,); stacking the
# vectors of all sentences gives a 2D array of shape (num_sentences, embedding_dim)

def sentence_to_embedding(tokens, word2vec, embedding_dim=300):
    """
    Convert a list of tokens into one sentence embedding by averaging word vectors.

    Args:
        tokens: Iterable of string tokens.
        word2vec: Word-vector lookup that supports `word2vec[token]` and
            exposes the known vocabulary as `word2vec.key_to_index`.
        embedding_dim (int): Length of the zero vector returned when no token
            is in the vocabulary.

    Returns:
        np.ndarray: The element-wise mean of the vectors of all in-vocabulary
        tokens, or a float32 zero vector of length `embedding_dim` when none
        of the tokens is known. float32 is used for the fallback so it matches
        the dtype gensim stores its vectors in, instead of NumPy's float64
        default.
    """
    known_vectors = [
        word2vec[token] for token in tokens if token in word2vec.key_to_index
    ]
    if not known_vectors:
        return np.zeros(embedding_dim, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

In [None]:
# Build one averaged Word2Vec vector per row from its cleaned tokens
reddit_data['sentence_embedding'] = reddit_data['clean_tokens'].apply(
    sentence_to_embedding, args=(word2vec,)
)

# II. Train and Test sets preparation

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    reddit_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Collect the per-row embedding vectors into one NumPy array per split
sentence_embeddings_train = np.array(list(train_data['sentence_embedding']))
sentence_embeddings_test = np.array(list(test_data['sentence_embedding']))

# Build float32 feature tensors directly on the selected device
X_train = torch.tensor(sentence_embeddings_train, dtype=torch.float32, device=device)
X_test = torch.tensor(sentence_embeddings_test, dtype=torch.float32, device=device)



In [18]:
# Class labels of each split as NumPy arrays (also used by the SVM section)
labels_train = np.asarray(train_data['sentiment_label'].tolist())
labels_test = np.asarray(test_data['sentiment_label'].tolist())

# The same labels as int64 (long) tensors on the selected device
y_train = torch.tensor(labels_train, dtype=torch.long, device=device)
y_test = torch.tensor(labels_test, dtype=torch.long, device=device)

In [22]:
# tokenization and split for BERT
# `bert_tokenizer` is otherwise first created in the "BERT + NN" section
# further down, so running the notebook top to bottom raised NameError here.
# Load it before its first use; the later cell simply loads it again.
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_bert_train = bert_tokenize_function(train_data["clean_body"].tolist(), bert_tokenizer)
X_bert_test = bert_tokenize_function(test_data["clean_body"].tolist(), bert_tokenizer)

In [23]:
# Create TensorDatasets for BERT: each item is an
# (input_ids, attention_mask, label) triple whose tensors live on `device`
train_dataset_bert = TensorDataset(*(
    tensor.to(device)
    for tensor in (X_bert_train["input_ids"], X_bert_train["attention_mask"], y_train)
))
test_dataset_bert = TensorDataset(*(
    tensor.to(device)
    for tensor in (X_bert_test["input_ids"], X_bert_test["attention_mask"], y_test)
))

# III. Word2Vec + NN

In [13]:
# # Model: 5 layers
# # input: vector dim (300,1); sentence vector representation; 
# # output: 1 output neuron, binary classification 

# # Define the model
# class Word2Vec_NN_binary(nn.Module):
#     def __init__(self):
#         super(Word2Vec_NN, self).__init__()
#         self.fc1 = nn.Linear(300, 128) # Input layer to hidden layer 1
#         self.fc2 = nn.Linear(128, 64)  # Hidden layer 1 to hidden layer 2
#         self.fc3 = nn.Linear(64, 32)   # Hidden layer 2 to hidden layer 3
#         self.fc4 = nn.Linear(32, 16)   # Hidden layer 3 to hidden layer 4
#         self.fc5 = nn.Linear(16, 1)    # Hidden layer 4 to output layer
#         self.relu = nn.ReLU()          # ReLU activation
#         self.sigmoid = nn.Sigmoid()    # Sigmoid activation for binary classification

#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         x = self.relu(self.fc2(x))
#         x = self.relu(self.fc3(x))
#         x = self.relu(self.fc4(x))
#         x = self.sigmoid(self.fc5(x))  # Output layer
#         return x

# # Initialize the model
# model = Word2Vec_NN()

In [None]:
# Model: 5 linear layers
# input: batch of sentence vectors, shape (batch_size, 300)
# output: 3 raw class scores (logits) per sentence, one for each class

class W2V_NN_5L_ReLU_MC(nn.Module):
    """
    Feed-forward classifier over 300-dimensional sentence embeddings.

    Layer sizes are 300 -> 128 -> 64 -> 32 -> 16 -> 3 with ReLU between layers.

    `forward` returns raw logits. It used to return softmax probabilities, but
    the model is trained with `nn.CrossEntropyLoss`, which applies log-softmax
    itself and expects unnormalized scores; feeding it probabilities applies
    softmax twice and flattens the gradients. `argmax` over the logits picks
    the same class as `argmax` over the probabilities. Use `predict_proba`
    when probabilities are needed.

    The layers are created on the default device; move the whole module with
    `.to(device)` as the callers in this notebook already do.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(300, 128)  # Input layer to hidden layer 1
        self.fc2 = nn.Linear(128, 64)   # Hidden layer 1 to hidden layer 2
        self.fc3 = nn.Linear(64, 32)    # Hidden layer 2 to hidden layer 3
        self.fc4 = nn.Linear(32, 16)    # Hidden layer 3 to hidden layer 4
        self.fc5 = nn.Linear(16, 3)     # Hidden layer 4 to output layer (3 classes)
        self.relu = nn.ReLU()           # ReLU activation
        self.softmax = nn.Softmax(dim=1)  # Only used by predict_proba

    def forward(self, x):
        """Return the logits of shape (batch_size, 3) for a (batch_size, 300) input."""
        # Follow the module's own device instead of relying on a global.
        x = x.to(self.fc1.weight.device)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        return self.fc5(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits) for `x`."""
        return self.softmax(self.forward(x))

# Instantiate the network, then move it to the selected device
model = W2V_NN_5L_ReLU_MC()
model = model.to(device)

In [None]:
# Create a DataLoader for batching
# `X` and `y` are never defined in this notebook; the training tensors built
# in section II are `X_train` / `y_train`, which are already on `device`.
dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  # Batch size of 32

In [None]:
# Start from a fresh network on the selected device
model = W2V_NN_5L_ReLU_MC()
model = model.to(device)

# Multi-class cross-entropy loss, and Adam (learning rate 0.001) over all model parameters
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=0.001)


In [None]:
# Training loop
num_epochs = 20  # Number of passes over the training data

for epoch in range(num_epochs):
    model.train()      # training mode
    epoch_loss = 0.0   # sum of the batch losses in this epoch

    for batch in dataloader:
        # Each batch is an (inputs, labels) pair; make sure both are on `device`
        inputs, labels_tensor = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()  # drop gradients left over from the previous step

        outputs = model(inputs)
        loss = criterion(outputs, labels_tensor)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean batch loss of the epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(dataloader):.4f}")


In [None]:
# Persist the trained parameters (state dict only, not the whole module)
torch.save(obj=model.state_dict(), f="W2V_NN_5L_ReLU_MC_weights.pth")

In [None]:
# Preprocess the test sentence
# test_sentence = "This is very negative sentence, omg, I hate it"
# test_sentence = "The price is going down, you will lose all your money"
test_sentence = "Bitcoin is going up guys, we will be rich, nice, good"
# test_sentence = "omg bro this can't be real, we lost, price is very low, atl, omg"

test_sentence_token_words = clean_tokens(tokenize_function(test_sentence))  # Tokenize and clean
test_sentence_embedding = sentence_to_embedding(test_sentence_token_words, word2vec)  # Get embedding

# Convert embedding to PyTorch tensor on the model's device
test_sentence_tensor = torch.tensor(
    test_sentence_embedding, dtype=torch.float32, device=device
).unsqueeze(0)  # Shape: (1, 300)

# Load the trained model.
# The class defined in this notebook is W2V_NN_5L_ReLU_MC (the previously used
# name Word2Vec_NN_multi_class does not exist) and its weights were saved as
# "W2V_NN_5L_ReLU_MC_weights.pth", not "model_weights.pth".
model = W2V_NN_5L_ReLU_MC().to(device)
model.load_state_dict(
    torch.load("W2V_NN_5L_ReLU_MC_weights.pth", map_location=device, weights_only=True)
)
model.eval()  # Set the model to evaluation mode

# Get the model's prediction; no gradients are needed for inference
with torch.no_grad():
    test_sentence_output = model(test_sentence_tensor)  # Forward pass
predicted_class = torch.argmax(test_sentence_output, dim=1).item()  # Get the predicted class index
print(f"Predicted Sentiment: {class_dict[predicted_class]}")

# IV. Word2Vec + SVM

In [None]:
# SVM pipeline for the Word2Vec sentence embeddings:
# feature standardization followed by the classifier
svm_W2V = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=1.0, decision_function_shape='ovr'),  # RBF kernel, one-vs-rest decision function shape
)

In [None]:
# Train the pipeline on the Word2Vec sentence embeddings and their labels
svm_W2V.fit(X=sentence_embeddings_train, y=labels_train)

In [None]:
# Preprocess the test sentence
# test_sentence = "This is very negative sentence, omg, I hate it"
# test_sentence = "The price is going down, you will lose all your money"
test_sentence = "Bitcoin price is increasing, well we have it guys, great job, good nice well perfect"
# test_sentence = "omg bro this can't be real, we lost, price is very low, atl, omg"

test_sentence_token_words = clean_tokens(tokenize_function(test_sentence))  # Tokenize and clean
test_sentence_embedding = sentence_to_embedding(test_sentence_token_words, word2vec)  # Get embedding


# Predict the class with the pipeline fitted above. It is named `svm_W2V`;
# `svm_model` is not defined anywhere in this notebook.
predicted_class = svm_W2V.predict([test_sentence_embedding])[0]
print("Predicted Class:", class_dict[predicted_class])

# V. BERT + NN

In [None]:
from transformers import BertModel, BertTokenizer

# Pretrained uncased BERT-base encoder and its matching tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [24]:
# # testing plain BERT
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

In [25]:
class BERT_NN_5L_ReLU_MC_DO30(nn.Module):
    """
    Frozen BERT encoder followed by a 5-layer ReLU classifier with 30% dropout.

    The encoder is used as a fixed feature extractor: it is always run under
    `torch.no_grad()` and is kept in evaluation mode even while the wrapper is
    in training mode. Previously `model.train()` also switched the encoder to
    training mode, which enabled its internal dropout and made the "frozen"
    features noisy from step to step.

    Args:
        bert_model: Encoder module exposing `config.hidden_size` and returning
            an object with a `pooler_output` attribute when called with
            `input_ids` and `attention_mask` (e.g. transformers.BertModel).
        num_classes (int): Number of output classes.

    The layers are created on the default device; move the whole module with
    `.to(device)` as the caller in this notebook already does.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model  # Pre-trained BERT model
        hidden_size = bert_model.config.hidden_size  # Typically 768 for BERT-base

        # 5-layer feed-forward head
        self.fc1 = nn.Linear(hidden_size, 512)  # Layer 1
        self.fc2 = nn.Linear(512, 256)          # Layer 2
        self.fc3 = nn.Linear(256, 128)          # Layer 3
        self.fc4 = nn.Linear(128, 64)           # Layer 4
        self.fc5 = nn.Linear(64, num_classes)   # Output layer

        # Activation function and dropout
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.3)  # Dropout for regularization

        self.bert.eval()

    def train(self, mode=True):
        """Switch the classifier head's mode while keeping the encoder in eval mode."""
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return the class logits, shape (batch_size, num_classes)."""
        # Follow the module's own device instead of relying on a global.
        target_device = self.fc1.weight.device

        # No gradients flow into the encoder, so its weights stay frozen
        with torch.no_grad():
            outputs = self.bert(
                input_ids=input_ids.to(target_device),
                attention_mask=attention_mask.to(target_device),
            )
        pooled_output = outputs.pooler_output  # Pooled [CLS] representation

        # Pass through the 5-layer NN
        x = self.dropout(self.activation(self.fc1(pooled_output)))  # Layer 1
        x = self.dropout(self.activation(self.fc2(x)))              # Layer 2
        x = self.dropout(self.activation(self.fc3(x)))              # Layer 3
        x = self.dropout(self.activation(self.fc4(x)))              # Layer 4
        logits = self.fc5(x)                                        # Output layer

        return logits


In [26]:
# Freeze BERT weights: no encoder parameter takes part in gradient computation
for frozen_param in bert_model.parameters():
    frozen_param.requires_grad_(False)

# Build the sentiment classifier on top of the encoder
num_classes = 3  # For sentiment analysis (e.g., positive, negative, neutral)
sentiment_model = BERT_NN_5L_ReLU_MC_DO30(bert_model, num_classes)
sentiment_model = sentiment_model.to(device)

# Cross-entropy loss and Adam (learning rate 1e-3) over the model's parameters
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)
optimizer = torch.optim.Adam(params=sentiment_model.parameters(), lr=1e-3)

# Batched loaders; only the training data is shuffled
batch_size = 8
train_dataloader = DataLoader(dataset=train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset_bert, batch_size=batch_size)


In [None]:
# BERT based model training loop

epochs = 5  #10

for epoch in range(epochs):
    sentiment_model.train()  # Set model to training mode
    total_loss = 0
    # The label batch is named `batch_labels` so the loop no longer overwrites
    # the module-level `labels` array created during label preparation.
    for input_ids, attention_mask, batch_labels in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        # Make sure the batch is on the selected device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        # Forward pass
        logits = sentiment_model(input_ids, attention_mask)

        # Compute loss
        loss = loss_fn(logits, batch_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {total_loss / len(train_dataloader):.4f}")


In [28]:
# Persist the trained parameters (state dict only, not the whole module)
torch.save(obj=sentiment_model.state_dict(), f="BERT_NN_5L_ReLU_MC_DO30_weights.pth")

In [None]:
# Example input text
text = "Its acceptable"
encoded_input = bert_tokenize_function(text, bert_tokenizer)

# Only the token ids and the attention mask are passed to the model
input_ids, attention_mask = encoded_input["input_ids"], encoded_input["attention_mask"]

# Inference: evaluation mode, no gradient tracking
sentiment_model.eval()
with torch.no_grad():
    logits = sentiment_model(input_ids=input_ids, attention_mask=attention_mask)

# Turn the logits into probabilities along the class dimension
probabilities = logits.softmax(dim=1)

print("Logits:", logits)
print("Probabilities:", probabilities)


# Pair each class name with its probability, formatted as a percentage
class_labels = ['Positive', 'Neutral', 'Negative']
probabilities_dict = {
    label_name: f"{class_probability.item() * 100:.2f} %"
    for label_name, class_probability in zip(class_labels, probabilities[0])
}

print(probabilities_dict)

# VI. BERT + SVM

In [None]:
# SVM pipeline for this section:
# feature standardization followed by the classifier
svm_BERT = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=1.0, decision_function_shape='ovr'),  # RBF kernel, one-vs-rest decision function shape
)

In [None]:
# Fit the model
# scikit-learn works on NumPy arrays, while X_train / y_train are torch
# tensors that may live on the GPU, so copy them to host memory first.
# NOTE(review): X_train holds the averaged Word2Vec sentence embeddings built
# in section II, not BERT features - confirm whether BERT embeddings were
# meant to be used in this section.
svm_BERT.fit(X_train.cpu().numpy(), y_train.cpu().numpy())

In [None]:
# Preprocess the test sentence
# test_sentence = "This is very negative sentence, omg, I hate it"
# test_sentence = "The price is going down, you will lose all your money"
test_sentence = "Bitcoin price is increasing, well we have it guys, great job, good nice well perfect"
# test_sentence = "omg bro this can't be real, we lost, price is very low, atl, omg"

# NOTE(review): the sentence is embedded with Word2Vec here, which matches the
# 300-dimensional features `svm_BERT` is currently fitted on - revisit both
# together if this section is switched to BERT features.
test_sentence_token_words = clean_tokens(tokenize_function(test_sentence))  # Tokenize and clean
test_sentence_embedding = sentence_to_embedding(test_sentence_token_words, word2vec)  # Get embedding


# Predict the class with this section's pipeline. It is named `svm_BERT`;
# `svm_model` is not defined anywhere in this notebook.
predicted_class = svm_BERT.predict([test_sentence_embedding])[0]
print("Predicted Class:", class_dict[predicted_class])