In [7]:
import os
import pandas as pd
import re
import numpy as np
from multiprocessing import Pool, cpu_count
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification, BertTokenizer
from xgboost import XGBClassifier



In [None]:
# !pip install spacy
# !python3 -m spacy download en_core_web_sm
# !pip install xgboost
# ! pip install transformers

In [8]:
# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

if not os.path.exists('../data/processed_data.csv'):
    
    df_raw = pd.read_csv('../data/Sentiment140.tenPercent.sample.tweets.tsv', sep='\t')


    # Load a pre-trained tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def preprocess_text(text):
        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        # Remove TAGs
        text = re.sub(r'@\w+', '', text)
        # Replace more than one space with a single space
        text = re.sub(r'\s+', ' ', text)
        # If text is an empty string, change it to a space
        text = text if text != '' else ' '

        # Tokenize using Hugging Face tokenizer
        tokens = tokenizer.encode(text, add_special_tokens=True)

        # Convert tokens to string and remove special tokens
        preprocessed_text = tokenizer.decode(tokens, skip_special_tokens=True)

        return preprocessed_text

    def preprocess_wrapper(args):
        index, row = args
        row['tweet_text_processed'] = preprocess_text(row['tweet_text'])
        return row

    # Apply preprocess_text() to each row using multiprocessing
    num_processes = cpu_count()  # Number of CPU cores
    print(f'{num_processes} cores are using to process the data for accelerating processing time.')

    with Pool(processes=num_processes) as pool:
        result = list(tqdm(pool.imap(preprocess_wrapper, df_raw.iterrows()), total=len(df_raw)))

    # Convert the list of processed rows back to a DataFrame
    df = pd.DataFrame(result)
    df.to_csv('../data/processed_data.csv', index=False)
else:
    df =  pd.read_csv('../data/processed_data.csv')

# Print the original and processed tweet content for the first 10 samples
for index, row in df.head(10).iterrows():
    print(f"For {index} sample, the original  tweet content is: {row['tweet_text']}")
    print(f"For {index} sample, the processed tweet content is: {row['tweet_text_processed']}")

For 0 sample, the original  tweet content is: @elephantbird Hey dear, Happy Friday to You  Already had your rice's bowl for lunch ?
For 0 sample, the processed tweet content is: hey dear, happy friday to you already had your rice's bowl for lunch?
For 1 sample, the original  tweet content is: Ughhh layin downnnn    Waiting for zeina to cook breakfast
For 1 sample, the processed tweet content is: ughhh layin downnnn waiting for zeina to cook breakfast
For 2 sample, the original  tweet content is: @greeniebach I reckon he'll play, even if he's not 100%...but i know nothing!! ;) It won't be the same without him. 
For 2 sample, the processed tweet content is: i reckon he'll play, even if he's not 100 %... but i know nothing!! ; ) it won't be the same without him.
For 3 sample, the original  tweet content is: @vaLewee I know!  Saw it on the news!
For 3 sample, the processed tweet content is: i know! saw it on the news!
For 4 sample, the original  tweet content is: very sad that http://www.f

In [9]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159733 entries, 0 to 159999
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   sentiment_label       159733 non-null  int64 
 1   tweet_text            159733 non-null  object
 2   tweet_text_processed  159733 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.9+ MB


In [10]:
# Assuming you have df with 'tweet_text_processed' and 'binary_sentiment' columns
X_train, X_test, y_train, y_test = train_test_split(df['tweet_text_processed'], df['sentiment_label'], test_size=0.2, random_state=42)

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Use CountVectorizer to convert text to numerical features
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

## Regular Methods:

In [11]:
def test_and_evaluate_models(X_train_vectorized, X_test_vectorized, y_train_encoded, y_test_encoded):
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Multinomial Naive Bayes': MultinomialNB(),
        # 'Support Vector Machine': SVC(kernel='linear'),
        # 'Random Forest': RandomForestClassifier(),
        # 'XGBoost': XGBClassifier()
    }

    for model_name, model in models.items():
        print(f"Training and evaluating {model_name}...")
        model.fit(X_train_vectorized, y_train_encoded)
        predictions = model.predict(X_test_vectorized)

        accuracy = accuracy_score(y_test_encoded, predictions)
        print(f'Accuracy for {model_name}: {accuracy}')
        
        print(f'Classification Report for {model_name}:\n{classification_report(y_test_encoded, predictions)}')
        print('-' * 50)

# Assuming X_train, X_test, y_train, and y_test are already defined
test_and_evaluate_models(X_train_vectorized, X_test_vectorized, y_train_encoded, y_test_encoded)

Training and evaluating Logistic Regression...
Accuracy for Logistic Regression: 0.782139167996995
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.78      0.77      0.78     15865
           1       0.78      0.79      0.79     16082

    accuracy                           0.78     31947
   macro avg       0.78      0.78      0.78     31947
weighted avg       0.78      0.78      0.78     31947

--------------------------------------------------
Training and evaluating Multinomial Naive Bayes...
Accuracy for Multinomial Naive Bayes: 0.7705574858359158
Classification Report for Multinomial Naive Bayes:
              precision    recall  f1-score   support

           0       0.75      0.80      0.78     15865
           1       0.79      0.74      0.76     16082

    accuracy                           0.77     31947
   macro avg       0.77      0.77      0.77     31947
weighted avg       0.77      0.77      0.77  

## Regular NN

In [None]:
# Convert labels to PyTorch tensors
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

# Define a simple feedforward neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

# Set hyperparameters
input_size = X_train_vectorized.shape[1]
hidden_size = 64
output_size = 2  

# Initialize the model, loss function, and optimizer
model = SimpleNN(input_size, hidden_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
# Initialize tqdm with the total number of epochs
progress_bar = tqdm(total=num_epochs)

for epoch in tqdm(range(num_epochs)):
    # Convert input data to PyTorch tensor
    X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)

    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Evaluate on training set
    model.eval()
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        train_predictions = torch.argmax(train_outputs, dim=1).numpy()

    # Calculate metrics for training set
    train_accuracy = accuracy_score(y_train.numpy(), train_predictions)
    if (epoch + 1) % 5 == 0:
        # Print or log the metrics for training set
        progress_bar.update(5)
        print(f'Epoch {epoch + 1}/{num_epochs}: Training Accuracy: {train_accuracy}')

# Evaluate on test set
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
    test_outputs = model(X_test_tensor)
    test_predictions = torch.argmax(test_outputs, dim=1).numpy()

print(classification_report(y_test, test_predictions))

## BERT

In [6]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(type(X_train), type(X_test) , type(y_train), type(y_test))

(128000,) (32000,) (128000,) (32000,)
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [7]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set init_weights to False
model.init_weights = False

# Move model to CUDA
model = model.to('cuda')

# Tokenize and pad sequences
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt')
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')

# Convert data to PyTorch tensors
train_dataset = TensorDataset(X_train_tokens['input_ids'], 
                              X_train_tokens['attention_mask'], 
                              torch.tensor(y_train.values).to('cuda'))
test_dataset = TensorDataset(X_test_tokens['input_ids'], 
                             X_test_tokens['attention_mask'], 
                             torch.tensor(y_test.values).to('cuda'))

# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Move optimizer to CUDA manually
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to('cuda')

criterion = torch.nn.CrossEntropyLoss()

num_epochs = 20

# Initialize tqdm with the total number of epochs
progress_bar = tqdm(total=num_epochs)

for epoch in tqdm(range(num_epochs)):
    model.train()
    for batch in train_dataloader:
        inputs, attention_mask, labels = batch
        # Move data to CUDA
        inputs, attention_mask, labels = inputs.to('cuda'), attention_mask.to('cuda'), labels.to('cuda')

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 5 == 0:
        # Print or log the metrics for the training set
        progress_bar.update(5)
        print(f'Epoch {epoch + 1}/{num_epochs}: Training Accuracy: {train_accuracy}')

# Evaluate on the test set
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        inputs, attention_mask, labels = batch
        # Move data to CUDA
        inputs, attention_mask, labels = inputs.to('cuda'), attention_mask.to('cuda'), labels.to('cuda')

        outputs = model(inputs, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy for BERT: {accuracy}')
print(f'Classification Report for BERT:\n{classification_report(y_test, predictions)}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 2/20 [45:12<6:46:53, 1356.30s/it]


KeyboardInterrupt: 

## LSTM

In [12]:

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.float32)

# Create DataLoader for training and testing sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take the output of the last time step
        output = self.fc(lstm_out)
        output = self.sigmoid(output)
        return output

# Instantiate the model, loss function, and optimizer
input_size = X_train_tensor.size(1)
hidden_size = 100
output_size = 1
model = LSTMModel(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs.long())
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'Loss': loss.item()})

# Evaluate the model on the test set
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for inputs, labels in tqdm(test_loader, desc='Testing', leave=False):
        outputs = model(inputs.long())
        predicted = (outputs.squeeze() > 0.5).float()
        predictions.extend(predicted.numpy())
        true_labels.extend(labels.numpy())

# Convert predictions to binary values
predictions = (np.array(predictions) > 0.5).astype(int)

# Compute confusion matrix and classification report
conf_matrix = confusion_matrix(true_labels, predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Display classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions))


Epoch 1/10:   0%|          | 0/3994 [47:15<?, ?it/s, Loss=0.694]