In [33]:
import pandas as pd

In [34]:
# Read the tab-separated Sentiment140 sample file into a DataFrame, then
# print its schema (columns, dtypes, non-null counts, memory usage).
df = pd.read_csv(
    '../data/Sentiment140.tenPercent.sample.tweets.tsv',
    sep='\t',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   sentiment_label  160000 non-null  int64 
 1   tweet_text       160000 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [35]:
# Class-balance check: number of rows for each distinct raw sentiment label.
label_column = df['sentiment_label']
frequency_count = label_column.value_counts()

# Show the per-label counts
print(frequency_count)

sentiment_label
4    80000
0    80000
Name: count, dtype: int64


In [36]:
# Derive a 0/1 target from the raw label: rows whose 'sentiment_label' equals 4
# become 1, every other value becomes 0. The comparison is vectorized (no
# per-row Python lambda) and cast to int64 so the resulting column holds
# integers rather than booleans.
df['binary_sentiment'] = df['sentiment_label'].eq(4).astype('int64')

# Count the frequency of each unique value in the new 'binary_sentiment' column
frequency_count = df['binary_sentiment'].value_counts()

# Print the result
print(frequency_count)

binary_sentiment
1    80000
0    80000
Name: count, dtype: int64


In [37]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet_text'],
    df['binary_sentiment'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features: the vocabulary is fitted on the training tweets only
# and then reused, unchanged, to encode the held-out tweets.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the training counts
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

# Predict the held-out rows and report accuracy plus the per-class
# precision / recall / F1 table.
predictions = nb_classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, predictions))


Accuracy: 0.76621875
              precision    recall  f1-score   support

           0       0.74      0.82      0.78     16002
           1       0.80      0.72      0.75     15998

    accuracy                           0.77     32000
   macro avg       0.77      0.77      0.77     32000
weighted avg       0.77      0.77      0.77     32000



In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm  # Import tqdm for the progress bar

# Split again from the DataFrame (same test_size and random_state as the
# Naive Bayes cell) so this cell always starts from pandas objects, even
# though y_train / y_test are rebound to tensors a few lines below.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet_text'],
    df['binary_sentiment'],
    test_size=0.2,
    random_state=42,
)

# Token-count features; the vocabulary comes from the training split only
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Rebind both label Series to PyTorch tensors built from their underlying arrays
y_train, y_test = (
    torch.tensor(labels.values) for labels in (y_train, y_test)
)

# Two-layer fully connected network
class SimpleNN(nn.Module):
    """Feedforward network: a linear layer, a ReLU, then a second linear layer.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
        output_size: Number of values produced for each input row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; no activation is applied to the output."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Set hyperparameters
input_size = X_train_vectorized.shape[1]  # number of columns in the vectorized training matrix
hidden_size = 64
output_size = 2  # Assuming binary classification (0 and 1): one output per class

# Seed PyTorch's global RNG before the layers are constructed. The linear
# layers draw their initial weights from that RNG, so without a seed every
# Restart & Run All starts from different weights and reports different metrics.
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()  # applied to the raw outputs of the model and integer class labels
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20

# Densify the sparse count matrix ONCE, before the loop. The matrix does not
# change between epochs, so rebuilding the dense tensor on every iteration
# only repeated the same expensive conversion.
# NOTE(review): .toarray() materialises the whole matrix in memory; if the
# vocabulary is large this may not fit in RAM - consider mini-batches.
X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)

# NumPy view of the training labels, reused by every metric call below
y_train_np = y_train.numpy()

# Each iteration performs a single full-batch gradient step followed by a
# no-grad pass over the same training data to report metrics.
for epoch in tqdm(range(num_epochs)):
    # model.eval() is called further down in every iteration, so switch back
    # to training mode explicitly; otherwise every optimisation step after
    # the first would run with the module left in eval mode.
    model.train()

    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Evaluate on training set (after the weight update, without tracking gradients)
    model.eval()
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        # Predicted class = index of the largest output along dim 1
        train_predictions = torch.argmax(train_outputs, dim=1).numpy()

    # Calculate metrics for training set
    train_accuracy = accuracy_score(y_train_np, train_predictions)
    train_precision = precision_score(y_train_np, train_predictions)
    train_recall = recall_score(y_train_np, train_predictions)
    train_f1 = f1_score(y_train_np, train_predictions)

    # Print or log the metrics for training set
    print(f'Epoch {epoch + 1}/{num_epochs}: Training Accuracy: {train_accuracy}, Precision: {train_precision}, Recall: {train_recall}, F1: {train_f1}')

# Held-out evaluation: put the model in eval mode and predict the test rows
# without tracking gradients.
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
    test_outputs = model(X_test_tensor)
    # Predicted class = index of the largest output along dim 1
    test_predictions = torch.argmax(test_outputs, dim=1).numpy()

# NumPy view of the test labels, shared by all metric calls
y_test_np = y_test.numpy()

test_accuracy = accuracy_score(y_test_np, test_predictions)

# Metrics with label 0 treated as the positive class
test_precision_0, test_recall_0, test_f1_0 = (
    metric(y_test_np, test_predictions, pos_label=0)
    for metric in (precision_score, recall_score, f1_score)
)
test_support_0 = int((y_test == 0).sum())  # number of test rows labelled 0

# Metrics with label 1 treated as the positive class
test_precision_1, test_recall_1, test_f1_1 = (
    metric(y_test_np, test_predictions, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)
test_support_1 = int((y_test == 1).sum())  # number of test rows labelled 1

# Report overall accuracy followed by one line per class
print(f'Test Accuracy: {test_accuracy}')
print(f'Class 0 - Precision: {test_precision_0}, Recall: {test_recall_0}, F1: {test_f1_0}, Support: {test_support_0}')
print(f'Class 1 - Precision: {test_precision_1}, Recall: {test_recall_1}, F1: {test_f1_1}, Support: {test_support_1}')


  5%|███▌                                                                  | 1/20 [00:18<05:57, 18.81s/it]

Epoch 1/20: Training Accuracy: 0.499984375, Precision: 0.5, Recall: 1.5624511734008312e-05, F1: 3.1248046997062684e-05


 10%|███████                                                               | 2/20 [00:37<05:40, 18.90s/it]

Epoch 2/20: Training Accuracy: 0.515375, Precision: 0.9772286821705426, Recall: 0.031514640167494765, F1: 0.06106015196924288


 15%|██████████▌                                                           | 3/20 [00:56<05:21, 18.91s/it]

Epoch 3/20: Training Accuracy: 0.5994765625, Precision: 0.9595150465468716, Recall: 0.20774350801537453, F1: 0.3415404770161446


 20%|██████████████                                                        | 4/20 [01:16<05:05, 19.09s/it]

Epoch 4/20: Training Accuracy: 0.6842421875, Precision: 0.9420474566105634, Recall: 0.3926596043873629, F1: 0.5542848950694207


 25%|█████████████████▌                                                    | 5/20 [01:35<04:45, 19.04s/it]

Epoch 5/20: Training Accuracy: 0.744890625, Precision: 0.9286026797921794, Recall: 0.5305927939751883, F1: 0.6753171856978085


 30%|█████████████████████                                                 | 6/20 [01:53<04:26, 19.01s/it]

Epoch 6/20: Training Accuracy: 0.7864765625, Precision: 0.9159501826183617, Recall: 0.6308552857723196, F1: 0.7471295208312131


 35%|████████████████████████▌                                             | 7/20 [02:13<04:07, 19.03s/it]

Epoch 7/20: Training Accuracy: 0.810375, Precision: 0.9047803406960633, Recall: 0.6937751945251711, F1: 0.7853517041334301


 40%|████████████████████████████                                          | 8/20 [02:32<03:50, 19.25s/it]

Epoch 8/20: Training Accuracy: 0.8239453125, Precision: 0.8960932276244149, Recall: 0.7328833473953938, F1: 0.8063121524405441


 45%|███████████████████████████████▌                                      | 9/20 [02:51<03:30, 19.13s/it]

Epoch 9/20: Training Accuracy: 0.830703125, Precision: 0.8884525033034797, Recall: 0.7563826130433424, F1: 0.8171153683855179


 50%|██████████████████████████████████▌                                  | 10/20 [03:10<03:10, 19.09s/it]

Epoch 10/20: Training Accuracy: 0.8360390625, Precision: 0.8827297802295578, Recall: 0.7750539045654823, F1: 0.8253949765801143


 55%|█████████████████████████████████████▉                               | 11/20 [03:29<02:51, 19.08s/it]

Epoch 11/20: Training Accuracy: 0.8389140625, Precision: 0.8779204488039445, Recall: 0.7873191462766789, F1: 0.8301551083616834


 60%|█████████████████████████████████████████▍                           | 12/20 [03:49<02:34, 19.36s/it]

Epoch 12/20: Training Accuracy: 0.84115625, Precision: 0.8741047870335469, Recall: 0.7971313396456361, F1: 0.8338454497907949


 65%|████████████████████████████████████████████▊                        | 13/20 [04:08<02:15, 19.34s/it]

Epoch 13/20: Training Accuracy: 0.8429375, Precision: 0.8710735418427726, Recall: 0.8050373425830443, F1: 0.8367545797063791


 70%|████████████████████████████████████████████████▎                    | 14/20 [04:28<01:55, 19.31s/it]

Epoch 14/20: Training Accuracy: 0.8441328125, Precision: 0.8682639736494507, Recall: 0.8113808943470516, F1: 0.838859228986116


 75%|███████████████████████████████████████████████████▊                 | 15/20 [04:47<01:36, 19.28s/it]

Epoch 15/20: Training Accuracy: 0.845125, Precision: 0.8655787627023269, Recall: 0.8171619636886347, F1: 0.8406738249855334


 80%|███████████████████████████████████████████████████████▏             | 16/20 [05:06<01:16, 19.25s/it]

Epoch 16/20: Training Accuracy: 0.846421875, Precision: 0.8641559636041126, Recall: 0.8220836848848474, F1: 0.8425949650887196


 85%|██████████████████████████████████████████████████████████▋          | 17/20 [05:25<00:57, 19.19s/it]

Epoch 17/20: Training Accuracy: 0.847640625, Precision: 0.8630045355173427, Recall: 0.8264897971938376, F1: 0.8443525731068829


 90%|██████████████████████████████████████████████████████████████       | 18/20 [05:44<00:38, 19.19s/it]

Epoch 18/20: Training Accuracy: 0.848890625, Precision: 0.8625353118810274, Recall: 0.8300834348926596, F1: 0.8459982802000064


 95%|█████████████████████████████████████████████████████████████████▌   | 19/20 [06:03<00:19, 19.12s/it]

Epoch 19/20: Training Accuracy: 0.849796875, Precision: 0.8621247412008282, Recall: 0.8327864754226431, F1: 0.8472016912243893


100%|█████████████████████████████████████████████████████████████████████| 20/20 [06:23<00:00, 19.16s/it]

Epoch 20/20: Training Accuracy: 0.850890625, Precision: 0.8625415684628548, Recall: 0.8348332864597982, F1: 0.8484612697303648





Test Accuracy: 0.75021875
Class 0 - Precision: 0.7428588756140457, Recall: 0.765466816647919, F1: 0.7539934135606783, Support: 16002
Class 1 - Precision: 0.7580426793888209, Recall: 0.7349668708588574, F1: 0.7463264464121362, Support: 15998
