In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm  # Import tqdm for the progress bar
import spacy
import re
from multiprocessing import Pool, cpu_count
import os

In [45]:
# !pip install spacy
# !python3 -m spacy download en_core_web_sm

In [67]:
if not os.path.exists('../data/processed_data.csv'):
    
    df_raw = pd.read_csv('../data/Sentiment140.tenPercent.sample.tweets.tsv', sep='\t')


    # Load spaCy English model
    nlp = spacy.load("en_core_web_sm")

    # Function to preprocess text
    def preprocess_text(text):
        # Parse the text using spaCy
        doc = nlp(text)
        
        # Lemmatize each token and remove stop words and punctuation
        preprocessed_text = " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
        
        # Remove URLs
        preprocessed_text = re.sub(r'http\S+', '', preprocessed_text)
        # Remove TAGs
        preprocessed_text = re.sub(r'@\w+', '', preprocessed_text)
        
        # Replace more than one space with a single space
        preprocessed_text = re.sub(r'\s+', ' ', preprocessed_text)
        
        # If preprocessed_text is an empty string, change it to a space
        preprocessed_text = preprocessed_text if preprocessed_text != '' else ' '
        
        return preprocessed_text

    def preprocess_wrapper(args):
        index, row = args
        row['tweet_text_processed'] = preprocess_text(row['tweet_text'])
        return row

    # Apply preprocess_text() to each row using multiprocessing
    num_processes = cpu_count()  # Number of CPU cores
    print(f'{num_processes} cores are using to process the data for accelerating processing time.')

    with Pool(processes=num_processes) as pool:
        result = list(tqdm(pool.imap(preprocess_wrapper, df_raw.iterrows()), total=len(df_raw)))

    # Convert the list of processed rows back to a DataFrame
    df = pd.DataFrame(result)
    df.to_csv('../data/processed_data.csv', index=False)
else:
    df =  pd.read_csv('../data/processed_data.csv')

# Print the original and processed tweet content for the first 10 samples
for index, row in df.head(10).iterrows():
    print(f"For {index} sample, the original  tweet content is: {row['tweet_text']}")
    print(f"For {index} sample, the processed tweet content is: {row['tweet_text_processed']}")

For 0 sample, the original  tweet content is: @elephantbird Hey dear, Happy Friday to You  Already had your rice's bowl for lunch ?
For 0 sample, the processed tweet content is:  hey dear Happy Friday rice bowl lunch
For 1 sample, the original  tweet content is: Ughhh layin downnnn    Waiting for zeina to cook breakfast
For 1 sample, the processed tweet content is: ughhh layin downnnn wait zeina cook breakfast
For 2 sample, the original  tweet content is: @greeniebach I reckon he'll play, even if he's not 100%...but i know nothing!! ;) It won't be the same without him. 
For 2 sample, the processed tweet content is:  reckon play 100% know will
For 3 sample, the original  tweet content is: @vaLewee I know!  Saw it on the news!
For 3 sample, the processed tweet content is:  know see news
For 4 sample, the original  tweet content is: very sad that http://www.fabchannel.com/ has closed down. One of the few web services that I've used for over 5 years 
For 4 sample, the processed tweet conte

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   sentiment_label       160000 non-null  int64 
 1   tweet_text            160000 non-null  object
 2   tweet_text_processed  160000 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [69]:
df['binary_sentiment'] = df['sentiment_label'].apply(lambda x: 1 if x == 4 else 0)
# Count the frequency of each unique value in the 'sentiment_label' column
frequency_count = df['binary_sentiment'].value_counts()
# Print the result
print(frequency_count)

binary_sentiment
1    80000
0    80000
Name: count, dtype: int64


In [70]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(df['tweet_text_processed'], df['binary_sentiment'], test_size=0.2, random_state=42)
print(X_train.isnull().sum())

# Convert text data into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


0


In [71]:
# Train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

# Make predictions
predictions = nb_classifier.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, predictions))

Accuracy: 0.750875
              precision    recall  f1-score   support

           0       0.75      0.76      0.75     16002
           1       0.76      0.74      0.75     15998

    accuracy                           0.75     32000
   macro avg       0.75      0.75      0.75     32000
weighted avg       0.75      0.75      0.75     32000



In [72]:
# Convert labels to PyTorch tensors
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

# Define a simple feedforward neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Set hyperparameters
input_size = X_train_vectorized.shape[1]
hidden_size = 64
output_size = 2  

# Initialize the model, loss function, and optimizer
model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
# Initialize tqdm with the total number of epochs
progress_bar = tqdm(total=num_epochs)

for epoch in tqdm(range(num_epochs)):
    # Convert input data to PyTorch tensor
    X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)

    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Evaluate on training set
    model.eval()
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        train_predictions = torch.argmax(train_outputs, dim=1).numpy()

    # Calculate metrics for training set
    train_accuracy = accuracy_score(y_train.numpy(), train_predictions)
    if (epoch + 1) % 5 == 0:
        # Print or log the metrics for training set
        progress_bar.update(5)
        print(f'Epoch {epoch + 1}/{num_epochs}: Training Accuracy: {train_accuracy}')

# Evaluate on test set
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
    test_outputs = model(X_test_tensor)
    test_predictions = torch.argmax(test_outputs, dim=1).numpy()

print(classification_report(y_test, test_predictions))

100%|██████████| 20/20 [29:07<00:00, 87.40s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

 25%|██▌       | 5/20 [00:47<02:23,  9.57s/it]

Epoch 5/20: Training Accuracy: 0.7592109375


 50%|█████     | 10/20 [01:35<01:35,  9.59s/it]

Epoch 10/20: Training Accuracy: 0.8000703125


 75%|███████▌  | 15/20 [02:25<00:49,  9.84s/it]

Epoch 15/20: Training Accuracy: 0.80275


100%|██████████| 20/20 [03:14<00:00,  9.75s/it]


Epoch 20/20: Training Accuracy: 0.805828125
              precision    recall  f1-score   support

           0       0.73      0.75      0.74     16002
           1       0.74      0.72      0.73     15998

    accuracy                           0.73     32000
   macro avg       0.73      0.73      0.73     32000
weighted avg       0.73      0.73      0.73     32000



In [60]:
df_1 = pd.read_csv('../data/Sentiment140.tenPercent.sample.tweets.tsv', sep='\t')
df_2 =  pd.read_csv('../data/processed_data.csv')
print(df_1.isnull().sum())
print(df_2.isnull().sum())

sentiment_label    0
tweet_text         0
dtype: int64
sentiment_label           0
tweet_text                0
tweet_text_processed    914
dtype: int64
