In [36]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sfip2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sfip2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Load data from CSV file
data = pd.read_csv("input/twitter_sentiment_data.csv")

# Define features
message_text = data["message"]
sentiment_value = (data["sentiment"]>0.5).astype(int)

In [5]:
# Tokenization
tokenizer = get_tokenizer("basic_english")  # Basic tokenizer for English text
tokenized_data = [tokenizer(text.lower()) for text in message_text]

In [32]:
def preprocess_text_sklearn(text):

    # Define a regular expression pattern to match "RT" at the beginning of the string followed by any characters up to ":"
    pattern = r'^RT.*?:'

    # Use re.sub() to replace the matched pattern with an empty string
    text = re.sub(pattern, '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    return text

In [33]:
# Load stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from tokenized data
filtered_data = []

# Print example before
print("Before data processing and stemming:", tokenized_data[0])
tokenized_data[0][0] = "running"

for sentence in tokenized_data:
    # Remove stopwords & empty string from the sentence
    filtered_sentence = [preprocess_text_sklearn(word) for word in sentence if word.lower() not in stop_words]
    filtered_sentence = [word for word in filtered_sentence if word.strip() != ""]
    filtered_data.append(filtered_sentence)

# Print example after 
print("After data processing and stemming:", filtered_data[0])

Before data processing and stemming: ['running', 'climate', 'change', 'is', 'an', 'interesting', 'hustle', 'as', 'it', 'was', 'global', 'warming', 'but', 'the', 'planet', 'stopped', 'warming', 'for', '15', 'yes', 'while', 'the', 'suv', 'boom']
After data processing and stemming: ['running', 'climate', 'change', 'interesting', 'hustle', 'global', 'warming', 'planet', 'stopped', 'warming', 'yes', 'suv', 'boom']


In [None]:
# Hyperparameters
embedding_name = 'glove'  # or 'word2vec'
embedding_dim = 100
num_filters = 100
filter_sizes = [3, 4, 5]
output_dim = 1
dropout = 0.5
learning_rate = 0.001
num_epochs = 10

In [37]:
# Word Embedding
import torchtext.vocab as vocab

# Load pre-trained GloVe embeddings
glove = vocab.GloVe(name='6B', dim=100)

# Convert tokenized/filtered data to embeddings
embedded_data = [[glove[word] for word in sentence if word in glove.stoi] for sentence in filtered_data]

# Print Example
print("Example embedded data:", embedded_data[0])

.vector_cache\glove.6B.zip: 0.00B [00:00, ?B/s]

.vector_cache\glove.6B.zip: 862MB [03:04, 4.67MB/s]                               
100%|█████████▉| 399999/400000 [00:22<00:00, 17649.78it/s]


Example embedded data: [tensor([ 0.2349,  0.1522,  0.1735, -1.0908,  0.6519,  0.3014,  0.1010,  0.4653,
        -0.2558, -0.2544,  0.1938,  0.3013,  0.3270, -0.1221, -0.0589, -0.6082,
        -0.1679,  0.3410,  0.0249, -0.3581,  0.2928,  0.8181,  0.5304, -0.2610,
         0.0748, -0.0260, -0.8331, -0.1931,  0.1522,  0.5277, -0.2359,  0.3234,
         0.2630, -0.1916,  0.3356,  0.3917, -0.4810, -0.0099,  0.0861,  0.8557,
         0.0923, -0.1227,  0.1656, -0.2582, -0.3056,  0.1681,  0.2267, -0.9251,
         0.1304, -0.3469,  0.0930,  0.3256,  0.0779,  1.2362,  0.3093, -2.6562,
        -0.0140,  0.4263,  1.3273,  0.2038, -0.4049,  0.1124,  0.1576,  0.4148,
         0.9911,  0.7153,  1.0092, -0.2155,  0.2553,  0.9857, -0.0460, -0.0483,
        -0.2066, -0.0414,  0.0140, -0.0450, -0.2145,  0.7718, -0.0933, -0.3504,
         0.5339,  0.1501, -0.5803,  0.0945, -1.1608,  0.1794, -0.0718,  0.0655,
         0.0667, -0.4064, -0.2691,  0.2845, -0.4789,  0.1755, -0.4497, -0.2814,
         0.0990,