In [None]:
# !pip install transformers



In [2]:
import torch

In [3]:

sentences = [
    'Today is a sunny day',
    'Today is a rainy day'
]

# Tokenization function
def tokenize(text):
    return text.lower().split()

# Build the vocabulary
def build_vocab(sentences):
    vocab = {}
    for sentence in sentences:
        tokens = tokenize(sentence)
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab) + 1  # starting index from 1, 0 can be used for padding
    return vocab

# Create the vocabulary index
vocab = build_vocab(sentences)

print("Vocabulary Index:", vocab)


Vocabulary Index: {'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6}


In [4]:
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today'
]

# Create the vocabulary index
vocab = build_vocab(sentences)

def text_to_sequence(text, vocab):
    return [vocab.get(token, 0) for token in tokenize(text)]  # 0 for unknown words

def pad_sequences(sequences, maxlen):
    return [seq + [0] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences]

# Example use

for sentence in sentences:
  seq = text_to_sequence(sentence, vocab)
  #print(seq)
  padded_seq = pad_sequences([seq], maxlen=6)  # Example maxlen
  print(padded_seq)

# Unseen darta
test_data = [
    'Today is a snowy day',
    'Will it be rainy tomorrow?'
]

for test_sentence in test_data:
  test_seq = text_to_sequence(test_sentence, vocab)
  print(test_seq)

[[1, 2, 3, 4, 5, 0]]
[[1, 2, 3, 6, 5, 0]]
[[2, 7, 4, 8, 0, 0]]
[[9, 10, 11, 12, 13, 14]]
[1, 2, 3, 0, 5]
[0, 7, 0, 6, 0]


## BERT

In [None]:
from transformers import BertTokenizerFast

sentences = [
    'Today is a sunny day',
    'Today is a rainy day'
]

# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize the sentences and encode them
encoded_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# To see the tokens for each input (helpful for understanding the output)
tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in encoded_inputs["input_ids"]]

# To get the word index similar to Keras' tokenizer
word_index = tokenizer.get_vocab()

print("Tokens:", tokens)
print("Token IDs:", encoded_inputs['input_ids'])
print("Word Index:", dict(list(word_index.items())[:10]))  # show only the first 10 for brevity


Tokens: [['[CLS]', 'today', 'is', 'a', 'sunny', 'day', '[SEP]'], ['[CLS]', 'today', 'is', 'a', 'rainy', 'day', '[SEP]']]
Token IDs: tensor([[  101,  2651,  2003,  1037, 11559,  2154,   102],
        [  101,  2651,  2003,  1037, 16373,  2154,   102]])
Word Index: {'protestant': 8330, 'initial': 3988, '##pt': 13876, 'charters': 23010, '243': 22884, 'ref': 25416, '##dies': 18389, '##uchi': 15217, 'sainte': 16947, 'annette': 22521}


# Real Data Sources

## IMDb

In [5]:
import os
import urllib.request
import tarfile

In [7]:
def download_and_extract(url, destination):
    if not os.path.exists(destination):
        os.makedirs(destination, exist_ok=True)
    file_path = os.path.join(destination, "aclImdb_v1.tar.gz")

    if not os.path.exists(file_path):
        print("Downloading the dataset...")
        urllib.request.urlretrieve(url, file_path)
        print("Download complete.")

    if "aclImdb" not in os.listdir(destination):
        print("Extracting the dataset...")
        with tarfile.open(file_path, 'r:gz') as tar:
            tar.extractall(path=destination)
        print("Extraction complete.")

In [8]:

# URL for the dataset
dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
download_and_extract(dataset_url, "/tmp")


Downloading the dataset...
Download complete.
Extracting the dataset...


  tar.extractall(path=destination)


Extraction complete.


In [None]:
from collections import Counter
import os

### Simple tokenizer

In [None]:

# Simple tokenizer
def tokenize(text):
    return text.lower().split()

# Build vocabulary
def build_vocab(path):
    counter = Counter()
    for folder in ["pos", "neg"]:
        folder_path = os.path.join(path, folder)
        for filename in os.listdir(folder_path):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                counter.update(tokenize(file.read()))
    return {word: i+1 for i, word in enumerate(counter)}  # Starting index from 1

vocab = build_vocab("/tmp/aclImdb/train/")

In [12]:
print(vocab)



In [14]:
def text_to_sequence(text, vocab):
    return [vocab.get(token, 0) for token in tokenize(text)]  # 0 for unknown words

def pad_sequences(sequences, maxlen):
    return [seq + [0] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences]

In [None]:
# Example use
text = "This is an example."
seq = text_to_sequence(text, vocab)
padded_seq = pad_sequences([seq], maxlen=256)  # Example maxlens
print(seq)
# print(padded_seq)

[43, 35, 16, 7263]
[[43, 35, 16, 7263, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


### Frequency tokenizer

In [None]:
# Build vocabulary
def build_vocab(path):
    counter = Counter()
    for folder in ["pos", "neg"]:
        folder_path = os.path.join(path, folder)
        for filename in os.listdir(folder_path):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                counter.update(tokenize(file.read()))

    # Sort words by frequency in descending order
    sorted_words = sorted(counter.items(), key=lambda x: x[1], reverse=True)

    # Create vocabulary with indices starting from 1
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(sorted_words)}
    vocab['<pad>'] = 0  # Add padding token with index 0
    return vocab

vocab = build_vocab("/tmp/aclImdb/train/")

In [19]:
# print(vocab)

### Remove stopworkds and HTML

In [21]:
from bs4 import BeautifulSoup

# List of stopwords
stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
             "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how", "hows", "i",
             "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself", "lets", "me", "more", "most",
             "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
             "out", "over", "own", "same", "she", "shed", "shell", "shes", "should", "so", "some", "such", "than",
             "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "there", "theres", "these", "they",
             "theyd", "theyll", "theyre", "theyve", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "wed", "well", "were", "weve", "what", "whats", "when", "whens", "where", "wheres",
             "which", "while", "who", "whos", "whom", "why", "whys", "with", "would", "you", "youd", "youll", "youre",
             "youve", "your", "yours", "yourself", "yourselves"]

# Simple tokenizer
def tokenize(text):
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text()  # Extract text from HTML
    return [word.lower() for word in cleaned_text.split() if word.lower() not in stopwords]

vocab = build_vocab("/tmp/aclImdb/train/")

In [22]:
# print(vocab)

In [23]:
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?'
]

for sentence in sentences:
  seq = text_to_sequence(sentence, vocab)
  padded_seq = pad_sequences([seq], maxlen=8)  # Example maxlen
  print(padded_seq)


[[1094, 6119, 246, 0, 0, 0, 0, 0]]
[[1094, 6797, 246, 0, 0, 0, 0, 0]]
[[6119, 24689, 0, 0, 0, 0, 0, 0]]


In [24]:
reverse_word_index = dict(
    [(value, key) for (key, value) in vocab.items()])

decoded_review = ' '.join([reverse_word_index.get(i, '?') for i in seq])

print(decoded_review)

sunny today?


# CSV and Sarcasm Data


In [25]:
!wget --no-check-certificate --no-cache \
    https://storage.googleapis.com/learning-datasets/binary-emotion.csv \
    -O /tmp/binary-emotion.csv

--2025-12-04 13:29:09--  https://storage.googleapis.com/learning-datasets/binary-emotion.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.201.27, 142.251.140.251, 172.217.17.27, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.201.27|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2690504 (2.6M) [text/csv]
Saving to: ‘/tmp/binary-emotion.csv’


2025-12-04 13:29:12 (3.56 MB/s) - ‘/tmp/binary-emotion.csv’ saved [2690504/2690504]



In [26]:
from bs4 import BeautifulSoup
import string

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
             "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how",
             "hows", "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself",
             "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should",
             "so", "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
             "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
             "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "were",
             "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why",
             "whys", "with", "would", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
             "yourselves"]

table = str.maketrans('', '', string.punctuation)

In [27]:
import csv
sentences=[]
labels=[]
with open('/tmp/binary-emotion.csv', encoding='UTF-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=",")
    for row in reader:
        labels.append(int(row[0]))
        sentence = row[1].lower()
        sentence = sentence.replace(",", " , ")
        sentence = sentence.replace(".", " . ")
        sentence = sentence.replace("-", " - ")
        sentence = sentence.replace("/", " / ")
        soup = BeautifulSoup(sentence)
        sentence = soup.get_text()
        words = sentence.split()
        filtered_sentence = ""
        for word in words:
            word = word.translate(table)
            if word not in stopwords:
                filtered_sentence = filtered_sentence + word + " "
        sentences.append(filtered_sentence)


In [28]:
print(len(sentences))

35327


In [29]:
# split data in train and test samples
training_size = 28000

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [30]:
from collections import Counter

# Assuming the tokenize function is defined elsewhere
def tokenize(text):
    # Tokenization logic, removing HTML and stopwords as discussed earlier
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text()
    tokens = cleaned_text.lower().split()
    filtered_tokens = [token for token in tokens if token not in stopwords]
    return filtered_tokens

def build_vocab(sentences):
    counter = Counter()
    for text in sentences:
        counter.update(tokenize(text))

    # Sort words by frequency in descending order
    sorted_words = sorted(counter.items(), key=lambda x: x[1], reverse=True)

    # Create vocabulary with indices starting from 1
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(sorted_words)}
    vocab['<pad>'] = 0  # Add padding token with index 0
    return vocab


vocab = build_vocab(training_sentences)
print(vocab)



In [31]:
def text_to_sequence(text, vocab):
    return [vocab.get(token, 0) for token in tokenize(text)]  # 0 for unknown words

def pad_sequences(sequences, maxlen):
    return [seq + [0] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences]


print(testing_sentences[1])
seq = text_to_sequence(testing_sentences[1], vocab)
print(seq)

made many new friends twitter around usa another bike across usa trip amazing see people 
[146, 259, 30, 110, 53, 198, 2161, 111, 752, 970, 2161, 407, 217, 26, 73]
