In [1]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import matplotlib.pyplot as plt

# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version: working on tensorflow-macos: 2.9.0, tensorflow-metal: 0.5.0 (https://developer.apple.com/metal/tensorflow-plugin/)
print(f"TensorFlow version: {tf.__version__}") 

from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_datasets as tfds 

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.9.0


In [2]:
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?'
]

tokenizer = Tokenizer(num_words = 100) 
tokenizer.fit_on_texts(sentences) 
word_index = tokenizer.word_index 
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6, 'it': 7}
[[1, 2, 3, 4, 5], [1, 2, 3, 6, 5], [2, 7, 4, 1]]


In [3]:
# seeing how a pretrained network (above) views new sentences with words that weren't tokenized
# below the words 'snowy', 'will', 'be', 'tomorrow' arent in the training set above in 'sentences'
test_data = [
        'Today is a snowy day',
        'Will it be rainy tomorrow?'
]

test_sequences = tokenizer.texts_to_sequences(test_data) 
print(word_index)
print(test_sequences)

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6, 'it': 7}
[[1, 2, 3, 5], [7, 6]]


It views the above two sentences as 'today is a day' and 'it rainy'. This is why we need out-of-vocabulary tokens.

In [4]:
tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>") # added oov_token
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

test_sequences = tokenizer.texts_to_sequences(test_data) 
print(word_index)
print(test_sequences)

{'<OOV>': 1, 'today': 2, 'is': 3, 'a': 4, 'sunny': 5, 'day': 6, 'rainy': 7, 'it': 8}
[[2, 3, 4, 1, 6], [1, 8, 1, 7, 1]]


Now this reads 'today is a <_OOV_> day' and '<_OOV_> it <_OOV_> rainy <_OOV_>'

In [5]:
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today'
]

tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>") 
tokenizer.fit_on_texts(sentences) 
word_index = tokenizer.word_index 
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

{'<OOV>': 1, 'today': 2, 'is': 3, 'a': 4, 'sunny': 5, 'day': 6, 'rainy': 7, 'it': 8, 'i': 9, 'really': 10, 'enjoyed': 11, 'walking': 12, 'in': 13, 'the': 14, 'snow': 15}
[[2, 3, 4, 5, 6], [2, 3, 4, 7, 6], [3, 8, 5, 2], [9, 10, 11, 12, 13, 14, 15, 2]]


Similar to images, sentences also need to be of the same length. 

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# prepadding (default option)
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  0  2  3  4  5  6]
 [ 0  0  0  2  3  4  7  6]
 [ 0  0  0  0  3  8  5  2]
 [ 9 10 11 12 13 14 15  2]]


In [7]:
# post paddding
padded = pad_sequences(sequences, padding='post')
print(padded)

[[ 2  3  4  5  6  0  0  0]
 [ 2  3  4  7  6  0  0  0]
 [ 3  8  5  2  0  0  0  0]
 [ 9 10 11 12 13 14 15  2]]


In [8]:
# setting a max length
padded = pad_sequences(sequences, padding='post', maxlen=6) 
print(padded)

[[ 2  3  4  5  6  0]
 [ 2  3  4  7  6  0]
 [ 3  8  5  2  0  0]
 [11 12 13 14 15  2]]


In [9]:
# truncating settings
padded = pad_sequences(sequences, padding='post', maxlen=6, truncating='post')
print(padded)

[[ 2  3  4  5  6  0]
 [ 2  3  4  7  6  0]
 [ 3  8  5  2  0  0]
 [ 9 10 11 12 13 14]]


In [10]:
from bs4 import BeautifulSoup 

sentence = 'Today is a sunny day'

soup = BeautifulSoup(sentence) 
sentence = soup.get_text()

In [11]:
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [12]:
words = sentence.split() 
# sentences = []
filtered_sentence = "" 
for word in words:
    if word not in stopwords:
        filtered_sentence = filtered_sentence + word + " "
sentences.append(filtered_sentence)

In [13]:
print(sentences)

['Today is a sunny day', 'Today is a rainy day', 'Is it sunny today?', 'I really enjoyed walking in the snow today', 'Today sunny day ']


In [14]:
imdb_sentences = []
train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train")) 
for item in train_data:
    imdb_sentences.append(str(item['text']))

Metal device set to: Apple M1


2023-02-06 10:39:56.702412: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-06 10:39:56.702960: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-02-06 10:39:56.756439: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [15]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)

In [16]:
# print(tokenizer.word_index)

In [17]:
from bs4 import BeautifulSoup 
import string

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

table = str.maketrans('', '', string.punctuation)

imdb_sentences = []

train_data = tfds.as_numpy(tfds.load('imdb_reviews', split="train")) 
for item in train_data:
    sentence = str(item['text'].decode('UTF-8').lower())
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    words = sentence.split()
    filtered_sentence = "" 
    for word in words:
        word = word.translate(table) 
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    imdb_sentences.append(filtered_sentence)
    
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000) 
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)
# print(tokenizer.word_index)



In [18]:
# !jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10 # - had to run this to run below command

In [19]:
print(imdb_sentences)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
(train_data, test_data), info = tfds.load(
        'imdb_reviews/subwords8k',
        split = (tfds.Split.TRAIN, tfds.Split.TEST),
        as_supervised=True,
        with_info=True
)



In [21]:
encoder = info.features['text'].encoder
print ('Vocabulary size: {}'.format(encoder.vocab_size))

Vocabulary size: 8185


In [22]:
print(encoder.subwords)



In [23]:
sample_string = 'Today is a sunny day'
encoded_string = encoder.encode(sample_string)
print ('Encoded string is {}'.format(encoded_string))

Encoded string is [6427, 4869, 9, 4, 2365, 1361, 606]


In [24]:
print(encoder.subwords[605])

day


In [25]:
encoded_string = encoder.encode(sample_string)
original_string = encoder.decode(encoded_string)
test_string = encoder.decode([6427, 4869, 9, 4, 2365, 1361, 606])
print(test_string)

Today is a sunny day


### Loading Text from CSV Files

In [52]:
import csv
# sentences=[]
# labels=[]
# with open('Twitter_Data.csv', encoding='UTF-8') as csvfile:
#     reader = csv.reader(csvfile, delimiter=",") 
#     for row in reader:
#         labels.append(int(row[0]))
#         sentence = row[1].lower()
#         sentence = sentence.replace(",", " , ")
#         sentence = sentence.replace(".", " . ")
#         sentence = sentence.replace("-", " - ")
#         sentence = sentence.replace("/", " / ")
#         soup = BeautifulSoup(sentence)
#         sentence = soup.get_text()
#         words = sentence.split()
#         filtered_sentence = ""
#         for word in words:
#             word = word.translate(table) 
#             if word not in stopwords:
#                     filtered_sentence = filtered_sentence + word + " "
#                     sentences.append(filtered_sentence)

In [68]:
data = []
for line in open('Sarcasm_Dataset_v2.json', 'r'):
    data.append(json.loads(line))

In [69]:
# data = list(parse_data('Sarcasm_Dataset_v2.json'))

In [70]:
data

[{'is_sarcastic': 1,
  'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
  'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205'},
 {'is_sarcastic': 0,
  'headline': 'dem rep. totally nails why congress is falling short on gender, racial equality',
  'article_link': 'https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207'},
 {'is_sarcastic': 0,
  'headline': 'eat your veggies: 9 deliciously different recipes',
  'article_link': 'https://www.huffingtonpost.com/entry/eat-your-veggies-9-delici_b_8899742.html'},
 {'is_sarcastic': 1,
  'headline': 'inclement weather prevents liar from getting to work',
  'article_link': 'https://local.theonion.com/inclement-weather-prevents-liar-from-getting-to-work-1819576031'},
 {'is_sarcastic': 1,
  'headline': "mother comes pretty close to using word 'streaming' correctly",
  'article_link': 'https://www.theonion.com/mother-comes-pretty-

In [75]:
sentences = []
labels = []
links = []

for item in data:
    sentence = item['headline'].lower()
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    sentences.append(filtered_sentence)
    labels.append(item['is_sarcastic'])
    links.append(item['article_link'])

In [76]:
training_size = 25000
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [77]:
vocab_size = 20000
max_length = 10
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"


tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(training_sequences, padding='post')
print(word_index)

