# **Week 1** 

In [12]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

tokenizer = Tokenizer(num_words = 20, oov_token = '<00V>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

padded = pad_sequences(sequences, maxlen=5, padding = 'post')
print("\nWord Index = ", word_index)
print("\nsequences = ", word_index)
print("\nPadded Sequences:")
print(padded)

# Try with words that the tokenizer wasn't fit to
test_data = [
    'i really love my dog',
    'my dog loves my manatee'
]

test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

padded = pad_sequences(test_seq, maxlen = 10)
print("\nPadded Test Sequence: ")
print(padded)


Word Index =  {'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

sequences =  {'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Padded Sequences:
[[ 5  3  2  4  0]
 [ 5  3  2  7  0]
 [ 6  3  2  4  0]
 [ 9  2  4 10 11]]

Test Sequence =  [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

Padded Test Sequence: 
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]


# Is it sarcastic???
#### **Step 1** - download the json file  containing the dataset


In [13]:
import wget
url = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json"
wget.download(url)


'sarcasm.json'

### **Step 2** - use json to read the data and store it in python format

In [17]:
import json

with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)
    
sentences = []
labels = []
urls = []

for line in datastore:
    sentences.append(line['headline'])
    labels.append(line['is_sarcastic'])
    urls.append(line['article_link'])
   


#### **Step 3** - use tokenizer  

In [22]:
tokenizer = Tokenizer(oov_token = '<00V>')
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(len(word_index))

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding = 'post')

print(padded[0])
print(padded.shape)

29657
[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


# BBC news archive

#### **Step 1** download the data

In [23]:
import wget
url = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv"
wget.download(url)

'bbc-text (1).csv'

#### **Step 2** - define the stopwords that will not be used

In [24]:
stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]

#### **Step 3** - read the data and remove stopwords

In [33]:
import csv

sentences = []
labels = []

with open("bbc-text.csv", 'r') as csvfile:
    bbc_file = csv.reader(csvfile, delimiter = ',')
    next(bbc_file)
    sentence = []
    for line in bbc_file:
        labels.append(line[0])
        sentence = line[1]
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, " ")
            sentence = sentence.replace("  ", " ")
        sentences.append(sentence)        

print(len(sentences))       

2225


#### **Step 4** - Use Tokenizer to transform words in numbers

In [38]:
tokenizer = Tokenizer(oov_token = "<00V>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(len(word_index))

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

print(padded[0])
print(padded.shape)

29714


#### **Step 5** - do the same thing also for the labels

In [40]:
label_token = Tokenizer(oov_token='<00V>')
label_token.fit_on_texts(labels)

label_word_index = label_token.word_index
label_seq = tokenizer.texts_to_sequences(labels)

print(label_seq[0])
print(label_word_index)

[1715]
{'<00V>': 1, 'sport': 2, 'business': 3, 'politics': 4, 'tech': 5, 'entertainment': 6}
