# Extra Credit

In this notebook I want to explore more NLP topics from a TensorFlow perspective. These include:

1. Skipgram processing
1. Word2Vec using embeddings (see https://www.tensorflow.org/tutorials/text/word2vec)
1. Building a spam classifier that uses Word2Vec-transformed data and skip-grams ("more meaningful")

In [2]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, Flatten, Dense, SimpleRNN, LSTM

# Show the installed TensorFlow version so the run can be reproduced later.
print(tf.__version__)

2.12.0-dev20221212


In [3]:
# Tab-separated input files; they are read with header=None, so the column
# names below are supplied explicitly.
train_file_path = "data/train-data.tsv"
test_file_path = "data/valid-data.tsv"
header = ["labels", "text"]

# Both files are parsed with identical options, so read them in one pass.
train_df, test_df = (
    pd.read_csv(path, sep="\t", header=None, names=header)
    for path in (train_file_path, test_file_path)
)

# Raw message text for each split.
train_text = train_df["text"]
test_text = test_df["text"]

# Binary targets: 0 where the label is "ham", 1 for every other label.
train_labels = np.where(train_df["labels"] == "ham", 0, 1)
test_labels = np.where(test_df["labels"] == "ham", 0, 1)


In [4]:
def text_encoded(tokenizer):
    """Build an encoder function bound to ``tokenizer``.

    The returned callable converts text to integer sequences with
    ``tokenizer.texts_to_sequences`` and then passes the result through
    ``pad_sequences``.
    """
    def encode(text, maxlen, truncating = "post", padding = "post"):
        """Return ``text`` as sequences padded/truncated to ``maxlen``.

        ``truncating`` and ``padding`` are forwarded unchanged to
        ``pad_sequences``; both default to ``"post"``.
        """
        return pad_sequences(
            tokenizer.texts_to_sequences(text),
            maxlen=maxlen,
            padding=padding,
            truncating=truncating,
        )

    return encode



# Run Once

Download the data files into a local storage folder.

Requirement: create a `data` folder in the notebook's working directory before running this cell.

In [2]:
import requests
import os

# Remote TSV files to download (train and valid files, per their names).
urls = ["https://cdn.freecodecamp.org/project-data/sms/train-data.tsv", "https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv"]

# Create the target folder if it is missing so the cell also works on a
# fresh checkout; exist_ok keeps re-runs from failing.
os.makedirs("data", exist_ok=True)

for url in urls:
    # Each file is saved under data/ using the last path component of its URL.
    filename = os.path.join("data", os.path.basename(url))
    # Download before opening the output file, and fail loudly on HTTP
    # errors: otherwise an error page would be saved as if it were the
    # dataset, or a failed request would leave an empty file behind.
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(filename, 'wb') as fh:
        fh.write(response.content)