# Sequence Models: Kaggle Competitions
This notebook uses `TextVectorization` instead of `Tokenizer` for text preprocessing.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.layers import TextVectorization, Embedding, SimpleRNN, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Clone the course repository into the current working directory; a later cell
# expects to find the nlp_disaster tweets data inside it.
!git clone https://github.com/cbtn-data-science-ml/tensorflow-professional-developer.git


In [None]:
# print working directory

In [None]:
# change into project directory

In [None]:
# list files and ensure you see nlp_disaster tweets

In [None]:
# convert train.csv and test.csv into DataFrame objects


In [None]:
# Add a 'word_count' column: the number of whitespace-separated tokens in each tweet.
# str() guards against non-string entries before splitting.
train_df['word_count'] = train_df['text'].map(lambda tweet: len(str(tweet).split()))

In [None]:
# Visualize 'Tweet Word Count Distribution' as a 30-bin histogram with a KDE overlay,
# drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['word_count'], bins=30, kde=True, ax=ax)
ax.set_title('Tweet Word Count Distribution')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# use .info() on train dataset


In [None]:
# Count the raw tweets that contain a URL-like token (http..., www..., https...).
url_pattern = r'http\S+|www\S+|https\S+'
has_url = train_df['text'].str.contains(url_pattern)
urls_before_cleaning = has_url.sum()
print(f"URLs found before cleaning: {urls_before_cleaning}")

In [None]:
# !pip install nltk # natural language toolkit (only if needed, mostly when running in a local Jupyter notebook)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages used by the cleaning step below:
# the stop-word lists, WordNet (for lemmatization) and the punkt_tab tokenizer data.
for resource in ('stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# build function to clean our text data

# Cache for the NLTK resources used by clean_text. They are identical for every
# tweet, so they are built on first use and reused afterwards instead of being
# re-created on each call.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
  """Return the (stop-word set, lemmatizer) pair, building it on first use."""
  if not _CLEANING_RESOURCES:
    _CLEANING_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
    _CLEANING_RESOURCES["lemmatizer"] = WordNetLemmatizer()
  return _CLEANING_RESOURCES["stop_words"], _CLEANING_RESOURCES["lemmatizer"]


def clean_text(text):
  """Normalize one tweet into a space-joined string of lemmatized tokens.

  Steps, in order: lowercase; remove URL-like tokens (http..., www...,
  https...); remove ``<...>`` tags; remove every character that is not a
  lowercase letter or whitespace; tokenize with ``word_tokenize``; drop
  English stop words; lemmatize each remaining token.

  Args:
    text: The raw tweet. Missing values (``None`` or a float NaN) are treated
      as empty text; any other non-string value is converted with ``str()``.

  Returns:
    str: The cleaned tokens joined by single spaces ("" for missing input).
  """
  if not isinstance(text, str):
    # A missing value would otherwise crash on .lower(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
      return ""
    text = str(text)

  text = text.lower()
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  text = re.sub(r'<.*?>', '', text)
  text = re.sub(r'[^a-z\s]', '', text)

  stop_words, lemmatizer = _get_cleaning_resources()
  tokens = word_tokenize(text)
  lemmatized_tokens = [
    lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
  ]
  return " ".join(lemmatized_tokens)


In [None]:
# Apply the cleaning function to train_df['text'] and test_df['text']



In [None]:
# Count the tweets whose cleaned text still contains a URL-like token
# (http..., www..., https...). The result is stored under its own name so the
# "before cleaning" count computed earlier is not overwritten.
urls_after_cleaning = train_df['clean_text'].str.contains(r'http\S+|www\S+|https\S+').sum()
print(f"URLs found after cleaning: {urls_after_cleaning}")

In [None]:
# create Hyperparameters




In [None]:
# create updated TextVectorization Layer (Tokenizer() is deprecated)


In [None]:
# run cell to preprocess the data: Vectorize the text and convert X to NumPy array (needed for TextVectorization: it expects this format)


# Split the data into training and validation sets


In [None]:
# run cell to download model_utils.py into the working directory and import
# the three helper functions from it
# NOTE(review): on a re-run, wget by default saves a numbered copy rather than
# overwriting an existing model_utils.py — confirm this is acceptable.
!wget https://raw.githubusercontent.com/cbtn-data-science-ml/introduction-to-deep-learning/main/model_utils.py
from model_utils import early_stopping_callback, model_checkpoint_callback, plot_loss_and_accuracy


In [None]:
# Build the model


# Compile model


# Train the model



In [None]:
# plot_loss_and_accuracy

# Competition Notebook

In [None]:
# Prepare Kaggle Contest Output

# Run the cleaned test tweets through the vectorizer and convert the result to a NumPy array
test_texts = test_df['clean_text'].astype(str)
X_test = vectorizer(test_texts).numpy()

# Score the vectorized test set with the trained model
predictions = model.predict(X_test)

# Threshold at 0.5 to get 0/1 labels, flattened to one dimension
predictions_binary = (predictions > 0.5).astype(int).ravel()

# Write the submission file for Kaggle: one row per test id with its predicted target
submission = pd.DataFrame({'id': test_df['id'], 'target': predictions_binary})
submission.to_csv('submission.csv', index=False)


# Bonus: Embedding Projector

In [None]:
# Link: https://www.tensorflow.org/text/guide/word_embeddings

# Create a reverse index for words from the TextVectorization vocabulary
def get_reverse_index(vectorizer):
    """Map each integer position in the vocabulary to its word.

    Args:
        vectorizer: Object exposing ``get_vocabulary()``, which returns the
            vocabulary as an ordered sequence of words.

    Returns:
        dict: ``{position: word}`` for every vocabulary entry, with positions
        starting at 0.
    """
    return dict(enumerate(vectorizer.get_vocabulary()))

reverse_index = get_reverse_index(vectorizer)

import numpy as np

# Locate the Embedding layer by type rather than assuming it sits at position 0;
# fall back to the first layer (the original assumption) if none is found.
embedding_layer = next(
    (layer for layer in model.layers if isinstance(layer, Embedding)),
    model.layers[0],
)
embeddings = embedding_layer.get_weights()[0]

# Save the embeddings for TensorFlow Embedding Projector (one tab-separated vector per row)
np.savetxt("vectors.tsv", embeddings, delimiter="\t")

# Prepare the metadata file.
# The Embedding Projector needs exactly one label per vector, so the loop runs
# over the embedding rows, not the vocabulary. If the embedding matrix has more
# rows than the vocabulary has words, the extra rows get placeholder labels; if
# it has fewer, the surplus words are left out.
num_vectors = embeddings.shape[0]
with open("metadata.tsv", "w", encoding='utf-8') as f:
    for idx in range(num_vectors):
        if idx == 0:
            # Explicitly label the padding token at index 0
            label = "<PAD>"
        else:
            label = reverse_index.get(idx, f"<UNUSED_{idx}>")
        f.write(label + "\n")

# Download the files for TensorFlow Embedding Projector.
# google.colab only exists inside Colab; elsewhere the files are already on
# disk, so report that instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; vectors.tsv and metadata.tsv were saved to the working directory.")
else:
    files.download("vectors.tsv")
    files.download("metadata.tsv")


# Challenge
Build a Functional API version to improve model performance. You can use it to check your score on the Kaggle competition to get an idea of how the model performs, but as per the rules, do not submit to the disaster tweets competition. Instead, see these, to list a few:


*   https://www.kaggle.com/code/tanulsingh077/twitter-sentiment-extaction-analysis-eda-and-model
*   https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge



In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, Dense

# Define the input layer


# Add embedding layer


# Add bidirectional LSTM layers


# Add dropout for regularization

# Add dense output layer


# Create the model


# Compile the model


# Train the model



In [None]:
# Plot the training curves with the helper imported from model_utils.
# NOTE(review): called with no arguments — presumably the helper needs the
# History object returned by model.fit(); confirm against model_utils.py.
plot_loss_and_accuracy()