In [None]:
# Download model_utils.py from the course repository into the current working
# directory so that the next cell can import helpers from it.
!wget https://raw.githubusercontent.com/cbtn-data-science-ml/tensorflow-professional-developer/main/model_utils.py

In [None]:
from model_utils import plot_loss_and_accuracy, early_stopping_callback, model_checkpoint_callback

In [None]:
# Clone the course repository. Later cells change into the cloned directory and
# read the disaster-tweets CSV files through paths relative to it.
!git clone https://github.com/cbtn-data-science-ml/tensorflow-professional-developer.git

| Feature                | `!cd` (Shell Command)       | `%cd` (Magic Command)                  |
|------------------------|-----------------------------|----------------------------------------|
| **Scope**             | Temporary (subshell only)   | Persistent (notebook-wide)            |
| **Effect on Notebook**| No effect on working dir    | Changes notebook's working dir        |
| **Use Case**          | One-off shell commands      | Lasting directory changes             |

In [None]:
# Print the current working directory. Either `!pwd` (shell) or `%pwd` (magic)
# works here: the subshell started by `!` inherits the notebook's directory.
!pwd

In [None]:
# Change directory with the %cd magic so the move persists for later cells
# (a `!cd` would only affect its own short-lived subshell).
# NOTE(review): the absolute path assumes the repo was cloned under /content
# (e.g. on Colab) — adjust it when running elsewhere.
%cd '/content/tensorflow-professional-developer'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout


In [None]:
# Training split: path (relative to the repository root) and its DataFrame.
train_path = 'nlp_disaster_tweets/train.csv'
train_df = pd.read_csv(train_path)

# Test split: path (relative to the repository root) and its DataFrame.
test_path = 'nlp_disaster_tweets/test.csv'
test_df = pd.read_csv(test_path)

## EDA

In [None]:
# Preview the first five rows to see which columns the training data contains.
train_df.head()

In [None]:
# Show the last five tweets while the rows are still in their file order
# (the frame is shuffled in a later cell).
train_df['text'].tail()

In [None]:
# Shuffle the rows of both frames: frac=1 draws every row exactly once in
# random order, and the fixed random_state makes the permutation reproducible.
# The original index labels are kept, so the index is no longer sorted.
# Note: re-running this cell reshuffles the already-shuffled frames.
train_df = train_df.sample(frac=1, random_state=42)
test_df = test_df.sample(frac=1, random_state=42)

In [None]:
# Same view as before the shuffle; the last five tweets should now differ.
train_df['text'].tail()

In [None]:
# Class-balance check: count the tweets per target label.
# Is the dataset balanced? (Judged close enough to 50/50 here.)
# If imbalanced see: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
train_df['target'].value_counts()

In [None]:
# Number of rows in each dataset, displayed as a (train, test) tuple.
len(train_df), len(test_df)

In [None]:
# Sample 10 random tweets (reproducibly, via random_state) and print each
# tweet's text next to its target label.
random_samples = train_df.sample(n=10, random_state=42)
print(random_samples[['text', 'target']])

In [None]:
# Summarise the training dataset: column dtypes and non-null counts first,
# then descriptive statistics.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly — wrapping it in print() would append a stray "None" line.
train_df.info()
print(train_df.describe())

In [None]:
# Summarise the test dataset: column dtypes and non-null counts first,
# then descriptive statistics.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly — wrapping it in print() would append a stray "None" line.
test_df.info()
print(test_df.describe())

In [None]:
# Per-tweet word count: cast to str, split on whitespace, count the tokens.
train_df['word_count'] = train_df['text'].apply(lambda tweet: len(str(tweet).split()))

# Histogram (30 bins) of the word counts with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['word_count'], bins=30, kde=True, ax=ax)
ax.set_title('Word Count Distribution in Tweets')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Bar chart of how many of the sampled tweets fall into each target class.
class_ax = sns.countplot(x='target', data=random_samples)
class_ax.set_title('Class Distribution in Random Samples')
class_ax.set_xlabel('Disaster Tweets (1) vs. Non-Disaster Tweets (0)')
class_ax.set_ylabel('Count')
plt.show()


# Challenge

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

# Preprocessing
# Download NLTK's stop-word corpus (very common words such as
# 'the', 'is', 'in', 'and'); clean_text() below filters these words out.
nltk.download('stopwords')

def clean_text(text, stop_words=None):
  """Normalise a raw tweet into lowercase, letters-only, stop-word-free text.

  Steps, in order: lowercase; strip URLs (tokens starting with ``http`` or
  ``www``); strip ``<...>`` tags; drop every character that is not ``a-z`` or
  whitespace; remove stop words and collapse whitespace to single spaces.

  Args:
    text: The raw tweet. Non-string values (e.g. ``None`` or the ``NaN`` that
      pandas uses for missing text) are treated as empty instead of raising.
    stop_words: Optional collection of lowercase words to remove. Defaults to
      NLTK's English stop-word list, which is loaded once and then reused.

  Returns:
    The cleaned text as a single space-separated string (possibly empty).
  """
  if not isinstance(text, str):
    return ''

  if stop_words is None:
    # Building the set reads the corpus; cache it on the function so that
    # Series.apply() does not rebuild it for every row.
    stop_words = getattr(clean_text, '_english_stop_words', None)
    if stop_words is None:
      stop_words = frozenset(stopwords.words('english'))
      clean_text._english_stop_words = stop_words

  text = text.lower()
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  text = re.sub(r'<.*?>', '', text)
  # Punctuation, digits and non-ASCII characters are removed here, before the
  # stop-word lookup.
  text = re.sub(r'[^a-z\s]', '', text)
  return " ".join(word for word in text.split() if word not in stop_words)

# Add a cleaned copy of the tweet text to both the training and the test frame.
for frame in (train_df, test_df):
  frame['text_clean'] = frame['text'].apply(clean_text)


