<a href="https://colab.research.google.com/github/antonver/TER_ML/blob/main/Bi_GRU_cloudflower.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy seaborn nltk wordcloud keras scikit-learn tensorflow



In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Word Cloud
from wordcloud import WordCloud
# from textacy import preprocessing
from nltk.stem.snowball import SnowballStemmer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
#from tensorflow.keras.models import Sequential #Fixed: Moved these imports
#from tensorflow.keras.layers import * #Fixed: Moved these imports
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import tensorflow as tf
# try: #Fixed: Removed TPU strategy setup
#     tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#     print("TPU detected:", tpu.master())
#     tf.config.experimental_connect_to_cluster(tpu)
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     strategy = tf.distribute.TPUStrategy(tpu)
# except ValueError:
#     print("TPU not found. Using CPU/GPU instead.")
from tensorflow.keras.models import Sequential #Fixed: Imported here
from tensorflow.keras.layers import * #Fixed: Imported here



In [None]:
!pip install -U datasets



In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK 'punkt' resource up front.
nltk.download('punkt')

# One shared Porter stemmer instance, reused for every document.
stemmer = PorterStemmer()

# Load the "text_emotion" configuration of the Crowdflower dataset, turn its
# train split into a DataFrame, and rename the raw columns to the generic
# 'text' / 'label' names used by the rest of the script.
print("Loading dataset...")
ds = load_dataset("tasksource/crowdflower", "text_emotion")
df = pd.DataFrame(ds['train']).rename(
    columns={'content': 'text', 'sentiment': 'label'}
)

# Text cleaning and stemming function
def clean_text_stemmer(text):
    """Return a lower-cased, de-noised and stemmed version of ``text``.

    The input is coerced with ``str()`` and lower-cased, then three regex
    substitutions run in order: URLs (``http...`` / ``www...``) are removed,
    ``@mentions`` and ``#hashtags`` are removed, and whitespace runs are
    collapsed to a single space. The remaining whitespace-separated tokens
    are each passed through the module-level ``stemmer`` and re-joined with
    single spaces.
    """
    cleaned = str(text).lower()
    # Order matters: whitespace is collapsed last so that the gaps left by
    # the removed URLs / mentions / hashtags are normalised too.
    substitutions = (
        (r"http\S+|www\S+", ""),
        (r"@\w+|#\w+", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    tokens = cleaned.strip().split()
    return " ".join(map(stemmer.stem, tokens))

# Clean and stem every document; the result is kept in its own column.
print("Preprocessing text...")
df['cleaned_text'] = df['text'].map(clean_text_stemmer)

# Features are the cleaned strings, targets are the raw label values.
X, y = df['cleaned_text'], df['label']

# Map label values to integer ids (the mapping is kept in `le.classes_`).
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Stratified 80/20 split with a fixed seed so runs are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Vocabulary is fitted on the training texts only; both splits are then
# converted to integer sequences and padded/truncated to `maxlen` tokens.
max_words, maxlen = 5000, 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train_seq, X_test_seq)
)

# Build GRU model
# Architecture: Embedding -> Bidirectional GRU -> BatchNorm -> Dropout ->
# Dense(relu) -> Dropout -> softmax over the encoded label classes.
print("\nBuilding GRU model...")
input_size = max_words  # embedding vocabulary size matches the tokenizer's num_words
model = Sequential()
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 (passing it only emits a deprecation warning). The
# sequence length is supplied through the explicit `model.build(...)` below.
model.add(Embedding(input_dim=input_size, output_dim=100))
model.add(Bidirectional(GRU(64, kernel_regularizer=l2(0.01))))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.6))
# One output unit per class known to the label encoder.
model.add(Dense(len(le.classes_), activation='softmax'))

# Build with a (batch, maxlen) input so that the weights exist and
# `model.summary()` can report output shapes and parameter counts.
model.build(input_shape=(None, maxlen))

# Labels are integer ids, hence the sparse categorical cross-entropy loss.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train model
# Early stopping with `restore_best_weights=True` selects the final weights
# based on the monitored validation loss. Validating on the test split would
# therefore tune the model on the very data used for the final evaluation and
# bias the reported metrics. Instead, hold out the last 10% of the training
# samples as a validation set (Keras takes this slice before shuffling) and
# keep the test split untouched until evaluation.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_padded, y_train, epochs=20, batch_size=64,
                    validation_split=0.1, callbacks=[early_stopping])

# Evaluate model
print("\nGRU Model Evaluation:")
# `predict` returns one probability row per sample; argmax picks the class id.
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)

# Class ids that occur in either the true or the predicted labels, together
# with their original string names, so the report rows line up with the data.
unique_classes = np.unique(np.concatenate([y_test, y_pred_classes]))
class_names = [le.classes_[cls] for cls in unique_classes]

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred_classes))
print("F1 Score (weighted):", f1_score(y_test, y_pred_classes, average='weighted'))

# Per-class precision / recall / F1. `labels` and `target_names` are passed
# together so each row is labelled with the right class name, and
# `zero_division=0` reports 0 (instead of warning) for classes that were
# never predicted.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, labels=unique_classes,
                            target_names=class_names, zero_division=0))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading dataset...
Preprocessing text...
X_train shape: (31998,)
X_test shape: (8000,)
y_train shape: (31998,)
y_test shape: (8000,)

Building GRU model...




Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.1041 - loss: 5.2529 - val_accuracy: 0.2313 - val_loss: 3.8830
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.1758 - loss: 3.6469 - val_accuracy: 0.2449 - val_loss: 3.0526
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.2124 - loss: 3.0021 - val_accuracy: 0.2496 - val_loss: 2.7099
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.2347 - loss: 2.7089 - val_accuracy: 0.2600 - val_loss: 2.5342
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.2469 - loss: 2.5604 - val_accuracy: 0.2246 - val_loss: 2.4781
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.2743 - loss: 2.4475 - val_accuracy: 0.2209 - val_loss: 2.5541
Epoch 7/20
[1m500