In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM, GRU
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

## 1. Data Normalization

In [None]:
# Load the dataset; the cell reads the 'text' column (inputs) and the
# 'source' column (class labels).
df = pd.read_csv('data/data.csv')

# Data normalization
# Missing or non-string entries are coerced to (possibly empty) strings first:
# the Keras Tokenizer calls .lower() on every entry and raises on NaN floats.
df['text'] = df['text'].fillna('').astype(str).str.lower()  # Convert text to lowercase
tokenizer = Tokenizer(num_words=10000)  # Keep only the 10,000 most frequent words
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
max_len = 500  # Every sequence is padded/truncated to this many tokens
data = pad_sequences(sequences, maxlen=max_len, padding='post')

# Encode labels as a one-hot matrix with one column per distinct 'source'.
# The dtype is pinned to float32 because the pandas default changed to bool in
# pandas 2.0, and the loss computation expects numeric targets.
labels = pd.get_dummies(df['source'], dtype='float32').values

# Split the dataset: 90% training, 10% validation, fixed seed for reproducibility
X_train, X_validate, y_train, y_validate = train_test_split(
    data, labels, test_size=0.1, random_state=42
)

## 3. RNN Architectures

In [None]:
# Factory used to build the RNN architectures compared in this notebook
def create_model(rnn_layer, units, activation, output_units, output_activation,
                 vocab_size=10000, embedding_dim=100, input_length=None,
                 loss='binary_crossentropy', optimizer='adam'):
    """Build and compile an Embedding -> recurrent layer -> Dense classifier.

    Args:
        rnn_layer: Recurrent layer class to instantiate (e.g. SimpleRNN, LSTM, GRU).
        units: Number of units in the recurrent layer.
        activation: Activation passed to the recurrent layer.
        output_units: Number of units in the final Dense layer.
        output_activation: Activation of the final Dense layer.
        vocab_size: `input_dim` of the Embedding layer. Should match the
            `num_words` limit of the tokenizer that produced the inputs.
        embedding_dim: `output_dim` of the Embedding layer.
        input_length: Length of the input sequences. When None, the
            module-level `max_len` is used.
        loss: Loss passed to `model.compile`. The default matches the sigmoid
            outputs used in this notebook; for single-label multi-class
            training with a softmax output, pass 'categorical_crossentropy'.
        optimizer: Optimizer passed to `model.compile`.

    Returns:
        The compiled Sequential model, tracking the 'accuracy' metric.
    """
    if input_length is None:
        # Fall back to the padding length chosen during preprocessing.
        input_length = max_len

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
        # return_sequences=False: only the last step's output feeds the Dense layer.
        rnn_layer(units, activation=activation, return_sequences=False),
        Dense(output_units, activation=output_activation)
    ])
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model

# Build the three candidate networks. Each one gets an output unit per label
# column of y_train and a sigmoid output activation.
model_1 = create_model(
    rnn_layer=SimpleRNN,
    units=50,
    activation='relu',
    output_units=y_train.shape[1],
    output_activation='sigmoid',
)
model_2 = create_model(
    rnn_layer=LSTM,
    units=100,
    activation='tanh',
    output_units=y_train.shape[1],
    output_activation='sigmoid',
)
model_3 = create_model(
    rnn_layer=GRU,
    units=150,
    activation='relu',
    output_units=y_train.shape[1],
    output_activation='sigmoid',
)

## 4. Model Training

In [None]:
# Train models
def train_and_save(model, name, epochs=10, output_dir='data/models'):
    """Fit `model` on the notebook's training split and save it to disk.

    The model is trained on the module-level `X_train` / `y_train` arrays and
    validated on `X_validate` / `y_validate` after every epoch.

    Args:
        model: Compiled model exposing `fit` and `save`.
        name: File stem of the saved model; the file is written to
            `<output_dir>/<name>.h5`.
        epochs: Number of training epochs.
        output_dir: Directory that receives the saved model. It is created
            if it does not exist.

    Returns:
        Whatever `model.fit` returns (for Keras models, the training History).
    """
    import os

    # Create the target directory up front so a missing folder is not
    # discovered only after the whole training run has finished.
    os.makedirs(output_dir, exist_ok=True)

    history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_validate, y_validate))
    model.save(f'{output_dir}/{name}.h5')
    return history

# Train the models one after another. Each history is bound as soon as its
# own run finishes, so earlier results survive a failure in a later run.
history1 = train_and_save(model=model_1, name='model_1')
history2 = train_and_save(model=model_2, name='model_2')
history3 = train_and_save(model=model_3, name='model_3')

## 5. Validation

In [None]:
# Accuracy curves for one training run
def plot_history(history, title):
    """Plot training and validation accuracy per epoch and show the figure.

    Args:
        history: Object whose `history` attribute is a mapping containing the
            'accuracy' and 'val_accuracy' series (as returned by `model.fit`
            in this notebook).
        title: Title drawn above the plot.
    """
    # (key in history.history, legend label), drawn in this order
    curves = (
        ('accuracy', 'accuracy'),
        ('val_accuracy', 'validation accuracy'),
    )
    for metric_key, legend_label in curves:
        plt.plot(history.history[metric_key], label=legend_label)

    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

# One accuracy figure per trained model, in training order
plot_history(history=history1, title='Model 1 Validation')
plot_history(history=history2, title='Model 2 Validation')
plot_history(history=history3, title='Model 3 Validation')