In [1]:
import itertools
import pickle

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional,
    Attention, Dense, Dropout, Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the dataset
def load_data(filepath):
    """Read a JSON-lines file and return its headlines and sarcasm labels.

    Args:
        filepath: Path to a file with one JSON object per line; each object
            must provide the ``headline`` and ``is_sarcastic`` fields.

    Returns:
        A ``(sentences, labels)`` tuple holding the ``.values`` of the
        ``headline`` and ``is_sarcastic`` columns, in file order.
    """
    frame = pd.read_json(filepath, lines=True)
    return frame["headline"].values, frame["is_sarcastic"].values

# Preprocess the dataset
def preprocess(sentences, labels, max_words=10000, max_len=30, tokenizer_path="tokenizer.pkl"):
    """Tokenize and pad the sentences, encode the labels, and persist the tokenizer.

    Args:
        sentences: Iterable of raw text strings.
        labels: Iterable of class labels, one per sentence.
        max_words: Passed to ``Tokenizer`` as ``num_words``.
        max_len: Length every sequence is padded to (``padding="post"``).
        tokenizer_path: File the fitted tokenizer is pickled to. Defaults to
            ``"tokenizer.pkl"`` in the current working directory; pass
            ``None`` to skip writing a file.

    Returns:
        A ``(padded, encoded_labels, tokenizer)`` tuple: the padded integer
        sequences, the ``LabelEncoder``-encoded labels, and the fitted
        tokenizer.
    """
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(sentences)

    sequences = tokenizer.texts_to_sequences(sentences)
    # Only the padding side is set here; the truncation side is left at the
    # pad_sequences default.
    padded = pad_sequences(sequences, maxlen=max_len, padding="post")

    encoded_labels = LabelEncoder().fit_transform(labels)

    # Save the tokenizer for later use (e.g. to tokenize new text the same way).
    if tokenizer_path is not None:
        with open(tokenizer_path, "wb") as file:
            pickle.dump(tokenizer, file)

    return padded, encoded_labels, tokenizer


In [2]:
# Load pre-trained Word2Vec embeddings
def load_pretrained_embeddings(word_index, embedding_dim=300, embedding_file="GoogleNews-vectors-negative300.bin",
                               limit=500000):
    """Build an embedding matrix for ``word_index`` from binary word2vec vectors.

    Args:
        word_index: Mapping of word -> integer row index (row 0 is left for
            the index no word maps to).
        embedding_dim: Width of each embedding row; must equal the width of
            the vectors stored in ``embedding_file``.
        embedding_file: Path to a word2vec file in binary format.
        limit: Maximum number of vectors read from ``embedding_file``.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Rows of words found
        in the pre-trained vectors hold those vectors; every other row keeps
        its random uniform initialisation in [-0.05, 0.05).

    Raises:
        ValueError: If the loaded vectors are not ``embedding_dim`` wide.
        EOFError: If the embedding file ends before all vectors were read.
    """
    try:
        word2vec = KeyedVectors.load_word2vec_format(embedding_file, binary=True, limit=limit)
    except EOFError as exc:
        # Same exception type, but name the file so a bad download is easy to spot.
        raise EOFError(
            f"Could not read {embedding_file!r}: the file ended early, "
            "so it is probably truncated or corrupt."
        ) from exc

    # Fail before allocating anything: otherwise a mismatch only surfaces as a
    # broadcasting error at the first matching word, or not at all if no word matches.
    if word2vec.vector_size != embedding_dim:
        raise ValueError(
            f"embedding_dim={embedding_dim} does not match the "
            f"{word2vec.vector_size}-dimensional vectors in {embedding_file!r}"
        )

    vocab_size = len(word_index) + 1
    embedding_matrix = np.random.uniform(-0.05, 0.05, (vocab_size, embedding_dim))  # Random initialization

    for word, i in word_index.items():
        if word in word2vec:
            embedding_matrix[i] = word2vec[word]
    return embedding_matrix


In [3]:
# Build the model
def build_model_with_embeddings(vocab_size, max_len, embedding_matrix, embedding_dim, filters=128, filter_width=3,
                                hidden_units=64, dropout_fraction=0.5):
    """Assemble and compile the two-branch (Conv1D + attentive BiLSTM) classifier.

    Both branches read the same trainable embedding, which is initialised from
    ``embedding_matrix``. Their pooled outputs are concatenated and passed
    through a dense layer, dropout, and a single sigmoid unit.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        max_len: Length of each input sequence.
        embedding_matrix: Initial embedding weights.
        embedding_dim: ``output_dim`` of the embedding layer.
        filters: Number of Conv1D filters.
        filter_width: Conv1D kernel size.
        hidden_units: Units of the LSTM (per direction) and of the dense layer.
        dropout_fraction: Dropout rate applied after the dense layer.

    Returns:
        A model compiled with Adadelta (learning rate 1.0), binary
        cross-entropy loss, and the accuracy metric.
    """
    token_ids = Input(shape=(max_len,))
    embedded = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    )(token_ids)

    # Convolutional branch: n-gram features, max-pooled over time.
    conv_maps = Conv1D(
        filters=filters,
        kernel_size=filter_width,
        activation="relu",
    )(embedded)
    conv_vector = GlobalMaxPooling1D()(conv_maps)

    # Recurrent branch: BiLSTM states attend over themselves, then max-pool.
    recurrent_states = Bidirectional(
        LSTM(hidden_units, return_sequences=True)
    )(embedded)
    attended_states = Attention()([recurrent_states, recurrent_states])
    recurrent_vector = GlobalMaxPooling1D()(attended_states)

    # Merge both branches and classify.
    merged = Concatenate()([conv_vector, recurrent_vector])
    hidden = Dense(hidden_units, activation="relu")(merged)
    regularised = Dropout(dropout_fraction)(hidden)
    probability = Dense(1, activation="sigmoid")(regularised)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(
        optimizer=Adadelta(learning_rate=1.0),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model



In [4]:
# Hyperparameter tuning with grid search
def tune_hyperparameters(X_train, y_train, X_val, y_val, vocab_size, max_len, embedding_matrix, embedding_dim,
                         param_grid=None, epochs=5, batch_size=32):
    """Grid-search the model hyperparameters and return the best fitted model.

    One model is built and fitted per combination of ``filters``,
    ``filter_width``, ``hidden_units`` and ``dropout_fraction``. Combinations
    are ranked by the highest per-epoch validation accuracy seen during their
    fit; on ties the earliest combination wins.

    Note that the returned model carries the weights from its last epoch,
    which is not necessarily the epoch that produced the reported accuracy.

    Args:
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        vocab_size, max_len, embedding_matrix, embedding_dim: Forwarded to
            ``build_model_with_embeddings``.
        param_grid: Optional mapping that overrides the candidate values of
            any of the four hyperparameters; missing keys keep their defaults.
        epochs: Epochs each candidate is trained for.
        batch_size: Batch size used for each candidate.

    Returns:
        The fitted model of the best-scoring combination.

    Raises:
        ValueError: If ``param_grid`` has an unknown key, or if no candidate
            could be selected (for example, an empty list of values).
    """
    search_space = {
        'filters': [64, 128],
        'filter_width': [3, 5],
        'hidden_units': [64, 128],
        'dropout_fraction': [0.3, 0.5]
    }
    if param_grid:
        unknown = set(param_grid) - set(search_space)
        if unknown:
            raise ValueError(f"Unknown hyperparameter(s) in param_grid: {sorted(unknown)}")
        search_space.update(param_grid)

    best_model = None
    # -inf rather than 0, so a candidate is still selected when every run scores 0.0.
    best_accuracy = float("-inf")

    # product() varies the last argument fastest, matching the nesting order
    # filters -> filter_width -> hidden_units -> dropout_fraction.
    combinations = itertools.product(
        search_space['filters'],
        search_space['filter_width'],
        search_space['hidden_units'],
        search_space['dropout_fraction'],
    )
    for filters, filter_width, hidden_units, dropout_fraction in combinations:
        print(f"Testing with filters={filters}, filter_width={filter_width}, hidden_units={hidden_units}, dropout_fraction={dropout_fraction}")
        model = build_model_with_embeddings(vocab_size, max_len, embedding_matrix, embedding_dim,
                                            filters=filters, filter_width=filter_width,
                                            hidden_units=hidden_units, dropout_fraction=dropout_fraction)
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                            epochs=epochs, batch_size=batch_size, verbose=0)
        val_accuracy = max(history.history['val_accuracy'])
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_model = model

    if best_model is None:
        raise ValueError(
            "No model was selected: the hyperparameter grid is empty or no run "
            "reported a comparable validation accuracy."
        )

    print(f"Best validation accuracy: {best_accuracy:.4f}")
    return best_model



In [10]:
# Main Workflow
filepath = "Sarcasm_Headlines_Dataset.json"
sentences, labels = load_data(filepath)

# Preprocess the data (this also pickles the fitted tokenizer to disk)
max_len = 30
max_words = 10000
padded, encoded_labels, tokenizer = preprocess(sentences, labels, max_words=max_words, max_len=max_len)

# Load pre-trained embeddings
embedding_dim = 300
embedding_matrix = load_pretrained_embeddings(tokenizer.word_index, embedding_dim)

# Split the data into train, validation, and test sets (80:10:10).
# Both splits are stratified so every subset keeps the class ratio of the data it is cut from;
# random_state keeps the split repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    padded, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Compute class weights; enumerate() maps them to the class ids 0..n-1
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Perform grid search to find the best model.
# The embedding matrix has one row per word_index entry plus row 0, hence the + 1.
vocab_size = len(tokenizer.word_index) + 1
best_model = tune_hyperparameters(X_train, y_train, X_val, y_val, vocab_size, max_len, embedding_matrix, embedding_dim)

# Train the best model with early stopping.
# best_model comes back from the search already fitted, so this continues training from
# those weights, now with class weights applied.
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
best_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    class_weight=class_weights_dict,
    callbacks=[early_stopping],
)

# Evaluate the model on the test set
test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}, Test Loss: {test_loss:.2f}")

# Save the final model
best_model.save("sarcasm_model_with_tuning.keras")

EOFError: unexpected end of input; is count incorrect or file otherwise damaged?