# RESTART CNN LEARNING

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, TextVectorization
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

2025-08-24 15:10:18.656519: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
def dataset_file_path(fname):
    return os.path.join(os.path.join(os.getcwd(), "dataset", fname))

df = pd.read_pickle(dataset_file_path("articles_dataframe.pkl"))

bias_mapping = {'left': 0, 'left-center': 1, 'least': 2, 'right-center': 3, 'right': 4 }
bias_mapping_reverse = dict((v,k) for k,v in bias_mapping.items())

x_train, x_test, y_train, y_test = train_test_split(df['article'], df['bias'], test_size=0.2, random_state=42, stratify = df['bias'])

- 80/20 test/train split before fitting tokenizer
- tensorflow.keras.preprocessing.text.Tokenizer is deprecated and is not recommended for new code. Exploring [tf.keras.layers.TextVectorization](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization) instead

In [9]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(60000,)
(15000,)
(60000,)
(15000,)


In [11]:
# https://adriangb.com/scikeras/stable/migration.html#why-switch-to-scikeras
# requires `pip install scikeras`
from scikeras.wrappers import KerasClassifier

In [38]:
def create_model():
    model = Sequential()
    
    model.add(Embedding(5000, 1000))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64,activation='relu',kernel_initializer=kernel_initializer))
    model.add(Dropout(dropout))
    model.add(Dense(5, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [43]:
clf = KerasClassifier(build_fn=create_model,verbose=3, dropout=0.4)

In [44]:
int_vectorize_layer = TextVectorization(
    max_tokens=5000,
    output_mode='int',
    output_sequence_length=200)

int_vectorize_layer.adapt(x_train)

In [45]:
x_train_vec = int_vectorize_layer(x_train)

In [48]:
y_train_tf = tf.keras.utils.to_categorical(y_train)

In [46]:
param_grid = {
    'dropout': [0.5],
}

In [49]:
from sklearn.model_selection import GridSearchCV

cv_grid = GridSearchCV(
    clf,
    param_grid = param_grid,
    scoring = "accuracy",
    verbose = 3,
    n_jobs = 1)

cv_grid.fit(x_train_vec, y_train_tf)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got array([12000, 12001, 12002, ..., 59997, 59998, 59999])

In [33]:
cv_grid.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__model', 'estimator__build_fn', 'estimator__warm_start', 'estimator__random_state', 'estimator__optimizer', 'estimator__loss', 'estimator__metrics', 'estimator__batch_size', 'estimator__validation_batch_size', 'estimator__verbose', 'estimator__callbacks', 'estimator__validation_split', 'estimator__shuffle', 'estimator__run_eagerly', 'estimator__epochs', 'estimator__class_weight', 'estimator', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

#### TextVectorization
 - output_mode = `int` since the order of words in the text changes their context. Will use an Embedding layer for blah de blah.. why is the embedding layer important here? What does it do????

In [6]:
VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 1000
TRAIN_SET_SIZE = 60000

int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [7]:
# Hits OOM error with set size = 120000
# Solution is to stream the records using a TFRecordDataset as described here https://www.tensorflow.org/tutorials/load_data/tfrecord
# However, I'll reduce the size of the training set instead.
int_vectorize_layer.adapt(x_train[:TRAIN_SET_SIZE])

In [None]:
print(int_vectorize_layer(x_train.iloc[[0]]).numpy())

In [None]:
int_vectorize_layer.get_vocabulary()

In [8]:
model = Sequential([
    Embedding(VOCAB_SIZE, MAX_SEQUENCE_LENGTH),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(5, activation='softmax')
])

In [None]:
v_train = int_vectorize_layer(x_train[:TRAIN_SET_SIZE])

In [10]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#model.fit(v_train, y_train[:1000], batch_size=32, epochs=5, validation_split=0.2)

In [None]:
# TBD: plot the curve and comment on the point at which the validation loss starts increasing.
# TBD: save the best model, and retreive it!

v_train_y = tf.keras.utils.to_categorical(y_train[:TRAIN_SET_SIZE])
model.fit(v_train, v_train_y, batch_size=32, epochs=7, validation_split=0.2)