# RESTART CNN LEARNING

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, TextVectorization
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

2025-08-22 14:16:23.396475: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dataset_dir = os.path.join(os.getcwd(), "dataset")
training_data_file = os.path.join(dataset_dir, "articles-validation-bypublisher-20181122-html-escaped.xml")
target_data_file = os.path.join(dataset_dir, "ground-truth-validation-bypublisher-20181122.xml")

In [3]:
datadf = pd.read_xml(path_or_buffer = training_data_file)
targetdf = pd.read_xml(path_or_buffer = target_data_file)
df = pd.concat([datadf, targetdf], axis = 1)
df.columns = [
    'id',
    'published-at',
    'title',
    'article',
    'id2',
    'hyperpartisan',
    'bias',
    'url',
    'labeled-by'
]

- 80/20 test/train split before fitting tokenizer
- tensorflow.keras.preprocessing.text.Tokenizer is deprecated and is not recommended for new code. Exploring [tf.keras.layers.TextVectorization](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization) instead

In [4]:
bias_mapping = {'left': 0, 'left-center': 1, 'least': 2, 'right-center': 3, 'right': 4 }
bias_mapping_reverse = dict((v,k) for k,v in bias_mapping.items())
df['bias'] = df['bias'].map(bias_mapping)

x_train, x_test, y_train, y_test = train_test_split(df['article'], df['bias'], test_size=0.2, random_state=42, stratify = df['bias'])

In [46]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(120000,)
(30000,)
(120000,)
(30000,)


#### DEPRECATED
```python
tokenizer = Tokenizer(num_words = 1000)
tokenizer.fit_on_texts(x_train)
x_train_seq = tokenizer.texts_to_sequences(x_train)
```

#### TextVectorization
 - output_mode = `int` since the order of words in the text changes their context. Will use an Embedding layer for blah de blah.. why is the embedding layer important here? What does it do????

In [6]:
VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 1000
TRAIN_SET_SIZE = 60000

int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [7]:
# Hits OOM error with set size = 120000
# Solution is to stream the records using a TFRecordDataset as described here https://www.tensorflow.org/tutorials/load_data/tfrecord
# However, I'll reduce the size of the training set instead.
int_vectorize_layer.adapt(x_train[:TRAIN_SET_SIZE])

In [66]:
print(int_vectorize_layer(x_train.iloc[[0]]).numpy())

[[  43    1    1    1  399    1  662    1 1092    9    3    1 1116    5
   113 1861    1    8   23   48    1    7    1    1    6 1849    1   75
     2  813  196   28    1    7    3    1    1    1    1    5    1    6
     1    1    1    1    1    1  399    1    1    2    1  364    5    1
     9    1    1   46   36  153    4 1487   38    1   37  399    1  261
    67    1    1   76 1614    1   17    1   14 1910 1126   30  311   24
   656 1276  585    1  829    7  334    6    1    3    1    1  974   79
    18    1   92  247  130   26   24  399    6   77   24  399    1   24
   714   45    5    2    1 1505    7    2  343    5   68    1    1   17
     3    1    6   24  205  860   63  188    9    3    1    5    1 1632
     4    2  182    1  393    9   82    3    1    1    5    1  616    1
  1629    1    6    1    1   10    2    1 1451    6   12    1   51    9
    55  111  705   38    1  343   76    1    1    1    9    2    1    1
  1213    5  113  343    6 1184   45 1478    5   30    1   24   

In [None]:
int_vectorize_layer.get_vocabulary()

In [8]:
model = Sequential([
    Embedding(VOCAB_SIZE, MAX_SEQUENCE_LENGTH),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(5, activation='softmax')
])

In [9]:
v_train = int_vectorize_layer(x_train[:TRAIN_SET_SIZE])

In [10]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#model.fit(v_train, y_train[:1000], batch_size=32, epochs=5, validation_split=0.2)

In [11]:
# TBD: plot the curve and comment on the point at which the validation loss starts increasing.
# TBD: save the best model, and retreive it!

v_train_y = tf.keras.utils.to_categorical(y_train[:TRAIN_SET_SIZE])
model.fit(v_train, v_train_y, batch_size=32, epochs=7, validation_split=0.2)

Epoch 1/7


2025-08-22 14:17:54.788022: I external/local_xla/xla/service/service.cc:163] XLA service 0x7f7724159480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-08-22 14:17:54.788063: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA RTX A2000, Compute Capability 8.6
2025-08-22 14:17:54.812942: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-08-22 14:17:54.939619: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91200
2025-08-22 14:17:55.029313: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-08-22 14:17:55.029423: I external

[1m   2/1500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:49[0m 73ms/step - accuracy: 0.2188 - loss: 1.6051 

I0000 00:00:1755897480.277848   23127 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 34ms/step - accuracy: 0.7574 - loss: 0.6320 - val_accuracy: 0.8538 - val_loss: 0.3965
Epoch 2/7
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 32ms/step - accuracy: 0.8943 - loss: 0.3138 - val_accuracy: 0.8634 - val_loss: 0.3818
Epoch 3/7
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 35ms/step - accuracy: 0.9450 - loss: 0.1631 - val_accuracy: 0.8671 - val_loss: 0.4237
Epoch 4/7
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 34ms/step - accuracy: 0.9685 - loss: 0.0938 - val_accuracy: 0.8699 - val_loss: 0.5096
Epoch 5/7
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 36ms/step - accuracy: 0.9762 - loss: 0.0704 - val_accuracy: 0.8713 - val_loss: 0.5492
Epoch 6/7
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 34ms/step - accuracy: 0.9818 - loss: 0.0570 - val_accuracy: 0.8668 - val_loss: 0.6324
Epoch 7/7
[1m1500/1

<keras.src.callbacks.history.History at 0x7f7968050bf0>

# END CNN LEARNING

In [1]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import time
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [23]:
if 'df' not in locals() and 'df' not in globals():
    df = pd.read_pickle(os.path.join(os.getcwd(), "dataset", "articles_dataframe.pkl"))
else:
    print("df already loaded.. not loading again")

if 'bag_of_words_df' not in locals() and 'bag_of_words_df' not in globals():
    bag_of_words_df = pd.read_pickle(os.path.join(os.getcwd(), "dataset", "articles_tfidf_vectorized.pkl"))
else:
    print("bag_of_words_df already loaded.. not loading again")

x_train, x_test, y_train, y_test = train_test_split(bag_of_words_df,
                                                df['bias'],
                                                test_size=0.2,
                                                random_state=42,
                                                stratify=df['bias'])

df already loaded.. not loading again
bag_of_words_df already loaded.. not loading again


In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

2025-08-21 18:11:54.010846: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
vocab_size = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

In [26]:
model = Sequential([
    Embedding(vocab_size, 100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [None]:
x_train

In [25]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=5, validation_split=0.2)

Epoch 1/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 9ms/step - accuracy: 0.1575 - loss: -16494510080.0000 - val_accuracy: 0.1522 - val_loss: -73888587776.0000
Epoch 2/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 9ms/step - accuracy: 0.1576 - loss: -306588188672.0000 - val_accuracy: 0.1522 - val_loss: -684273041408.0000
Epoch 3/5
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 9ms/step - accuracy: 0.1576 - loss: -1422149550080.0000 - val_accuracy: 0.1522 - val_loss: -2440861581312.0000
Epoch 4/5
[1m  77/3000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22s[0m 8ms/step - accuracy: 0.1565 - loss: -2469632409600.0000

KeyboardInterrupt: 

In [14]:
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8887 - loss: 0.4749
Test Accuracy: 0.8887


In [20]:
x_train.shape

(25000, 500)

In [22]:
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   