In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.utils import to_categorical

In [2]:
# Remote JSON Lines file (one record per line, hence lines=True).
# The address is split across literals purely for readability; the pieces
# concatenate back into the single original URL.
url = (
    "https://raw.githubusercontent.com/cbtn-data-science-ml/"
    "tensorflow-professional-developer/main/"
    "18_expand_model_vocab_w_news_headlines/"
    "News_Category_Dataset_Reduced_25.json"
)
df = pd.read_json(path_or_buf=url, lines=True)

In [3]:
# Pull the targets (category names) and the model inputs (headline text)
# out of the frame as plain Python lists, keeping row order aligned.
labels = df["category"].to_list()
sentences = df["headline"].to_list()

In [31]:
# Display the loaded frame; pandas abbreviates the repr to the first and
# last rows, so this does not print every record.
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
52376,https://www.huffingtonpost.com/entry/richard-b...,GOP Senators No Longer Want The People To Deci...,POLITICS,That Merrick Garland talking point is now obso...,Sam Stein,2016-11-01
52377,https://www.huffingtonpost.com/entry/florissan...,A City Near Ferguson Is Still Caging Humans In...,POLITICS,Lawsuit alleges the city of Florissant was run...,Mariah Stewart and Ryan J. Reilly,2016-11-01
52378,https://www.huffingtonpost.com/entry/snickers-...,2 Michigan Families Claim They Found Nails In ...,CRIME,A firefighter and the mother of a toddler repo...,David Moye,2016-11-01
52379,https://www.huffingtonpost.com/entry/mitt-romn...,Mitt Romney Dresses As Mitt Romney For Halloween,POLITICS,"The Romneys decided to be their alter egos, so...",Carla Herreria,2016-11-01


In [33]:
# Learn the category -> integer id mapping, then apply it to every label.
# The fitted encoder is kept so ids can be mapped back to category names.
label_encoder = LabelEncoder().fit(labels)
integer_encoded = label_encoder.transform(labels)

# Expand the integer ids into one-hot rows for the softmax/cross-entropy head.
labels_encoded = to_categorical(integer_encoded)

In [34]:
# max_tokens caps the vocabulary size; max_len is the fixed number of token
# ids emitted per headline.
max_tokens, max_len = 20_000, 50

# Text -> integer token ids, one fixed-length row per input string.
vectorize_layer = TextVectorization(
    output_mode="int",
    output_sequence_length=max_len,
    max_tokens=max_tokens,
)

In [35]:
# Build the layer's vocabulary from the headline corpus.
# NOTE(review): `sentences` holds every row of the frame, and the
# train/test split happens in a later cell, so the vocabulary is learned
# from rows that end up in the test split as well.
vectorize_layer.adapt(sentences)

In [36]:
# Convert every headline into a row of integer token ids; the layer was
# configured with output_sequence_length=max_len, so each row has max_len ids.
vectorized_text = vectorize_layer(sentences)

In [37]:
# Sanity check: the second dimension is the per-headline sequence length and
# should equal max_len.
vectorized_text.shape[1]

50

In [38]:
# Hold out 20% of the rows for evaluation. The token-id tensor is converted
# to a NumPy array first; random_state pins the shuffle so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_text.numpy(),
    labels_encoded,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Seed the Python, NumPy and backend RNGs before any layer is created so
# that weight initialisation and training-time shuffling start from the same
# state on every Restart & Run All.
set_random_seed(42)

# Bag-of-embeddings classifier: embed each token id, average the embeddings
# over the sequence axis, then classify with a small dense head.
model = Sequential([
    # input_dim must cover every id the vectorizer can emit (at most max_tokens).
    Embedding(input_dim=max_tokens, output_dim=16),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    # One softmax unit per one-hot label column.
    Dense(labels_encoded.shape[1], activation='softmax')
])

In [39]:
# Width of a single one-hot label row, i.e. the number of output classes the
# final Dense layer needs.
len(labels_encoded[0])

41

In [43]:
# categorical_crossentropy matches the one-hot targets produced by
# to_categorical; accuracy is tracked alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [44]:
# Train for 10 epochs and keep the per-epoch metrics in `history`.
# NOTE(review): the held-out test split is also used as validation data, so
# the val_* numbers stop being an independent estimate if they guide tuning.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
1310/1310 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - accuracy: 0.3364 - loss: 2.8217 - val_accuracy: 0.3438 - val_loss: 2.6142
Epoch 2/10
1310/1310 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - accuracy: 0.3488 - loss: 2.5589 - val_accuracy: 0.3843 - val_loss: 2.4347
Epoch 3/10
1310/1310 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - accuracy: 0.3947 - loss: 2.3657 - val_accuracy: 0.4183 - val_loss: 2.3122
Epoch 4/10
1310/1310 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - accuracy: 0.4345 - loss: 2.2205 - val_accuracy: 0.4346 - val_loss: 2.1810
Epoch 5/10
1310/1310 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - accuracy: 0.4732 - loss: 2.0486 - val_accuracy: 0.4853 - val_loss: 2.0403
Epoch 6/10
1310/1310 ━━━━━━━━━━━━━━━━━━━━ 2s 1ms/step - accuracy: 0.5256 - loss: 1.8685 - val_accuracy: 0.5026 - val_loss: 1.9443
Epoch 7/10
[1m1