In [1]:
# Requires TensorFlow. Install it into the active environment first:
#   conda install tensorflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Load the raw news dataset from a path relative to the working directory.
df = pd.read_csv('./News_Category_Dataset_v3.csv')

# Parse the 'date' column once, expose its calendar parts as separate
# year / month / day columns, and drop the original 'date' column.
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
).drop(columns='date')

# Encode categorical variables (text columns are excluded for now).
# 'authors' is treated as the only categorical non-text column.
#
# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, where passing it
# raises a TypeError. Try the current keyword first and fall back to the old
# one so this cell runs on either side of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# NOTE(review): a dense one-hot matrix has one column per distinct author, so
# it can become very large. Nothing visible in this cell consumes
# `authors_encoded` yet -- confirm it is still needed before keeping it dense.
authors_encoded = encoder.fit_transform(df[['authors']])
# Other object-type columns are assumed to be dropped or encoded elsewhere.

# Build the training target in two steps:
#   1. map every category name to an integer class id;
#   2. expand those ids into one-hot rows, the label format that the
#      categorical cross-entropy loss is trained against.
category_labels = df['category']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(category_labels)
y_categorical = to_categorical(y_encoded)

# Feature matrix: every remaining column except the target and the
# text-like columns listed below.
excluded_columns = ['category', 'authors', 'headline', 'short_description']
X = df.drop(columns=excluded_columns)
# The one-hot author columns are not appended yet; kept here for reference:
# X = pd.concat([X, pd.DataFrame(authors_encoded)], axis=1)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(X, y_categorical, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so the test rows never influence
# the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import set_random_seed

# Seed Python's `random`, NumPy and TensorFlow before any layer is created.
# Without this, weight initialisation and the per-epoch shuffling in fit()
# differ on every run, so the reported accuracy is not reproducible even
# though the train/test split itself is pinned with random_state=42.
set_random_seed(42)

n_features = X_train_scaled.shape[1]  # width of the scaled feature matrix
n_classes = y_categorical.shape[1]    # one output unit per one-hot label column

# Small fully connected classifier: two ReLU hidden layers followed by a
# softmax layer that outputs one probability per class.
model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

# categorical_crossentropy expects one-hot targets, which is what
# y_categorical provides.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Train for 10 epochs in mini-batches of 32; validation_split makes Keras
# hold back 20% of the training rows for validation.
fit_options = {'epochs': 10, 'batch_size': 32, 'validation_split': 0.2}
history = model.fit(X_train_scaled, y_train, **fit_options)

# Score the trained model on the held-out test rows. evaluate() returns the
# loss followed by the compiled metrics, here just accuracy.
test_scores = model.evaluate(X_test_scaled, y_test)
loss, accuracy = test_scores
print(f"Test accuracy: {accuracy * 100:.2f}%")


2024-02-06 14:08:13.416855: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                448       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 42)                1386      
                                                                 
Total params: 3914 (15.29 KB)
Trainable params: 3914 (15.29 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 29.35%
