In [None]:
import pandas as pd
import re
import numpy as np
import random
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score

# Read the raw articles table; the file is stored in a Latin-1 style
# single-byte encoding, so the encoding is passed explicitly.
df = pd.read_csv(
    filepath_or_buffer="Articles.csv",
    encoding="ISO-8859-1",
)

# Text Preprocessing Function
def clean_text(text):
    """Lower-case *text* and strip everything except a-z and whitespace.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            what pandas produces for empty CSV cells) are treated as empty
            text; any other non-string value is converted with ``str`` first.

    Returns:
        The cleaned string. Removed characters are dropped, not replaced, so
        the whitespace of the input is preserved as-is.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text

# Normalise every article in place before it is used as model input
df["Article"] = df["Article"].map(clean_text)

# Selecting features and labels:
#   X - article text as features
#   y - target labels
X, y = df["Article"], df["NewsType"]

# Data Augmentation: Duplicate and slightly modify some entries
def augment_text(text):
    """Return *text* with one randomly chosen word spelled backwards.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed in the result. Texts of five words or
    fewer are only re-joined; no word is reversed and the random generator
    is not consulted.
    """
    tokens = text.split()
    if len(tokens) <= 5:
        return ' '.join(tokens)

    # Pick one position uniformly at random and flip that word.
    pick = random.randint(0, len(tokens) - 1)
    tokens[pick] = ''.join(reversed(tokens[pick]))
    return ' '.join(tokens)

# Splitting data into training and testing sets.
# The split is done BEFORE augmentation on purpose:
#   * X and y are views of the un-augmented frame, so augmented rows appended
#     to df afterwards would never reach the model;
#   * augmenting first would place perturbed copies of test articles in the
#     training set and inflate the reported accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Data augmentation (training split only): append one perturbed copy of every
# training article, carrying over the label of the article it came from.
random.seed(42)  # keep the perturbation reproducible, like random_state above
X_train_augmented = X_train.apply(augment_text)
X_train = pd.concat([X_train, X_train_augmented], ignore_index=True)
y_train = pd.concat([y_train, y_train], ignore_index=True)

# Convert text data into sequences for neural networks.
# The tokenizer vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform input length.
# Using the single longest article as the width lets one outlier decide how
# many timesteps the LSTM has to unroll for every sample. Instead the width is
# the 95th percentile of the training lengths, never above a hard cap; longer
# articles keep their first `max_length` tokens (truncating='post').
MAX_SEQUENCE_LENGTH_CAP = 500
LENGTH_PERCENTILE = 95
train_lengths = [len(seq) for seq in X_train_seq]
max_length = max(1, min(MAX_SEQUENCE_LENGTH_CAP, int(np.percentile(train_lengths, LENGTH_PERCENTILE))))
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

# Encode labels: the encoder is fitted on the training labels and reused for
# the test labels so both share the same class -> integer mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# The output layer must have exactly one unit per class the encoder can emit.
num_classes = len(label_encoder.classes_)

# Define Neural Network Model
model = Sequential([
    # mask_zero=True marks index 0 (the padding value) as "no input", so the
    # LSTMs skip the post-padding instead of running over it and washing out
    # their final state. The sequence length is inferred from the data.
    Embedding(input_dim=10000, output_dim=128, mask_zero=True),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Labels are integer-encoded, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the network; the held-out split is passed as validation data so its
# loss/accuracy are reported after every epoch.
model.fit(
    X_train_padded,
    y_train_encoded,
    validation_data=(X_test_padded, y_test_encoded),
    epochs=5,
    batch_size=32,
)

# Score the held-out split: take the highest-probability class per row and
# compare it with the encoded true labels.
y_pred_nn = model.predict(X_test_padded)
y_pred_nn_classes = y_pred_nn.argmax(axis=1)
nn_accuracy = accuracy_score(y_test_encoded, y_pred_nn_classes)
print(f"Neural Network Accuracy: {nn_accuracy:.4f}")

# Function to predict class of a given text
def predict_news_category(text):
    """Return the predicted news category label for a raw text string.

    The text goes through the same steps as the training data: it is cleaned
    with ``clean_text``, turned into an index sequence by the module-level
    ``tokenizer``, and post-padded/truncated to ``max_length``. The class with
    the highest score from ``model`` is mapped back to its original label via
    ``label_encoder``.
    """
    cleaned = clean_text(text)
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=max_length,
        padding='post',
        truncating='post',
    )
    scores = model.predict(padded)
    best_index = np.argmax(scores)
    return label_encoder.inverse_transform([best_index])[0]

# Smoke test: classify one hand-written sentence and show the result
sample_text = "The star player scores a winning goal in the final match."
predicted_class_nn = predict_news_category(text=sample_text)
print(f"Predicted News Category (Neural Network): {predicted_class_nn}")




Epoch 1/5
68/68 ━━━━━━━━━━━━━━━━━━━━ 326s 5s/step - accuracy: 0.5123 - loss: 0.6944 - val_accuracy: 0.5232 - val_loss: 0.6923
Epoch 2/5
68/68 ━━━━━━━━━━━━━━━━━━━━ 568s 8s/step - accuracy: 0.5068 - loss: 0.6933 - val_accuracy: 0.4768 - val_loss: 0.6932
Epoch 3/5
27/68 ━━━━━━━━━━━━━━━━━━━━ 3:41 5s/step - accuracy: 0.4805 - loss: 0.6935