In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Importing specific components from NLTK for text processing
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Importing components from TensorFlow Keras for building and training neural networks
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional

# Importing components from scikit-learn for data splitting and performance evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [2]:
# Load the dataset and drop rows with missing values.
# The index is reset after dropping so that row labels are contiguous
# (0..n-1) again; otherwise any later access by integer label, such as
# series[i] for i in range(len(series)), raises KeyError at the gaps.
news_df = (
    pd.read_csv("/content/News/True.csv")
    .dropna()
    .reset_index(drop=True)
)

# Vocabulary size used when hashing words to integer indices
vocab_size = 5000

# Features: every column except the 'subject' label column
X = news_df.drop(columns=["subject"])

# Binary target: 1 for "politicsNews", 0 for any other subject
y = [1 if subject == "politicsNews" else 0 for subject in news_df["subject"]]

# Display the first few rows of the DataFrame
news_df.head()


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [3]:
# Copy the 'title' column from the features DataFrame
news_title = X['title'].copy()

# Download stopwords from NLTK
nltk.download("stopwords")

# Initialize the Porter Stemmer
ps = PorterStemmer()

# Load the English stopword list once, as a set. Calling
# stopwords.words("english") inside the loop re-reads the list for every
# single token and does a linear scan on each membership test.
english_stopwords = set(stopwords.words("english"))

# Cleaned titles, one string per row, in the same order as news_title
corpus = []

# Iterate over the values directly rather than news_title[i]: integer
# indexing on a Series is label-based, so it breaks with KeyError when the
# index has gaps (e.g. after rows were dropped upstream).
for raw_title in news_title:
    # Replace every non-alphabetic character with a space
    letters_only = re.sub("[^A-Za-z]", " ", raw_title)

    # Lowercase and split on whitespace into individual words
    tokens = letters_only.lower().split()

    # Drop stopwords, then stem the remaining words
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]

    # Re-join the stems into a single space-separated string
    corpus.append(" ".join(stems))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Encode each cleaned title as a list of integer word indices.
# Despite its name, one_hot hashes every word to an index below vocab_size;
# it does not build one-hot vectors.
corpus_data = [one_hot(title, vocab_size) for title in corpus]
print(corpus_data[0])  # Print the first encoded sequence

# Fixed sequence length used for padding and for the Embedding input
max_len = 20
print(max_len)  # Print the fixed maximum length

# Pad sequences to max_len so every sample has the same length
# (the printed sample shows zeros added at the front).
corpus_padded = pad_sequences(corpus_data, maxlen=max_len)
print(corpus_padded[0])  # Print the first padded sequence

# Build the bidirectional LSTM classifier
model = Sequential()
model.add(Embedding(vocab_size, 300, input_length=max_len))  # Embedding layer
model.add(Dropout(0.3))  # Dropout layer
model.add(Bidirectional(LSTM(100, kernel_regularizer=l2(0.01))))  # LSTM layer
model.add(Dropout(0.3))  # Dropout layer
model.add(Dense(1, activation="sigmoid"))  # Output layer for binary classification
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", "precision", "recall"],
)

# Prepare data for training
X_final = np.array(corpus_padded)
y_final = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=2
)
print(X_train.shape, y_train.shape, X_test.shape)  # Print shapes of training and test sets

# Train the model. validation_data is passed as an (x, y) tuple.
# NOTE(review): the test split doubles as the validation set here; use a
# separate validation split if early stopping or tuning is added later.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=True,
    batch_size=64,
)

# Review the architecture. summary() prints the table itself and returns
# None, so wrapping it in print() would emit a stray "None" line.
model.summary()


[3231, 1283, 4413, 3411, 665, 2695, 4645, 3388]
20
[   0    0    0    0    0    0    0    0    0    0    0    0 3231 1283
 4413 3411  665 2695 4645 3388]
(17133, 20) (17133,) (4284, 20)
Epoch 1/10




[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 136ms/step - accuracy: 0.7537 - loss: 2.4739 - precision: 0.7478 - recall: 0.7997 - val_accuracy: 0.9078 - val_loss: 0.2462 - val_precision: 0.9339 - val_recall: 0.8824
Epoch 2/10
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 134ms/step - accuracy: 0.9297 - loss: 0.2113 - precision: 0.9330 - recall: 0.9366 - val_accuracy: 0.9080 - val_loss: 0.2414 - val_precision: 0.9233 - val_recall: 0.8947
Epoch 3/10
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 125ms/step - accuracy: 0.9397 - loss: 0.1803 - precision: 0.9438 - recall: 0.9429 - val_accuracy: 0.9057 - val_loss: 0.2512 - val_precision: 0.9345 - val_recall: 0.8774
Epoch 4/10
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 123ms/step - accuracy: 0.9421 - loss: 0.1696 - precision: 0.9424 - recall: 0.9475 - val_accuracy: 0.9111 - val_loss: 0.2516 - val_precision: 0.8940 - val_recall: 0.9376
Epoch 5/10
[1m268/268

None


In [7]:
# Score the held-out test set with the trained model
y_pred_prob = model.predict(X_test)

# Threshold the predicted probabilities at 0.5 to get 0/1 class labels
y_pred = (y_pred_prob > 0.5).astype("int")

# Compute accuracy, precision and recall against the true test labels
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

# Report each metric as a percentage with two decimal places
for template, score in (
    ("Accuracy:  {:.2f}%", accuracy),
    ("Precision: {:.2f}%", precision),
    ("Recall:    {:.2f}%", recall),
):
    print(template.format(score * 100))

[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step
Accuracy:  89.50%
Precision: 88.69%
Recall:    91.11%


In [6]:
################################################# END OF NOTEBOOK #################################################