In [None]:
# Import necessary libraries
import re

import fasttext
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.layers import (
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    GlobalMaxPooling1D,
    Input,
    Reshape,
)
from tensorflow.keras.models import Sequential

# Fetch the NLTK resources this script needs: the stopword corpus and WordNet
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# ----------------------------------------------
# Data Loading and Initial Preprocessing
# ----------------------------------------------

# Load both source files and tag every row with its class in a new 'value'
# column: '1' marks true news, '0' marks fake news.
df_true = pd.read_csv('True.csv').assign(value='1')
df_fake = pd.read_csv('Fake.csv').assign(value='0')

# Stack the two labelled frames row-wise into a single dataset
labelled_frames = [df_true, df_fake]
df = pd.concat(labelled_frames, axis=0)

# ----------------------------------------------
# Text Cleaning and Preprocessing
# ----------------------------------------------

# Function to clean text data
def clean_text(text):
    """Return *text* with every character except ASCII letters and whitespace removed.

    Args:
        text: The raw string to clean. Non-string values (for example the
            float NaN that pandas uses for a missing cell, or None) are
            treated as empty text.

    Returns:
        The cleaned string, or '' when *text* is not a string.
    """
    # Missing cells are not strings and would make re.sub raise TypeError,
    # so map them to empty text instead of crashing the whole column apply.
    if not isinstance(text, str):
        return ''
    return re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters

# Strip non-letter characters from every title, then split it on whitespace
tokenizer = WhitespaceTokenizer()
df['title'] = df['title'].apply(clean_text).apply(tokenizer.tokenize)

# Shared resources for the stopword-removal and lemmatization step
lemmatizer = WordNetLemmatizer()
stopwords_list = set(stopwords.words('english'))

def preprocess_text(tokens):
    """Lowercase, filter and lemmatize a list of tokens.

    Tokens that are not purely alphabetic, or whose lowercase form is in
    ``stopwords_list``, are dropped. Each remaining lowercase token is passed
    through ``lemmatizer.lemmatize``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of lemmatized, lowercase tokens in their original order.
    """
    kept = []
    for token in tokens:
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stopwords_list:
            continue
        kept.append(lowered)
    return [lemmatizer.lemmatize(word) for word in kept]

# Replace each token list with its filtered, lemmatized version
processed_titles = df['title'].apply(preprocess_text)
df['title'] = processed_titles

# Persist the preprocessed dataset (without the index column)
df.to_csv(path_or_buf='preprocessed_news.csv', index=False)

# ----------------------------------------------
# FastText Vectorization
# ----------------------------------------------

# Prepare the training data file for FastText: one line per title in the
# form "__label__<value> <token> <token> ...".
# Labels and titles are iterated in lockstep with zip() instead of a
# positional .iloc lookup per row, and the encoding is fixed so the file does
# not depend on the platform default.
with open('fasttext_training_data.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        f"__label__{value} {' '.join(str(token) for token in title)}\n"
        for value, title in zip(df['value'], df['title'])
    )

# Train the FastText model
# NOTE(review): fasttext_model is not referenced again in the visible code —
# confirm whether this supervised model is still needed.
fasttext_model = fasttext.train_supervised(input='fasttext_training_data.txt')

# Load the pre-trained FastText word embeddings
ft = fasttext.load_model('cc.en.300.bin')
embedding_dim = ft.get_dimension()

# Obtain the vector representation for each title: the mean of its word
# vectors. A title with no tokens left after preprocessing has nothing to
# average (np.mean of an empty list is a scalar NaN, which would make the
# stacked array ragged), so it is represented by a zero vector instead.
title_vectors = []
for title in df['title']:
    if len(title) > 0:
        embeddings = [ft.get_word_vector(str(word)) for word in title]
        title_vector = np.mean(embeddings, axis=0)
    else:
        title_vector = np.zeros(embedding_dim, dtype=np.float32)
    title_vectors.append(title_vector)

# Convert the list of title vectors to a NumPy array of shape
# (number of titles, embedding_dim)
X = np.array(title_vectors)
y = df['value'].values

# ----------------------------------------------
# Shuffle and Split Data
# ----------------------------------------------

# Shuffle features and labels together
X, y = shuffle(X, y, random_state=42)

# Hold out 20% for testing, then carve 25% of the remainder off for validation
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=42
)

# Convert the target values to float
y_train, y_val, y_test = (
    labels.astype(float) for labels in (y_train, y_val, y_test)
)

# ----------------------------------------------
# Define and Compile the CNN Model
# ----------------------------------------------

# Define the CNN architecture.
# X holds one dense, real-valued FastText vector per title, so there are no
# integer token ids for an Embedding layer to look up (and X.shape[0] is the
# number of samples, not a vocabulary size). The model therefore takes the
# title vector directly and adds a trailing channel axis so Conv1D can slide
# along its components.
# NOTE(review): the convolution runs over embedding components, not over
# words — confirm this is the intended architecture.
n_features = X.shape[1]
model = Sequential([
    Input(shape=(n_features,)),
    Reshape((n_features, 1)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # single probability for binary output
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# ----------------------------------------------
# Train the Model
# ----------------------------------------------

# Fit on the training split, using the validation split for monitoring
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

# ----------------------------------------------
# Evaluate the Model
# ----------------------------------------------

# Score the trained model on the held-out test split
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob.flatten() > 0.5).astype(int)

# Recompute accuracy with scikit-learn from the thresholded predictions
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
