In [1]:
import fasttext
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string
from collections import Counter

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load dataset
dsk = pd.read_excel('/kaggle/input/kurdishkdfnd/KDFND_Anlyzed_Cleaned_Filtered_Labeld.xlsx')


In [4]:
# Remove null Rows
dsk = dsk.dropna(subset=['Text'])
dsk = dsk.dropna(subset=['Text_Translate_to_English'])

# Remove columns that not needed
dsk["Article"] = dsk["Text_Translate_to_English"]
dsk['label'] = dsk['label'].map({'Real': 0, 'Fake': 1})  # Convert labels to 0 and 1
dsk = dsk[['Article', 'label']].dropna()


In [5]:
from sklearn.utils import resample
# Assuming 'dsk' is your DataFrame and you have a binary label column called 'label'
# Split the dataset into majority and minority classes
majority_class = dsk[dsk['label'] == 0]
minority_class = dsk[dsk['label'] == 1]

# Perform oversampling on the minority class # Sample with replacement  # Match majority size
###minority_oversampled = resample(minority_class,replace=True, n_samples=len(majority_class), random_state=42)  # For reproducibility
# Perform undersampling on the minority class
majority_undersampled = resample(majority_class,replace=True, n_samples=len(minority_class), random_state=42)  # For reproducibility

# Combine majority class with the oversampled minority class
###dskb = pd.concat([majority_class, minority_oversampled])
# Combine majority class with the undersampled minority class
#dskb = pd.concat([minority_class, majority_undersampled])

# Shuffle the dataset
#dskb = dskb.sample(frac=1, random_state=42).reset_index(drop=True)
#print("Balanced class distribution:")
#print(dskb['label'].value_counts())

In [6]:
# Text Preprocessing

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters
        text = text.lower().strip()
        words = text.split()
        #words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words or word in ['not', 'never']]
        return ' '.join(words)
    return ''

dsk['Article'] = dsk['Article'].apply(preprocess_text)

In [7]:
dsk['label1'] = "__label__" + dsk['label'].astype(str)

In [8]:
dsk['label_description'] = dsk['label1'].astype(str) + " " + dsk['Article'].astype(str)
dsk

Unnamed: 0,Article,label,label1,label_description
0,archive citizens burn iraqi passports together...,1,__label__1,__label__1 archive citizens burn iraqi passpor...
1,its still beautiful to put a heart on it,1,__label__1,__label__1 its still beautiful to put a heart ...
2,ranya hospital is very crowded according to th...,1,__label__1,__label__1 ranya hospital is very crowded acco...
3,now ranya,1,__label__1,__label__1 now ranya
4,good friends care about each other close frien...,1,__label__1,__label__1 good friends care about each other ...
...,...,...,...,...
100957,kirkuk a yearold man is being expelled from hi...,0,__label__0,__label__0 kirkuk a yearold man is being expel...
100958,an explosion occurred in the green area of bag...,0,__label__0,__label__0 an explosion occurred in the green ...
100959,what did they talk about read detailed informa...,0,__label__0,__label__0 what did they talk about read detai...
100960,iran announced the death of a demonstrator,0,__label__0,__label__0 iran announced the death of a demon...


In [9]:
train, test = train_test_split(dsk, test_size = 0.2)

In [10]:
train.to_csv("fake_news_train.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')
test.to_csv("fake_news_test.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')

In [11]:
# Train FastText Model
fasttext_model = fasttext.train_supervised(input="fake_news_train.txt", lr=0.5, epoch=25, wordNgrams=2, dim=300)
y_predic = fasttext_model.test("fake_news_test.txt")

In [12]:
N = y_predic[0]
P = y_predic[1]
R = y_predic[2]

print(f"No. of Test: {N:}")
print(f"Precision: {P:.6f}")
print(f"Recall: {R:.6f}")

print(f"F1-Score: {2*((P*R)/(P+R)):.6f}")


No. of Test: 20193
Precision: 0.778636
Recall: 0.778636
F1-Score: 0.778636


In [13]:
# Function to get FastText vector safely
def fasttext_vector(text):
    if isinstance(text, str):  # Ensure text is a string
        text = text.replace("\n", " ").strip()  # Remove newlines
        return fasttext_model.get_sentence_vector(text)
    return np.zeros(300)  # Return zero vector for empty/non-string values

# Apply FastText vectors to dataset
X_fasttext = np.array([fasttext_vector(text) for text in dsk['Article']])
# Function to get FastText vector
y = np.array(dsk['label'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_fasttext, y, test_size=0.2, random_state=42)

In [14]:
# Tokenization
max_words = 10000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(dsk['Article'])
X_sequences = tokenizer.texts_to_sequences(dsk['Article'])
X_padded = pad_sequences(X_sequences, maxlen=max_len)


In [15]:
# Embedding Matrix from FastText
embedding_matrix = np.zeros((max_words, 300))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_matrix[i] = fasttext_model.get_word_vector(word)


In [18]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





In [19]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m543s[0m 426ms/step - accuracy: 0.7977 - loss: 0.4599 - val_accuracy: 0.7158 - val_loss: 0.6216
Epoch 2/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m533s[0m 422ms/step - accuracy: 0.8439 - loss: 0.3742 - val_accuracy: 0.7595 - val_loss: 0.5486
Epoch 3/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m528s[0m 419ms/step - accuracy: 0.8635 - loss: 0.3299 - val_accuracy: 0.7122 - val_loss: 0.7321
Epoch 4/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m531s[0m 421ms/step - accuracy: 0.8804 - loss: 0.2937 - val_accuracy: 0.7656 - val_loss: 0.5694
Epoch 5/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m527s[0m 417ms/step - accuracy: 0.8941 - loss: 0.2630 - val_accuracy: 0.7246 - val_loss: 0.7162
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 81ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.8795, Precision: 0.8354, Recall: 0.9438, F1-

In [16]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    #LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [17]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 186ms/step - accuracy: 0.7984 - loss: 0.4518 - val_accuracy: 0.7668 - val_loss: 0.5210
Epoch 2/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 184ms/step - accuracy: 0.8431 - loss: 0.3736 - val_accuracy: 0.7845 - val_loss: 0.5336
Epoch 3/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 183ms/step - accuracy: 0.8646 - loss: 0.3227 - val_accuracy: 0.7606 - val_loss: 0.6001
Epoch 4/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 185ms/step - accuracy: 0.8835 - loss: 0.2782 - val_accuracy: 0.7667 - val_loss: 0.6138
Epoch 5/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 183ms/step - accuracy: 0.9024 - loss: 0.2352 - val_accuracy: 0.7517 - val_loss: 0.7226
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 37ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.8903, Precision: 0.8532, Recall: 0.9414, F1-

In [18]:
model = Sequential([
    #Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    LSTM(128, return_sequences=False, input_shape=(X_train.shape[1], 1)),
    Dropout(0.3),
    #LSTM(64),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile & Train
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(**kwargs)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model.fit(X_train, y_train, epochs=3, batch_size=64, validation_data=(X_test, y_test))


y_pred = (model.predict(X_test) > 0.5).astype(int)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")

Epoch 1/3
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 308ms/step - accuracy: 0.6733 - loss: 0.6376 - val_accuracy: 0.9429 - val_loss: 0.1884
Epoch 2/3
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 306ms/step - accuracy: 0.9439 - loss: 0.1947 - val_accuracy: 0.9497 - val_loss: 0.1753
Epoch 3/3
[1m 812/1262[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m2:03[0m 275ms/step - accuracy: 0.9475 - loss: 0.1838