In [1]:
import fasttext
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string
from collections import Counter


In [2]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load dataset
dse = pd.read_excel('/kaggle/input/kurdishenglish/KDFND_Anlyzed_Cleaned_Filtered_Labeld.xlsx')


In [4]:
dsk = dse.dropna(subset=['Text_Translate_to_English'])
dsk["Article"] = dsk["Text_Translate_to_English"]
dsk['label'] = dsk['label'].map({'Real': 0, 'Fake': 1})  # Convert labels to 0 and 1
dsk = dsk[['Article', 'label']].dropna()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsk["Article"] = dsk["Text_Translate_to_English"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsk['label'] = dsk['label'].map({'Real': 0, 'Fake': 1})  # Convert labels to 0 and 1


In [5]:
from sklearn.utils import resample
# Assuming 'dsk' is your DataFrame and you have a binary label column called 'label'
# Split the dataset into majority and minority classes
majority_class = dsk[dsk['label'] == 0]
minority_class = dsk[dsk['label'] == 1]

# Perform oversampling on the minority class # Sample with replacement  # Match majority size
###minority_oversampled = resample(minority_class,replace=True, n_samples=len(majority_class), random_state=42)  # For reproducibility
# Perform undersampling on the minority class
majority_undersampled = resample(majority_class,replace=True, n_samples=len(minority_class), random_state=42)  # For reproducibility

# Combine majority class with the oversampled minority class
###dskb = pd.concat([majority_class, minority_oversampled])
# Combine majority class with the undersampled minority class
dskb = pd.concat([minority_class, majority_undersampled])

# Shuffle the dataset
dskb = dskb.sample(frac=1, random_state=42).reset_index(drop=True)
print("Balanced class distribution:")
print(dskb['label'].value_counts())

Balanced class distribution:
label
1    50210
0    50210
Name: count, dtype: int64


In [6]:
# Text Preprocessing
#lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters
        text = text.lower().strip()
        words = text.split()
        #words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words or word in ['not', 'never']]
        return ' '.join(words)
    return ''

dskb['Article'] = dskb['Article'].apply(preprocess_text)

In [7]:
dskb['label1'] = "__label__" + dskb['label'].astype(str)


In [8]:
dskb['label_description'] = dskb['label1'].astype(str) + " " + dskb['Article'].astype(str)

dskb

Unnamed: 0,Article,label,label1,label_description
0,pieces of artifacts will be handed over to ira...,1,__label__1,__label__1 pieces of artifacts will be handed ...
1,because of the demonstration no one was arrest...,0,__label__0,__label__0 because of the demonstration no one...
2,the iraqi customs committee has increased its ...,0,__label__0,__label__0 the iraqi customs committee has inc...
3,other american b planes were sent to the gulf,0,__label__0,__label__0 other american b planes were sent t...
4,the minister of education has been building sc...,0,__label__0,__label__0 the minister of education has been ...
...,...,...,...,...
100415,a yearold man named osama jabbar mohammed a ba...,1,__label__1,__label__1 a yearold man named osama jabbar mo...
100416,nuri maliki the demonstrators we are doing you...,0,__label__0,__label__0 nuri maliki the demonstrators we ar...
100417,the worlds oil markets have declined and the d...,0,__label__0,__label__0 the worlds oil markets have decline...
100418,kurdistan parliament awards a shepherd for yea...,1,__label__1,__label__1 kurdistan parliament awards a sheph...


In [9]:
train, test = train_test_split(dskb, test_size = 0.2)

In [10]:
train.to_csv("fake_news_train.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')
test.to_csv("fake_news_test.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')

In [11]:
# Train FastText Model
fasttext_model = fasttext.train_supervised(input="fake_news_train.txt", lr=0.5, epoch=25, wordNgrams=2, dim=300)
y_predic = fasttext_model.test("fake_news_test.txt")

In [12]:
N = y_predic[0]
P = y_predic[1]
R = y_predic[2]

print(f"No. of Test: {N:}")
print(f"Precision: {P:.6f}")
print(f"Recall: {R:.6f}")

print(f"F1-Score: {2*((P*R)/(P+R)):.6f}")


No. of Test: 20084
Precision: 0.845449
Recall: 0.845449
F1-Score: 0.845449


In [13]:
# Function to get FastText vector safely
def fasttext_vector(text):
    if isinstance(text, str):  # Ensure text is a string
        text = text.replace("\n", " ").strip()  # Remove newlines
        return fasttext_model.get_sentence_vector(text)
    return np.zeros(300)  # Return zero vector for empty/non-string values

# Apply FastText vectors to dataset
X_fasttext = np.array([fasttext_vector(text) for text in dskb['Article']])
# Function to get FastText vector
y = np.array(dskb['label'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_fasttext, y, test_size=0.2, random_state=42)

In [14]:
# Tokenization
max_words = 10000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(dskb['Article'])
X_sequences = tokenizer.texts_to_sequences(dskb['Article'])
X_padded = pad_sequences(X_sequences, maxlen=max_len)


In [15]:
# Embedding Matrix from FastText
embedding_matrix = np.zeros((max_words, 300))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_matrix[i] = fasttext_model.get_word_vector(word)


In [18]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





In [19]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 443ms/step - accuracy: 0.7967 - loss: 0.4611 - val_accuracy: 0.8265 - val_loss: 0.3956
Epoch 2/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m552s[0m 440ms/step - accuracy: 0.8441 - loss: 0.3738 - val_accuracy: 0.8305 - val_loss: 0.3948
Epoch 3/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m550s[0m 438ms/step - accuracy: 0.8699 - loss: 0.3171 - val_accuracy: 0.8387 - val_loss: 0.3881
Epoch 4/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m551s[0m 439ms/step - accuracy: 0.8912 - loss: 0.2688 - val_accuracy: 0.8413 - val_loss: 0.3999
Epoch 5/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m551s[0m 438ms/step - accuracy: 0.9073 - loss: 0.2305 - val_accuracy: 0.8430 - val_loss: 0.4293
[1m3139/3139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 78ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.9149, Precision: 0.9332, Recall: 0.8937, F1-

In [16]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    #LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





In [17]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 216ms/step - accuracy: 0.7934 - loss: 0.4623 - val_accuracy: 0.8290 - val_loss: 0.3925
Epoch 2/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 209ms/step - accuracy: 0.8489 - loss: 0.3643 - val_accuracy: 0.8367 - val_loss: 0.3850
Epoch 3/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 212ms/step - accuracy: 0.8757 - loss: 0.3046 - val_accuracy: 0.8417 - val_loss: 0.3891
Epoch 4/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 203ms/step - accuracy: 0.9003 - loss: 0.2491 - val_accuracy: 0.8462 - val_loss: 0.4127
Epoch 5/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 211ms/step - accuracy: 0.9192 - loss: 0.2076 - val_accuracy: 0.8443 - val_loss: 0.4390
[1m3139/3139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 41ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.9211, Precision: 0.9235, Recall: 0.9183, F1-