In [1]:
import fasttext
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string
from collections import Counter

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load dataset
dsk = pd.read_excel('/kaggle/input/kurdishkdfnd/KDFND_Anlyzed_Cleaned_Filtered_Labeld.xlsx')


In [4]:
dsk = dsk.dropna(subset=['Text'])
dsk["Article"] = dsk["Text"]
dsk['label'] = dsk['label'].map({'Real': 0, 'Fake': 1})  # Convert labels to 0 and 1
dsk = dsk[['Article', 'label']].dropna()


In [5]:
from sklearn.utils import resample
# Assuming 'dsk' is your DataFrame and you have a binary label column called 'label'
# Split the dataset into majority and minority classes
majority_class = dsk[dsk['label'] == 0]
minority_class = dsk[dsk['label'] == 1]

# Perform oversampling on the minority class # Sample with replacement  # Match majority size
###minority_oversampled = resample(minority_class,replace=True, n_samples=len(majority_class), random_state=42)  # For reproducibility
# Perform undersampling on the minority class
majority_undersampled = resample(majority_class,replace=True, n_samples=len(minority_class), random_state=42)  # For reproducibility

# Combine majority class with the oversampled minority class
###dskb = pd.concat([majority_class, minority_oversampled])
# Combine majority class with the undersampled minority class
dskb = pd.concat([minority_class, majority_undersampled])

# Shuffle the dataset
dskb = dskb.sample(frac=1, random_state=42).reset_index(drop=True)
print("Balanced class distribution:")
print(dskb['label'].value_counts())

Balanced class distribution:
label
0    50211
1    50211
Name: count, dtype: int64


In [6]:
# Tokenize and count word frequencies
all_words = ' '.join(dskb).split()
word_freq = Counter(all_words)

# Identify potential stopwords (e.g., words appearing very frequently)
potential_stopwords = [word for word, freq in word_freq.items() if freq > 1]

# Example stopwords list (refined manually)
kurdish_stopwords = [
     "ئێمە","ئێوە","ئەم","ئەو"
     ,"ئەوان","ئەوەی","بۆ","بێ","بێجگە","بە","بەبێ","بەدەم","بەردەم","بەرلە","بەرەوی","بەرەوە","بەلای","بەپێی","تۆ","تێ","جگە","دوای","دوو","دە"
     ,"دەکات","دەگەڵ","سەر","لێ","لە","لەبابەت","لەباتی","لەبارەی","لەبرێتی","لەبن","لەبەر","لەبەینی","لەدەم","لەرێ","لەرێگا","لەرەوی","لەسەر","لەلایەن"
     ,"لەناو","لەنێو","لەو","لەپێناوی","لەژێر","لەگەڵ","من","ناو","نێوان","هەر","هەروەها","و","وەک","پاش","پێ","پێش","چەند","کرد","کە","ی"

] + potential_stopwords

kupunctuation = {'!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
                 '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'}

# Convert set to list before concatenation
ku_stopwords = kurdish_stopwords + list(kupunctuation)

def rremove_stopwords(text, stop_words):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
    
# Apply the function to each article
dskb['Article'] = dskb['Article'].apply(lambda text: rremove_stopwords(text, ku_stopwords))
dskb

Unnamed: 0,Article,label
0,شکستی ڕاهێنەرە یان پێکهاتە؟ ئارسیناڵ لەم وەرزە...,0
1,پۆلیسی دهۆک جادوگەرێکی دەستگیرکرد بزانن چی کرد...,1
2,سەرکەوت شەمسەدین نەوەی نوێ ئێستا هێزێکی ناڕەسە...,1
3,وریابە تێنەکویت درۆی حیزب وسکیولارەکان,1
4,قبوڵە بەرای تۆ؟ جەعفەر شێخ مستەفا نزا دەکەم یە...,1
...,...,...
100417,گەنجێک هەڵەبجە کۆتایی ژیانی خۆی هێنا نیوەڕوی ئ...,1
100418,فراکسیۆنەکانی گۆڕان پەرلەمانی کوردستان ئەنجومە...,0
100419,دەیان هاوڵاتی ئێرانی لیستی موچەی شەهیداندان,0
100420,باشترین ڕێزلێنان پەرلەمانی کوردستان ڕێزلێنان ئ...,1


In [7]:
def wordpre(text):
    if not isinstance(text, str):
        return ""  # Return empty string for non-string inputs
    # Remove URLs, special characters, and numbers
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)      # Remove numbers
    
    # Remove extra spaces
    text = text.strip()
   
    return text

##  Applying the wordpre method to the dataset
dskb['Article']=dskb['Article'].apply(wordpre)
dskb

Unnamed: 0,Article,label
0,شکستی ڕاهێنەرە یان پێکهاتە ئارسیناڵ لەم وەرزەد...,0
1,پۆلیسی دهۆک جادوگەرێکی دەستگیرکرد بزانن چی کردووە,1
2,سەرکەوت شەمسەدین نەوەی نوێ ئێستا هێزێکی ناڕەسە...,1
3,وریابە تێنەکویت درۆی حیزب وسکیولارەکان,1
4,قبوڵە بەرای تۆ جەعفەر شێخ مستەفا نزا دەکەم یەز...,1
...,...,...
100417,گەنجێک هەڵەبجە کۆتایی ژیانی خۆی هێنا نیوەڕوی ئ...,1
100418,فراکسیۆنەکانی گۆڕان پەرلەمانی کوردستان ئەنجومە...,0
100419,دەیان هاوڵاتی ئێرانی لیستی موچەی شەهیداندان,0
100420,باشترین ڕێزلێنان پەرلەمانی کوردستان ڕێزلێنان ئ...,1


In [8]:
dskb['label1'] = "__label__" + dskb['label'].astype(str)

In [9]:
dskb['label_description'] = dskb['label1'].astype(str) + " " + dskb['Article'].astype(str)
dskb

Unnamed: 0,Article,label,label1,label_description
0,شکستی ڕاهێنەرە یان پێکهاتە ئارسیناڵ لەم وەرزەد...,0,__label__0,__label__0 شکستی ڕاهێنەرە یان پێکهاتە ئارسیناڵ...
1,پۆلیسی دهۆک جادوگەرێکی دەستگیرکرد بزانن چی کردووە,1,__label__1,__label__1 پۆلیسی دهۆک جادوگەرێکی دەستگیرکرد ب...
2,سەرکەوت شەمسەدین نەوەی نوێ ئێستا هێزێکی ناڕەسە...,1,__label__1,__label__1 سەرکەوت شەمسەدین نەوەی نوێ ئێستا هێ...
3,وریابە تێنەکویت درۆی حیزب وسکیولارەکان,1,__label__1,__label__1 وریابە تێنەکویت درۆی حیزب وسکیولارەکان
4,قبوڵە بەرای تۆ جەعفەر شێخ مستەفا نزا دەکەم یەز...,1,__label__1,__label__1 قبوڵە بەرای تۆ جەعفەر شێخ مستەفا نز...
...,...,...,...,...
100417,گەنجێک هەڵەبجە کۆتایی ژیانی خۆی هێنا نیوەڕوی ئ...,1,__label__1,__label__1 گەنجێک هەڵەبجە کۆتایی ژیانی خۆی هێن...
100418,فراکسیۆنەکانی گۆڕان پەرلەمانی کوردستان ئەنجومە...,0,__label__0,__label__0 فراکسیۆنەکانی گۆڕان پەرلەمانی کوردس...
100419,دەیان هاوڵاتی ئێرانی لیستی موچەی شەهیداندان,0,__label__0,__label__0 دەیان هاوڵاتی ئێرانی لیستی موچەی شە...
100420,باشترین ڕێزلێنان پەرلەمانی کوردستان ڕێزلێنان ئ...,1,__label__1,__label__1 باشترین ڕێزلێنان پەرلەمانی کوردستان...


In [10]:
train, test = train_test_split(dskb, test_size = 0.2)

In [11]:
train.to_csv("fake_news_train.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')
test.to_csv("fake_news_test.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')

In [12]:
# Train FastText Model
fasttext_model = fasttext.train_supervised(input="fake_news_train.txt", lr=0.5, epoch=25, wordNgrams=2, dim=300)
y_predic = fasttext_model.test("fake_news_test.txt")

In [13]:
N = y_predic[0]
P = y_predic[1]
R = y_predic[2]

print(f"No. of Test: {N:}")
print(f"Precision: {P:.6f}")
print(f"Recall: {R:.6f}")

print(f"F1-Score: {2*((P*R)/(P+R)):.6f}")


No. of Test: 20085
Precision: 0.868559
Recall: 0.868559
F1-Score: 0.868559


In [14]:
# Function to get FastText vector safely
def fasttext_vector(text):
    if isinstance(text, str):  # Ensure text is a string
        text = text.replace("\n", " ").strip()  # Remove newlines
        return fasttext_model.get_sentence_vector(text)
    return np.zeros(300)  # Return zero vector for empty/non-string values

# Apply FastText vectors to dataset
X_fasttext = np.array([fasttext_vector(text) for text in dskb['Article']])
# Function to get FastText vector
y = np.array(dskb['label'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_fasttext, y, test_size=0.2, random_state=42)

In [15]:
# Tokenization
max_words = 10000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(dskb['Article'])
X_sequences = tokenizer.texts_to_sequences(dskb['Article'])
X_padded = pad_sequences(X_sequences, maxlen=max_len)


In [16]:
# Embedding Matrix from FastText
embedding_matrix = np.zeros((max_words, 300))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_matrix[i] = fasttext_model.get_word_vector(word)


In [17]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





In [18]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m429s[0m 338ms/step - accuracy: 0.8176 - loss: 0.4225 - val_accuracy: 0.8548 - val_loss: 0.3427
Epoch 2/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m416s[0m 331ms/step - accuracy: 0.8759 - loss: 0.3080 - val_accuracy: 0.8600 - val_loss: 0.3447
Epoch 3/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 330ms/step - accuracy: 0.9023 - loss: 0.2511 - val_accuracy: 0.8635 - val_loss: 0.3542
Epoch 4/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 326ms/step - accuracy: 0.9221 - loss: 0.2006 - val_accuracy: 0.8656 - val_loss: 0.3885
Epoch 5/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 331ms/step - accuracy: 0.9374 - loss: 0.1634 - val_accuracy: 0.8631 - val_loss: 0.4254
[1m3139/3139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 55ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.9383, Precision: 0.9520, Recall: 0.9232, F1-

In [17]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    #LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [18]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m279s[0m 219ms/step - accuracy: 0.8239 - loss: 0.4101 - val_accuracy: 0.8547 - val_loss: 0.3432
Epoch 2/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 218ms/step - accuracy: 0.8754 - loss: 0.3030 - val_accuracy: 0.8615 - val_loss: 0.3358
Epoch 3/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 219ms/step - accuracy: 0.9034 - loss: 0.2394 - val_accuracy: 0.8623 - val_loss: 0.3550
Epoch 4/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 212ms/step - accuracy: 0.9244 - loss: 0.1894 - val_accuracy: 0.8648 - val_loss: 0.3927
Epoch 5/5
[1m1256/1256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 216ms/step - accuracy: 0.9428 - loss: 0.1459 - val_accuracy: 0.8635 - val_loss: 0.4471
[1m3139/3139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 39ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.9425, Precision: 0.9547, Recall: 0.9291, F1-