In [1]:
import fasttext
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string
from collections import Counter

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load dataset
dsk = pd.read_excel('/kaggle/input/kurdishkdfnd/KDFND_Anlyzed_Cleaned_Filtered_Labeld.xlsx')


In [4]:
dsk = dsk.dropna(subset=['Text'])
dsk["Article"] = dsk["Text"]
dsk['label'] = dsk['label'].map({'Real': 0, 'Fake': 1})  # Convert labels to 0 and 1
dsk = dsk[['Article', 'label']].dropna()


In [5]:
from sklearn.utils import resample
# Assuming 'dsk' is your DataFrame and you have a binary label column called 'label'
# Split the dataset into majority and minority classes
majority_class = dsk[dsk['label'] == 0]
minority_class = dsk[dsk['label'] == 1]

# Perform oversampling on the minority class # Sample with replacement  # Match majority size
###minority_oversampled = resample(minority_class,replace=True, n_samples=len(majority_class), random_state=42)  # For reproducibility
# Perform undersampling on the minority class
majority_undersampled = resample(majority_class,replace=True, n_samples=len(minority_class), random_state=42)  # For reproducibility

# Combine majority class with the oversampled minority class
###dskb = pd.concat([majority_class, minority_oversampled])
# Combine majority class with the undersampled minority class
#dskb = pd.concat([minority_class, majority_undersampled])

# Shuffle the dataset
#dskb = dskb.sample(frac=1, random_state=42).reset_index(drop=True)
#print("Balanced class distribution:")
#print(dskb['label'].value_counts())

In [8]:
# Tokenize and count word frequencies
all_words = ' '.join(dsk).split()
word_freq = Counter(all_words)

# Identify potential stopwords (e.g., words appearing very frequently)
potential_stopwords = [word for word, freq in word_freq.items() if freq > 1]

# Example stopwords list (refined manually)
kurdish_stopwords = [
     "ئێمە","ئێوە","ئەم","ئەو"
     ,"ئەوان","ئەوەی","بۆ","بێ","بێجگە","بە","بەبێ","بەدەم","بەردەم","بەرلە","بەرەوی","بەرەوە","بەلای","بەپێی","تۆ","تێ","جگە","دوای","دوو","دە"
     ,"دەکات","دەگەڵ","سەر","لێ","لە","لەبابەت","لەباتی","لەبارەی","لەبرێتی","لەبن","لەبەر","لەبەینی","لەدەم","لەرێ","لەرێگا","لەرەوی","لەسەر","لەلایەن"
     ,"لەناو","لەنێو","لەو","لەپێناوی","لەژێر","لەگەڵ","من","ناو","نێوان","هەر","هەروەها","و","وەک","پاش","پێ","پێش","چەند","کرد","کە","ی"

] + potential_stopwords

kupunctuation = {'!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
                 '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'}

# Convert set to list before concatenation
ku_stopwords = kurdish_stopwords + list(kupunctuation)

def rremove_stopwords(text, stop_words):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
    
# Apply the function to each article
dsk['Article'] = dsk['Article'].apply(lambda text: rremove_stopwords(text, ku_stopwords))
dsk

Unnamed: 0,Article,label
0,ئەرشیف هاونیشتمانی بەیەکەوە پاسپۆرتەکانی عێراق...,1
1,هێشتا هەرجوانە دڵێکی بۆدانێن,1
2,نەخۆشخانەی ڕانییە قەرەبالغیەکی یەکجار زۆر هەی،...,1
3,ئێستا ڕانیە,1
4,هاوڕی باشەکان گرنگی یەکتر دەدەن هاوڕێ نزیکەکان...,1
...,...,...
100957,کەرکووک؛ پیاوێکی 52 ساڵ گوشاری هاوژینەکەیدا ما...,0
100958,تەقینەوەیەک ناوچەی سەوزی بەغدا ڕوویدا,0
100959,باسیان لەچی کرد؟ زانیاری ورد بخوێنەوە پاپاوە س...,0
100960,ئێران گیانلەدەستدانی خۆپیشاندەرێکی ڕاگەیاند,0


In [9]:
def wordpre(text):
    if not isinstance(text, str):
        return ""  # Return empty string for non-string inputs
    # Remove URLs, special characters, and numbers
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)      # Remove numbers
    
    # Remove extra spaces
    text = text.strip()
   
    return text

##  Applying the wordpre method to the dataset
dsk['Article']=dsk['Article'].apply(wordpre)
dsk

Unnamed: 0,Article,label
0,ئەرشیف هاونیشتمانی بەیەکەوە پاسپۆرتەکانی عێراق...,1
1,هێشتا هەرجوانە دڵێکی بۆدانێن,1
2,نەخۆشخانەی ڕانییە قەرەبالغیەکی یەکجار زۆر هەیب...,1
3,ئێستا ڕانیە,1
4,هاوڕی باشەکان گرنگی یەکتر دەدەن هاوڕێ نزیکەکان...,1
...,...,...
100957,کەرکووک پیاوێکی ساڵ گوشاری هاوژینەکەیدا ماڵ د...,0
100958,تەقینەوەیەک ناوچەی سەوزی بەغدا ڕوویدا,0
100959,باسیان لەچی کرد زانیاری ورد بخوێنەوە پاپاوە سە...,0
100960,ئێران گیانلەدەستدانی خۆپیشاندەرێکی ڕاگەیاند,0


In [10]:
dsk['label1'] = "__label__" + dsk['label'].astype(str)

In [11]:
dsk['label_description'] = dsk['label1'].astype(str) + " " + dsk['Article'].astype(str)
dsk

Unnamed: 0,Article,label,label1,label_description
0,ئەرشیف هاونیشتمانی بەیەکەوە پاسپۆرتەکانی عێراق...,1,__label__1,__label__1 ئەرشیف هاونیشتمانی بەیەکەوە پاسپۆرت...
1,هێشتا هەرجوانە دڵێکی بۆدانێن,1,__label__1,__label__1 هێشتا هەرجوانە دڵێکی بۆدانێن
2,نەخۆشخانەی ڕانییە قەرەبالغیەکی یەکجار زۆر هەیب...,1,__label__1,__label__1 نەخۆشخانەی ڕانییە قەرەبالغیەکی یەکج...
3,ئێستا ڕانیە,1,__label__1,__label__1 ئێستا ڕانیە
4,هاوڕی باشەکان گرنگی یەکتر دەدەن هاوڕێ نزیکەکان...,1,__label__1,__label__1 هاوڕی باشەکان گرنگی یەکتر دەدەن هاو...
...,...,...,...,...
100957,کەرکووک پیاوێکی ساڵ گوشاری هاوژینەکەیدا ماڵ د...,0,__label__0,__label__0 کەرکووک پیاوێکی ساڵ گوشاری هاوژینە...
100958,تەقینەوەیەک ناوچەی سەوزی بەغدا ڕوویدا,0,__label__0,__label__0 تەقینەوەیەک ناوچەی سەوزی بەغدا ڕوویدا
100959,باسیان لەچی کرد زانیاری ورد بخوێنەوە پاپاوە سە...,0,__label__0,__label__0 باسیان لەچی کرد زانیاری ورد بخوێنەو...
100960,ئێران گیانلەدەستدانی خۆپیشاندەرێکی ڕاگەیاند,0,__label__0,__label__0 ئێران گیانلەدەستدانی خۆپیشاندەرێکی ...


In [12]:
train, test = train_test_split(dsk, test_size = 0.2)

In [13]:
train.to_csv("fake_news_train.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')
test.to_csv("fake_news_test.txt", columns = ['label_description'], index=False, sep=' ', header=False,
    quoting=3, escapechar=' ', mode='w')

In [14]:
# Train FastText Model
fasttext_model = fasttext.train_supervised(input="fake_news_train.txt", lr=0.5, epoch=25, wordNgrams=2, dim=300)
y_predic = fasttext_model.test("fake_news_test.txt")

In [15]:
N = y_predic[0]
P = y_predic[1]
R = y_predic[2]

print(f"No. of Test: {N:}")
print(f"Precision: {P:.6f}")
print(f"Recall: {R:.6f}")

print(f"F1-Score: {2*((P*R)/(P+R)):.6f}")


No. of Test: 20193
Precision: 0.802060
Recall: 0.802060
F1-Score: 0.802060


In [16]:
# Function to get FastText vector safely
def fasttext_vector(text):
    if isinstance(text, str):  # Ensure text is a string
        text = text.replace("\n", " ").strip()  # Remove newlines
        return fasttext_model.get_sentence_vector(text)
    return np.zeros(300)  # Return zero vector for empty/non-string values

# Apply FastText vectors to dataset
X_fasttext = np.array([fasttext_vector(text) for text in dsk['Article']])
# Function to get FastText vector
y = np.array(dsk['label'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_fasttext, y, test_size=0.2, random_state=42)

In [17]:
# Tokenization
max_words = 10000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(dsk['Article'])
X_sequences = tokenizer.texts_to_sequences(dsk['Article'])
X_padded = pad_sequences(X_sequences, maxlen=max_len)


In [18]:
# Embedding Matrix from FastText
embedding_matrix = np.zeros((max_words, 300))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_matrix[i] = fasttext_model.get_word_vector(word)


In [22]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





In [23]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 442ms/step - accuracy: 0.8185 - loss: 0.4216 - val_accuracy: 0.7963 - val_loss: 0.4648
Epoch 2/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m552s[0m 437ms/step - accuracy: 0.8633 - loss: 0.3301 - val_accuracy: 0.7674 - val_loss: 0.5823
Epoch 3/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m549s[0m 435ms/step - accuracy: 0.8826 - loss: 0.2842 - val_accuracy: 0.7761 - val_loss: 0.5189
Epoch 4/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m548s[0m 434ms/step - accuracy: 0.9023 - loss: 0.2376 - val_accuracy: 0.7787 - val_loss: 0.5993
Epoch 5/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m543s[0m 430ms/step - accuracy: 0.9220 - loss: 0.1936 - val_accuracy: 0.7821 - val_loss: 0.6207
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 79ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.9092, Precision: 0.8810, Recall: 0.9450, F1-

In [19]:
# LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=300, weights=[embedding_matrix], input_length=max_len, trainable=True),
    LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    #LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [20]:
# Train Model
model.fit(X_padded, y, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_padded) > 0.5).astype(int)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"Hybrid FastText-LSTM Model → Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

Epoch 1/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 204ms/step - accuracy: 0.8167 - loss: 0.4154 - val_accuracy: 0.7673 - val_loss: 0.5132
Epoch 2/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 202ms/step - accuracy: 0.8670 - loss: 0.3202 - val_accuracy: 0.7961 - val_loss: 0.4622
Epoch 3/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 200ms/step - accuracy: 0.8876 - loss: 0.2699 - val_accuracy: 0.7931 - val_loss: 0.5399
Epoch 4/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 199ms/step - accuracy: 0.9055 - loss: 0.2249 - val_accuracy: 0.7280 - val_loss: 0.7752
Epoch 5/5
[1m1263/1263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 201ms/step - accuracy: 0.9260 - loss: 0.1780 - val_accuracy: 0.7808 - val_loss: 0.7903
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 38ms/step
Hybrid FastText-LSTM Model → Accuracy: 0.9142, Precision: 0.8868, Recall: 0.9485, F1-