## Import libraries and list the input files in the Kaggle notebook

In [None]:
# Linear algebra
import numpy as np

# Data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import subprocess
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

# Download and unzip wordnet
# NOTE(review): NLTK resource names are normally of the form 'corpora/wordnet';
# confirm whether 'wordnet.zip' can ever be found, otherwise the download below
# runs on every execution of this cell.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # Only a missing resource should trigger the download; a bare `except`
    # would also swallow KeyboardInterrupt and unrelated failures.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # Explicit argument list: no shell involved and no reliance on str.split().
    command = [
        "unzip",
        "/kaggle/working/corpora/wordnet.zip",
        "-d",
        "/kaggle/working/corpora",
    ]
    subprocess.run(command)
    # Register the download directory once, even if this cell is re-run.
    if '/kaggle/working/' not in nltk.data.path:
        nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

import tensorflow as tf
print("The TensorFlow version is: ", tf.__version__)
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Bidirectional
from tensorflow.keras.layers import Dense, Dropout

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Observe input folders in Kaggle notebook
import os
# Walk the Kaggle input tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

### Read WELFake_Dataset.csv from kaggle/input

In [None]:
# Load the WELFake dataset from the Kaggle input directory into a DataFrame
df = pd.read_csv(
    "/kaggle/input/fake-news-classification/WELFake_Dataset.csv"
)
# Preview the first 5 rows
df.head(n=5)

### EDA (Exploratory Data Analysis)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

### Missing data analysis

In [None]:
# Missing data analysis: count the null values in each column
df.isnull().sum()

In [None]:
# Remove every row that has at least one missing value
df = df.dropna(axis=0, how='any')

# Re-run the missing data analysis on the cleaned DataFrame
df.isna().sum()

### Drop unused columns

In [None]:
# Remove the unused 'Unnamed: 0' column in place, then preview the result
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head(n=5)

### Class distribution

In [None]:
# Class distribution, drawn as a pie chart annotated with percentages
# 0 - Fake, 1 - Real
label_counts = df['label'].value_counts()
label_counts.plot.pie(autopct='%.2f')

**The pie chart shows that the two classes are roughly balanced.**

### Define X and y variables

In [None]:
# Target is the 'label' column; features are every remaining column
y = df['label']
X = df.drop('label', axis=1)

In [None]:
# Vocab_size = number of unique words in our corpus (entire document)
# NOTE(review): not referenced by any cell visible here; confirm it is still needed
vocab_size = 10000

In [None]:
# Work on a copy so X itself stays untouched. dropna() left gaps in the index
# labels, so the rows are renumbered 0..n-1; label-based lookups such as
# messages['title'][i] would otherwise fail on the missing labels.
messages = X.copy().reset_index()

### Lemmatization

In [None]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords
lm = WordNetLemmatizer()

# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain list of
# English stop words; the name is kept so existing uses keep working.
stopwords = stopwords.words('english')
# Set copy of the list for O(1) membership tests inside the loop.
stopword_set = set(stopwords)

# Matches any character that is NOT a letter or digit. The brackets are
# required: without them '^a-zA-Z0-9' only matches that literal text at the
# start of a string, so no punctuation is ever removed.
non_alnum = re.compile(r'[^a-zA-Z0-9]')

# Build the cleaned corpus: one lower-cased, stop-word-free, lemmatized string
# per title, in the same row order as `messages`.
corpus = []
for title in messages['title']:
    tokens = non_alnum.sub(' ', title).lower().split()
    review = [lm.lemmatize(x) for x in tokens if x not in stopword_set]
    corpus.append(" ".join(review))

In [None]:
# Longest cleaned title, measured in whitespace-separated tokens
token_counts = [len(sentence.split()) for sentence in corpus]
max_length = max(token_counts)

print("Maximum sentence length:", max_length)

In [None]:
# Number of cleaned titles collected in the corpus
len(corpus)

### TF-IDF Vectorization

In [None]:
# The vectorizer gets its own name: `tf` is the TensorFlow alias imported at
# the top of the notebook and must not be overwritten.
tfidf_vectorizer = TfidfVectorizer()
# Fit on the cleaned titles and build the dense document-term matrix
# (one row per corpus entry).
# NOTE(review): .toarray() materialises the whole matrix in memory; confirm it
# fits for the full dataset, or keep the sparse result instead.
x = tfidf_vectorizer.fit_transform(corpus).toarray()

In [None]:
# Target labels. x was built row by row from the same df (via messages and
# corpus), so x and y line up by position even though y keeps df's index labels.
y = df["label"]
y.head()

### Splitting into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest with the same value used for the train/test split so the
# fitted model, and every metric derived from it, is reproducible across runs.
# n_jobs=-1 builds the trees on all available cores without changing the result.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

### Performance Metrics

In [None]:
# Predict the class of every row in the held-out test set
y_pred=rf.predict(X_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Build the confusion matrix from the true and predicted test labels
cm = confusion_matrix(y_test, y_pred)

# Visualize the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    square=True,
    ax=ax,
)
ax.set_xlabel('Predicted Class')
ax.set_ylabel('Real Class')
plt.show()

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix computed from the current predictions rather than numbers
# copied from an earlier run, so the metrics always describe this model.
# labels=[0, 1] fixes the 2x2 layout: rows are true classes, columns predicted.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Individual cell counts, with class 1 treated as the positive class
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()

# Precision, recall, F1-score and accuracy for the positive class (label 1)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')