In [1]:
# Data loading and preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from the local CSV, keep only the two columns used below,
# and discard any row where either of them is missing.
df = (
    pd.read_csv('WELFake_Dataset.csv')
    .loc[:, ['text', 'label']]
    .dropna()
)

# Hold out 20% of the rows as the test set; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [3]:
# Text vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Two bag-of-words encoders, both configured to drop English stop words.
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Raw term counts: the vectorizer is fitted on the training split only, then the
# already-fitted vectorizer encodes the test split (tuple items run left to right).
X_train_counts, X_test_counts = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)

# TF-IDF weights: same fit-on-train / transform-on-test pattern.
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [4]:
# Train and evaluate classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Logistic regression on raw term counts.
# max_iter=1000 lets the solver run for up to 1000 iterations; fit() returns the
# fitted estimator, so construction and training are chained.
log_reg_count = LogisticRegression(max_iter=1000).fit(X_train_counts, y_train)

# Predict on the held-out split and report accuracy plus per-class metrics.
y_pred_count_lr = log_reg_count.predict(X_test_counts)
print("Using CountVectorizer and LogisticRegression")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_count_lr))
print(classification_report(y_true=y_test, y_pred=y_pred_count_lr))

Using CountVectorizer and LogisticRegression
Accuracy: 0.9525625910257299
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      7010
           1       0.95      0.96      0.95      7409

    accuracy                           0.95     14419
   macro avg       0.95      0.95      0.95     14419
weighted avg       0.95      0.95      0.95     14419



In [6]:
# Logistic regression on TF-IDF features.
# max_iter=1000 lets the solver run for up to 1000 iterations; fit() returns the
# fitted estimator, so construction and training are chained.
log_reg_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on the held-out split and report accuracy plus per-class metrics.
y_pred_tfidf_lr = log_reg_tfidf.predict(X_test_tfidf)
print(" UsingTfidfVectorizer and LogisticRegression")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_tfidf_lr))
print(classification_report(y_true=y_test, y_pred=y_pred_tfidf_lr))

 UsingTfidfVectorizer and LogisticRegression
Accuracy: 0.9401484152853873
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      7010
           1       0.94      0.95      0.94      7409

    accuracy                           0.94     14419
   macro avg       0.94      0.94      0.94     14419
weighted avg       0.94      0.94      0.94     14419



In [7]:
# Using CountVectorizer and GradientBoostingClassifier
# random_state is fixed (same seed as the train/test split) because gradient
# boosting permutes the features at every split; without a seed, ties between
# equally good splits can resolve differently and the metrics may drift between
# runs. Re-run this cell to refresh any previously recorded output.
gb_count = GradientBoostingClassifier(random_state=42)
gb_count.fit(X_train_counts, y_train)

# Predict on the held-out split and report accuracy plus per-class metrics.
y_pred_count_gb = gb_count.predict(X_test_counts)
print(" Using CountVectorizer and GradientBoostingClassifier")
print("Accuracy:", accuracy_score(y_test, y_pred_count_gb))
print(classification_report(y_test, y_pred_count_gb))

 Using CountVectorizer and GradientBoostingClassifier
Accuracy: 0.9185102989111589
              precision    recall  f1-score   support

           0       0.95      0.88      0.91      7010
           1       0.89      0.95      0.92      7409

    accuracy                           0.92     14419
   macro avg       0.92      0.92      0.92     14419
weighted avg       0.92      0.92      0.92     14419



In [8]:
# Using TfidfVectorizer and GradientBoostingClassifier
# random_state is fixed (same seed as the train/test split) because gradient
# boosting permutes the features at every split; without a seed, ties between
# equally good splits can resolve differently and the metrics may drift between
# runs. Re-run this cell to refresh any previously recorded output.
gb_tfidf = GradientBoostingClassifier(random_state=42)
gb_tfidf.fit(X_train_tfidf, y_train)

# Predict on the held-out split and report accuracy plus per-class metrics.
y_pred_tfidf_gb = gb_tfidf.predict(X_test_tfidf)
print(" Using TfidfVectorizer and GradientBoostingClassifier")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf_gb))
print(classification_report(y_test, y_pred_tfidf_gb))

 Using TfidfVectorizer and GradientBoostingClassifier
Accuracy: 0.9194812400305153
              precision    recall  f1-score   support

           0       0.95      0.88      0.91      7010
           1       0.90      0.95      0.92      7409

    accuracy                           0.92     14419
   macro avg       0.92      0.92      0.92     14419
weighted avg       0.92      0.92      0.92     14419

