In [1]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss



In [2]:
import pandas as pd

# Load the files into pandas DataFrames
file_paths = [
    "/kaggle/input/banfakenews/Authentic-48K.csv",
    "/kaggle/input/banfakenews/Fake-1K.csv",
    "/kaggle/input/banfakenews/LabeledAuthentic-7K.csv",
    "/kaggle/input/banfakenews/LabeledFake-1K.csv"
]


dataframes = {file_path: pd.read_csv(file_path) for file_path in file_paths}
first_rows = {file_path: df.head() for file_path, df in dataframes.items()}
first_rows


{'/kaggle/input/banfakenews/Authentic-48K.csv':    articleID          domain                 date   category  \
 0          1  jagonews24.com  2018-09-19 17:48:18  Education   
 1          2  jagonews24.com  2018-09-19 17:48:19   National   
 2          3  jagonews24.com  2018-09-19 17:48:20   National   
 3          4  jagonews24.com  2018-09-19 17:48:21      Crime   
 4          5  jagonews24.com  2018-09-19 17:48:21   National   
 
                                             headline  \
 0   হট্টগোল করায় বাকৃবিতে দুইজন বরখাস্ত, ৬ জনকে শোকজ   
 1    মালয়েশিয়ায় কর্মী পাঠানোর ব্যবস্থা নেয়ার সুপারিশ   
 2  প্রেমের প্রস্তাবে রাজি না হওয়ায় স্কুলছাত্রীকে ...   
 3  মেডিয়েশনই মামলাজট নিরসনের পথ : বিচারপতি আহমেদ ...   
 4         টকশোতে বক্তব্য দিতে গিয়ে জাপা নেতার মৃত্যু   
 
                                              content  label  
 0  গত ১৭ সেপ্টেম্বর বাংলাদেশ কৃষি বিশ্ববিদ্যালয়ে ...      1  
 1  বাংলাদেশের বৃহৎ শ্রমবাজার মালয়েশিয়ায় আবার শ্রম...      1  
 2  নরসিংদীর মনোহরদীতে প্রেম

# With 0.05 feature extraction significance level.


# Logistic Regression with 0.05 feature extraction

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

authentic_df = pd.concat([dataframes["/kaggle/input/banfakenews/Authentic-48K.csv"], dataframes["/kaggle/input/banfakenews/LabeledAuthentic-7K.csv"]])
fake_df = pd.concat([dataframes["/kaggle/input/banfakenews/Fake-1K.csv"], dataframes["/kaggle/input/banfakenews/LabeledFake-1K.csv"]])

authentic_df['label'] = 1
fake_df['label'] = 0

combined_df = pd.concat([authentic_df, fake_df])

X = combined_df['content'].fillna("") 
y = combined_df['label']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.05)  # Applying the 0.05 feature extraction threshold
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

log_reg_model = LogisticRegression(random_state=42)
log_reg_model.fit(X_train_tfidf, y_train)

predictions = log_reg_model.predict(X_test_tfidf)
evaluation_report = classification_report(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

In [4]:
predictions = log_reg_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, predictions))
print("Accuracy:", accuracy_score(y_test, predictions))
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, predictions))
print("Hamming Loss:", hamming_loss(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.41      0.56       536
           1       0.97      1.00      0.98     11160

    accuracy                           0.97     11696
   macro avg       0.91      0.70      0.77     11696
weighted avg       0.97      0.97      0.96     11696

Accuracy: 0.9699042407660738
Precision: 0.9724602203182374
Recall: 0.9966845878136201
Matthews Correlation Coefficient: 0.5823053755300048
Hamming Loss: 0.030095759233926128


# Random Forest Classifier with 0.05 feature extraction

In [5]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [6]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

rf_predictions = rf_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, rf_predictions))
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Precision:", precision_score(y_test, rf_predictions))
print("Recall:", recall_score(y_test, rf_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, rf_predictions))
print("Hamming Loss:", hamming_loss(y_test, rf_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.83      0.91       536
           1       0.99      1.00      1.00     11160

    accuracy                           0.99     11696
   macro avg       1.00      0.92      0.95     11696
weighted avg       0.99      0.99      0.99     11696

Accuracy: 0.992219562243502
Precision: 0.9919118300595503
Recall: 1.0
Matthews Correlation Coefficient: 0.9074739052788937
Hamming Loss: 0.007780437756497948


In [7]:
from sklearn.svm import SVC
svm_model = SVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)


In [8]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

svm_predictions = svm_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, svm_predictions))
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print("Precision:", precision_score(y_test, svm_predictions))
print("Recall:", recall_score(y_test, svm_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, svm_predictions))
print("Hamming Loss:", hamming_loss(y_test, svm_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.73      0.83       536
           1       0.99      1.00      0.99     11160

    accuracy                           0.99     11696
   macro avg       0.97      0.87      0.91     11696
weighted avg       0.99      0.99      0.99     11696

Accuracy: 0.9860636114911081
Precision: 0.9873260657626518
Recall: 0.9982078853046595
Matthews Correlation Coefficient: 0.8286900565778993
Hamming Loss: 0.01393638850889193


In [9]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_tfidf, y_train)

In [10]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

xgb_predictions = xgb_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, xgb_predictions))
print("Accuracy:", accuracy_score(y_test, xgb_predictions))
print("Precision:", precision_score(y_test, xgb_predictions))
print("Recall:", recall_score(y_test, xgb_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, xgb_predictions))
print("Hamming Loss:", hamming_loss(y_test, xgb_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.86      0.91       536
           1       0.99      1.00      1.00     11160

    accuracy                           0.99     11696
   macro avg       0.98      0.93      0.95     11696
weighted avg       0.99      0.99      0.99     11696

Accuracy: 0.9920485636114911
Precision: 0.9934016941596077
Recall: 0.9982974910394266
Matthews Correlation Coefficient: 0.9058594761941927
Hamming Loss: 0.007951436388508893


In [11]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, y_train)

In [12]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

dt_predictions = dt_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, dt_predictions))
print("Accuracy:", accuracy_score(y_test, dt_predictions))
print("Precision:", precision_score(y_test, dt_predictions))
print("Recall:", recall_score(y_test, dt_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, dt_predictions))
print("Hamming Loss:", hamming_loss(y_test, dt_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.87      0.79       536
           1       0.99      0.98      0.99     11160

    accuracy                           0.98     11696
   macro avg       0.86      0.93      0.89     11696
weighted avg       0.98      0.98      0.98     11696

Accuracy: 0.9792236662106704
Precision: 0.9938478241201484
Recall: 0.9843189964157706
Matthews Correlation Coefficient: 0.7866461280006288
Hamming Loss: 0.020776333789329686


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_tfidf, y_train)


In [14]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

gb_predictions = gb_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, gb_predictions))
print("Accuracy:", accuracy_score(y_test, gb_predictions))
print("Precision:", precision_score(y_test, gb_predictions))
print("Recall:", recall_score(y_test, gb_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, gb_predictions))
print("Hamming Loss:", hamming_loss(y_test, gb_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.30      0.44       536
           1       0.97      1.00      0.98     11160

    accuracy                           0.96     11696
   macro avg       0.89      0.65      0.71     11696
weighted avg       0.96      0.96      0.96     11696

Accuracy: 0.9645177838577291
Precision: 0.9672958162999044
Recall: 0.996505376344086
Matthews Correlation Coefficient: 0.47702001887280593
Hamming Loss: 0.03548221614227086


In [15]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_tfidf, y_train)
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

knn_predictions = knn_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, knn_predictions))
print("Accuracy:", accuracy_score(y_test, knn_predictions))
print("Precision:", precision_score(y_test, knn_predictions))
print("Recall:", recall_score(y_test, knn_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, knn_predictions))
print("Hamming Loss:", hamming_loss(y_test, knn_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.39      0.54       536
           1       0.97      1.00      0.98     11160

    accuracy                           0.97     11696
   macro avg       0.91      0.70      0.76     11696
weighted avg       0.97      0.97      0.96     11696

Accuracy: 0.9688782489740082
Precision: 0.9716058011532414
Recall: 0.996505376344086
Matthews Correlation Coefficient: 0.5641085589199772
Hamming Loss: 0.03112175102599179


In [16]:
from sklearn.ensemble import AdaBoostClassifier
adaboost_model = AdaBoostClassifier(random_state=42)
adaboost_model.fit(X_train_tfidf, y_train)
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

adaboost_predictions = adaboost_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, adaboost_predictions))
print("Accuracy:", accuracy_score(y_test, adaboost_predictions))
print("Precision:", precision_score(y_test, adaboost_predictions))
print("Recall:", recall_score(y_test, adaboost_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, adaboost_predictions))
print("Hamming Loss:", hamming_loss(y_test, adaboost_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.35      0.46       536
           1       0.97      0.99      0.98     11160

    accuracy                           0.96     11696
   macro avg       0.81      0.67      0.72     11696
weighted avg       0.95      0.96      0.96     11696

Accuracy: 0.9614398084815321
Precision: 0.9695694115583618
Recall: 0.9906810035842294
Matthews Correlation Coefficient: 0.4593418304903668
Hamming Loss: 0.03856019151846785


In [17]:
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier(random_state=42)
sgd_model.fit(X_train_tfidf, y_train)
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

sgd_predictions = sgd_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, sgd_predictions))
print("Accuracy:", accuracy_score(y_test, sgd_predictions))
print("Precision:", precision_score(y_test, sgd_predictions))
print("Recall:", recall_score(y_test, sgd_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, sgd_predictions))
print("Hamming Loss:", hamming_loss(y_test, sgd_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.22      0.36       536
           1       0.96      1.00      0.98     11160

    accuracy                           0.96     11696
   macro avg       0.96      0.61      0.67     11696
weighted avg       0.96      0.96      0.95     11696

Accuracy: 0.9638337893296853
Precision: 0.9639616282084522
Recall: 0.9994623655913979
Matthews Correlation Coefficient: 0.4504050399840903
Hamming Loss: 0.036166210670314634


In [18]:
from sklearn.svm import LinearSVC
linear_svc_model = LinearSVC(random_state=42)
linear_svc_model.fit(X_train_tfidf, y_train)
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, matthews_corrcoef, hamming_loss

linear_svc_predictions = linear_svc_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, linear_svc_predictions))
print("Accuracy:", accuracy_score(y_test, linear_svc_predictions))
print("Precision:", precision_score(y_test, linear_svc_predictions))
print("Recall:", recall_score(y_test, linear_svc_predictions))
print("Matthews Correlation Coefficient:", matthews_corrcoef(y_test, linear_svc_predictions))
print("Hamming Loss:", hamming_loss(y_test, linear_svc_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.46      0.59       536
           1       0.97      1.00      0.98     11160

    accuracy                           0.97     11696
   macro avg       0.90      0.73      0.79     11696
weighted avg       0.97      0.97      0.97     11696

Accuracy: 0.9706737346101231
Precision: 0.9744714448635845
Recall: 0.9953405017921146
Matthews Correlation Coefficient: 0.6013874172760255
Hamming Loss: 0.02932626538987688
