# Part 4: Validation and cross-domain performance

In [13]:
import numpy as np
from scipy.sparse import csr_matrix, hstack, save_npz

# Pre-computed feature/label archives used by the evaluation cells below.
# np.load on an .npz returns a lazy archive: each `archive[key]` access reads
# that member from disk, so the handles are intentionally left open here.
# NOTE(security): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code -- only load archives that were produced by this project.
simple_BOW = np.load('../Simple_995_BBC.npz', allow_pickle=True)
simple_ONEHOT = np.load('../Simple_995.npz', allow_pickle=True)
advanced = np.load("../advanced_features.npz", allow_pickle=True)
liar_TFIDF = np.load("../LIAR_TFIDF.npz", allow_pickle=True)
liar_BOW = np.load("../LIAR_BOW.npz", allow_pickle=True)
liar_ONEHOT = np.load("../LIAR_ONEHOT.npz", allow_pickle=True)

# Read the LIAR test labels once (every archive lookup re-reads the member)
# and flatten them to 1-D.
liar_labels = liar_ONEHOT['Y_test'].ravel()
assert liar_labels.size > 0, "LIAR test labels are empty"

# sum(labels) / len(labels) is the arithmetic mean; the vectorized form avoids
# iterating over the numpy array element by element in Python.
# Presumably the labels are 0/1, in which case this is the share of class 1
# and serves as a majority-class baseline for the LIAR results -- TODO confirm.
ones_ratio = liar_labels.mean()
print("Ratio of ones:", ones_ratio)

Ratio of ones: 0.3553081355601133


# Simple and advanced models

In [9]:
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
import pickle
import pandas as pd


def _report_binary_metrics(title, y_true, y_pred):
    """Print a labelled confusion matrix and binary metrics for one test set.

    Defined inside this cell so the cell stays runnable on its own.

    Parameters
    ----------
    title : str
        Heading printed before the report.
    y_true, y_pred : array-like
        Ground-truth and predicted labels.

    Returns
    -------
    pandas.DataFrame
        The confusion matrix. Rows are true labels and columns are predicted
        labels, both ordered [1, 0]; label 1 is displayed as "real news" and
        label 0 as "fake news".

    Notes
    -----
    Precision, recall and F1 use ``average="binary"``, which scores
    scikit-learn's default positive label 1, i.e. the class shown here as
    "real news".
    """
    print(title)
    matrix = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=[1, 0]),
        index=['True "real news"', 'True "fake news"'],
        columns=['Predicted "real news"', 'Predicted "fake news"'],
    )
    print(f"\n{matrix}\n")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average="binary"))
    print("Recall:", recall_score(y_true, y_pred, average="binary"))
    print("F1 score:", f1_score(y_true, y_pred, average="binary"))
    return matrix


# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that were written by this project.
with open("../simple_model_ONEHOT.pkl", 'rb') as f:
    simple_model_ONEHOT = pickle.load(f)

# Load features. The feature entries are 0-d object arrays, so .item() unwraps
# the stored object (presumably a scipy sparse matrix -- TODO confirm).
X_test = simple_ONEHOT['X_test_content_ONEHOT'].item()
Y_test = simple_ONEHOT['Y_test'].ravel()
X_test_LIAR = liar_ONEHOT['X_test'].item()
Y_test_LIAR = liar_ONEHOT['Y_test'].ravel()

# In-domain validation on the FakeNews test split.
Y_pred = simple_model_ONEHOT.predict(X_test)
df = _report_binary_metrics(
    "Simple model using logistic regression and one-hot encoding on FakeNews dataset:",
    Y_test,
    Y_pred,
)

# Cross-domain validation on LIAR.
# NOTE(review): the recorded output shows every LIAR sample predicted as 1.
# Verify that the LIAR one-hot features use the same vocabulary/column layout
# the model was trained on before interpreting these numbers.
Y_pred_LIAR = simple_model_ONEHOT.predict(X_test_LIAR)
df = _report_binary_metrics(
    "Simple model using logistic regression and one-hot encoding on LIAR dataset:",
    Y_test_LIAR,
    Y_pred_LIAR,
)



Simple model using logistic regression and one-hot encoding on FakeNews dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                  40141                   1466
True "fake news"                  33529                   9390

Accuracy: 0.5859853772803635
Precision: 0.5448757974752274
Recall: 0.9647655442593794
F1 score: 0.6964268674583829
Simple model using logistic regression and one-hot encoding on LIAR dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                   3638                      0
True "fake news"                   6601                      0

Accuracy: 0.3553081355601133
Precision: 0.3553081355601133
Recall: 1.0
F1 score: 0.5243208186207393


In [10]:
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
import pickle
import pandas as pd


def _report_binary_metrics(title, y_true, y_pred):
    """Print a labelled confusion matrix and binary metrics for one test set.

    Defined inside this cell so the cell stays runnable on its own.

    Parameters
    ----------
    title : str
        Heading printed before the report.
    y_true, y_pred : array-like
        Ground-truth and predicted labels.

    Returns
    -------
    pandas.DataFrame
        The confusion matrix. Rows are true labels and columns are predicted
        labels, both ordered [1, 0]; label 1 is displayed as "real news" and
        label 0 as "fake news".

    Notes
    -----
    Precision, recall and F1 use ``average="binary"``, which scores
    scikit-learn's default positive label 1, i.e. the class shown here as
    "real news".
    """
    print(title)
    matrix = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=[1, 0]),
        index=['True "real news"', 'True "fake news"'],
        columns=['Predicted "real news"', 'Predicted "fake news"'],
    )
    print(f"\n{matrix}\n")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average="binary"))
    print("Recall:", recall_score(y_true, y_pred, average="binary"))
    print("F1 score:", f1_score(y_true, y_pred, average="binary"))
    return matrix


# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that were written by this project.
with open("../simple_model_BOW.pkl", 'rb') as f:
    simple_model_BOW = pickle.load(f)

# Load features. The feature entries are 0-d object arrays, so .item() unwraps
# the stored object (presumably a scipy sparse matrix -- TODO confirm).
X_test = simple_BOW['X_test'].item()
Y_test = simple_BOW['Y_test'].ravel()
X_test_LIAR = liar_BOW['X_test'].item()
Y_test_LIAR = liar_BOW['Y_test'].ravel()

# In-domain validation on the FakeNews test split.
Y_pred = simple_model_BOW.predict(X_test)
df = _report_binary_metrics(
    "Simple model using logistic regression and bag-of-words on FakeNews dataset:",
    Y_test,
    Y_pred,
)

# Cross-domain validation on LIAR.
Y_pred_LIAR = simple_model_BOW.predict(X_test_LIAR)
df = _report_binary_metrics(
    "Simple model using logistic regression and bag-of-words on LIAR dataset:",
    Y_test_LIAR,
    Y_pred_LIAR,
)



Simple model using logistic regression and bag-of-words on FakeNews dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                  35673                   6387
True "fake news"                   4434                  38602

Accuracy: 0.8728377362038169
Precision: 0.8894457326651208
Recall: 0.8481455064194009
F1 score: 0.8683047938953594
Simple model using logistic regression and bag-of-words on LIAR dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                    247                   3391
True "fake news"                    405                   6196

Accuracy: 0.6292606699873035
Precision: 0.37883435582822084
Recall: 0.06789444749862562
F1 score: 0.11515151515151516


In [11]:
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
import pickle
import pandas as pd


def _report_binary_metrics(title, y_true, y_pred):
    """Print a labelled confusion matrix and binary metrics for one test set.

    Defined inside this cell so the cell stays runnable on its own.

    Parameters
    ----------
    title : str
        Heading printed before the report.
    y_true, y_pred : array-like
        Ground-truth and predicted labels.

    Returns
    -------
    pandas.DataFrame
        The confusion matrix. Rows are true labels and columns are predicted
        labels, both ordered [1, 0]; label 1 is displayed as "real news" and
        label 0 as "fake news".

    Notes
    -----
    Precision, recall and F1 use ``average="binary"``, which scores
    scikit-learn's default positive label 1, i.e. the class shown here as
    "real news".
    """
    print(title)
    matrix = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=[1, 0]),
        index=['True "real news"', 'True "fake news"'],
        columns=['Predicted "real news"', 'Predicted "fake news"'],
    )
    print(f"\n{matrix}\n")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average="binary"))
    print("Recall:", recall_score(y_true, y_pred, average="binary"))
    print("F1 score:", f1_score(y_true, y_pred, average="binary"))
    return matrix


# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that were written by this project.
with open("../advanced_model_NB.pkl", 'rb') as f:
    advanced_model_NB = pickle.load(f)

# Load features. The feature entries are 0-d object arrays, so .item() unwraps
# the stored object (presumably a scipy sparse matrix -- TODO confirm).
X_test = advanced['X_test'].item()
Y_test = advanced['Y_test'].ravel()
X_test_LIAR = liar_TFIDF['X_test'].item()
Y_test_LIAR = liar_TFIDF['Y_test'].ravel()

# In-domain validation on the FakeNews test split.
Y_pred = advanced_model_NB.predict(X_test)
df = _report_binary_metrics(
    "Advanced model using multinominal naive bayes and TF-IDF on FakeNews dataset:",
    Y_test,
    Y_pred,
)

# Cross-domain validation on LIAR.
Y_pred_LIAR = advanced_model_NB.predict(X_test_LIAR)
df = _report_binary_metrics(
    "Advanced model using  multinominal naive bayes and TF-IDF on LIAR dataset:",
    Y_test_LIAR,
    Y_pred_LIAR,
)



Advanced model using multinominal naive bayes and TF-IDF on FakeNews dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                  34510                   7550
True "fake news"                   6746                  36290

Accuracy: 0.8320015041835104
Precision: 0.8364843901493116
Recall: 0.8204945316214931
F1 score: 0.8284123097604301
Advanced model using  multinominal naive bayes and TF-IDF on LIAR dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                    411                   3227
True "fake news"                    536                   6065

Accuracy: 0.6324836409805645
Precision: 0.43400211193241817
Recall: 0.11297416162726773
F1 score: 0.17928026172300982


In [12]:
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
import pickle
import pandas as pd


def _report_binary_metrics(title, y_true, y_pred):
    """Print a labelled confusion matrix and binary metrics for one test set.

    Defined inside this cell so the cell stays runnable on its own.

    Parameters
    ----------
    title : str
        Heading printed before the report.
    y_true, y_pred : array-like
        Ground-truth and predicted labels.

    Returns
    -------
    pandas.DataFrame
        The confusion matrix. Rows are true labels and columns are predicted
        labels, both ordered [1, 0]; label 1 is displayed as "real news" and
        label 0 as "fake news".

    Notes
    -----
    Precision, recall and F1 use ``average="binary"``, which scores
    scikit-learn's default positive label 1, i.e. the class shown here as
    "real news".
    """
    print(title)
    matrix = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=[1, 0]),
        index=['True "real news"', 'True "fake news"'],
        columns=['Predicted "real news"', 'Predicted "fake news"'],
    )
    print(f"\n{matrix}\n")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average="binary"))
    print("Recall:", recall_score(y_true, y_pred, average="binary"))
    print("F1 score:", f1_score(y_true, y_pred, average="binary"))
    return matrix


# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that were written by this project.
with open("../advanced_model_SVC.pkl", 'rb') as f:
    advanced_model_SVC = pickle.load(f)

# Load features. The feature entries are 0-d object arrays, so .item() unwraps
# the stored object (presumably a scipy sparse matrix -- TODO confirm).
X_test = advanced['X_test'].item()
Y_test = advanced['Y_test'].ravel()
X_test_LIAR = liar_TFIDF['X_test'].item()
Y_test_LIAR = liar_TFIDF['Y_test'].ravel()

# In-domain validation on the FakeNews test split.
Y_pred = advanced_model_SVC.predict(X_test)
df = _report_binary_metrics(
    "Advanced model using support vector classifier and TF-IDF on FakeNews dataset:",
    Y_test,
    Y_pred,
)

# Cross-domain validation on LIAR.
Y_pred_LIAR = advanced_model_SVC.predict(X_test_LIAR)
df = _report_binary_metrics(
    "Advanced model using support vector classifier and TF-IDF on LIAR dataset:",
    Y_test_LIAR,
    Y_pred_LIAR,
)


Advanced model using support vector classifier and TF-IDF on FakeNews dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                  36707                   5353
True "fake news"                   4554                  38482

Accuracy: 0.8835785465826831
Precision: 0.8896294321514263
Recall: 0.8727294341417023
F1 score: 0.8810984025635794
Advanced model using support vector classifier and TF-IDF on LIAR dataset:

                  Predicted "real news"  Predicted "fake news"
True "real news"                     95                   3543
True "fake news"                    138                   6463

Accuracy: 0.6404922355698799
Precision: 0.40772532188841204
Recall: 0.02611324903793293
F1 score: 0.04908292430896409
