In [15]:
import pandas as pd 
from sklearn import preprocessing
import re
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pickle

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
tokenizer = RegexpTokenizer(r'\w+')
 
nltk.download('stopwords')
stopwords_list = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ALI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### KDNugget Dataset Loader

In [4]:
kdnugget_dataset = pd.read_csv("fake_and_real_news_dataset.csv")
kdnugget_dataset["Title with text"] = kdnugget_dataset.apply(lambda x: x["title"] +  x["text"] if type(x["title"]) != float else x["text"]  , axis=1) 
kdnugget_dataset = kdnugget_dataset.to_numpy()
kdnugget_y = kdnugget_dataset[:, 3]
le = preprocessing.LabelEncoder()
le.fit(["REAL", "FAKE"])
kdnugget_y = le.transform(kdnugget_y)
kdnugget_X = kdnugget_dataset[:, 4]
kdnugget_X_postp = []
for i in range(kdnugget_X.shape[0]):
    sentence = kdnugget_X[i]
    sentence = re.sub(r'[^a-zA-Z ]', '', sentence)
    for stop_word in stopwords_list:
        sentence = sentence.replace(" " + stop_word + " ", " ")

    kdnugget_X_postp.append(sentence)
kdnugget_X = kdnugget_X_postp 

#### ISOT Dataset Loader

In [6]:
fake_isot_dataset = pd.read_csv("Fake.csv")
real_isot_dataset = pd.read_csv("True.csv")
fake_isot_dataset['Label'] =  0
real_isot_dataset['Label'] =  1


isot_dataset = pd.concat([fake_isot_dataset, real_isot_dataset], ignore_index=True)
isot_dataset["Title with text"] = isot_dataset["title"] + " " +  isot_dataset["text"]
isot_dataset = isot_dataset.to_numpy()
isot_y = isot_dataset[:, 4]
isot_X = isot_dataset[:, 5]

isot_X_postp = []
for i in range(isot_X.shape[0]):
    sentence = isot_X[i]
    sentence = re.sub(r'[^a-zA-Z ]', '', sentence)
    for stop_word in stopwords_list:
        sentence = sentence.replace(" " + stop_word + " ", " ")

    isot_X_postp.append(sentence)
isot_X = isot_X_postp 

In [7]:
vectorizer = TfidfVectorizer()
isot_X_tfidf = vectorizer.fit_transform(isot_X)
vectorizer = TfidfVectorizer(use_idf=False)
isot_X_tf = vectorizer.fit_transform(isot_X)
vectorizer = TfidfVectorizer()
kdnugget_X_tfidf = vectorizer.fit_transform(kdnugget_X)
vectorizer = TfidfVectorizer(use_idf=False)
kdnugget_X_tf = vectorizer.fit_transform(kdnugget_X)


In [8]:
isot_X_train, isot_X_test, isot_y_train, isot_y_test = train_test_split(isot_X_tfidf, isot_y, test_size=0.2, shuffle=True)
kdnugget_X_train, kdnugget_X_test, kdnugget_y_train, kdnugget_y_test = train_test_split(kdnugget_X_tfidf, kdnugget_y, test_size=0.2, shuffle=True)

# print(isot_X_train)

In [13]:
def eval_model(true_value, predict_value):
    print("accuracy:  ", accuracy_score(true_value, predict_value))
    print("Precision:  ", precision_score(true_value, predict_value))
    print("Recall:  ", recall_score(true_value, predict_value))
    print("F1 score:  ", f1_score(true_value, predict_value))


## LogisticRegression
##### isot

In [16]:
log_model = LogisticRegression(random_state=0).fit(isot_X_train, np.array(isot_y_train).astype(int))
predict_y = log_model.predict(isot_X_test)
eval_model(np.array(isot_y_test).astype(int), predict_y)

accuracy:   0.9920935412026726
Precision:   0.9906629318394025
Recall:   0.9927485380116959
F1 score:   0.9917046383923356


##### KDnugget

In [36]:
log_model_kd = LogisticRegression(C=10).fit(kdnugget_X_train, np.array(kdnugget_y_train).astype(int))
predict_y = log_model_kd.predict(kdnugget_X_test)
eval_model(np.array(kdnugget_y_test).astype(int), predict_y)

accuracy:   0.9314472252448314
Precision:   0.9478260869565217
Recall:   0.9178947368421052
F1 score:   0.932620320855615


In [37]:
pickle.dump(log_model, file = open("log_model_isot.pickle", "wb"))
pickle.dump(log_model_kd, file = open("log_model_kdnugget.pickle", "wb"))

## SVM
##### isot

In [9]:
svm_clf = SVC(C=1)
svm_clf.fit(isot_X_train, np.array(isot_y_train).astype(int))
predict_y = svm_clf.predict(isot_X_test)
eval_model(np.array(isot_y_test).astype(int), predict_y)


accuracy:   0.9962138084632517
Precision:   0.9948658109684948
Recall:   0.9971929824561403
F1 score:   0.9960280373831776


##### KDnugget

In [19]:
svm_clf_kdnugget = SVC(C=1)
svm_clf_kdnugget.fit(kdnugget_X_train, np.array(kdnugget_y_train).astype(int))
predict_y = svm_clf_kdnugget.predict(kdnugget_X_test)
eval_model(np.array(kdnugget_y_test).astype(int), predict_y)

accuracy:   0.9281828073993471
Precision:   0.9534368070953437
Recall:   0.9052631578947369
F1 score:   0.9287257019438445


In [20]:
pickle.dump(svm_clf, file = open("svm_clf_isot.pickle", "wb"))
pickle.dump(svm_clf_kdnugget, file = open("svm_clf_kdnugget.pickle", "wb"))
    

## Decision Tree
##### isot

In [21]:
dt_clf = DecisionTreeClassifier(max_depth=5)
dt_clf.fit(isot_X_train, np.array(isot_y_train).astype(np.int16))
predict_y = dt_clf.predict(isot_X_test)
eval_model(np.array(isot_y_test).astype(int), predict_y)


accuracy:   0.9955456570155902
Precision:   0.9928994082840237
Recall:   0.9976218787158145
F1 score:   0.9952550415183867


##### KDnugget

In [22]:
dt_clf_kdnugget = DecisionTreeClassifier(max_depth=5)
dt_clf_kdnugget.fit(kdnugget_X_train, np.array(kdnugget_y_train).astype(np.int16))
predict_y = dt_clf_kdnugget.predict(kdnugget_X_test)
eval_model(np.array(kdnugget_y_test).astype(int), predict_y)


accuracy:   0.7997823721436343
Precision:   0.8142548596112311
Recall:   0.7936842105263158
F1 score:   0.8038379530916845


In [23]:
pickle.dump(dt_clf, file = open("dt_clf_isot.pickle", "wb"))
pickle.dump(dt_clf_kdnugget, file = open("dt_clf_kdnugget.pickle", "wb"))
    

## KNN
##### isot

In [24]:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(isot_X_train, np.array(isot_y_train).astype(int))
predict_y = knn.predict(isot_X_test)
eval_model(np.array(isot_y_test).astype(int), predict_y)


accuracy:   0.8857461024498886
Precision:   0.8287487073422958
Recall:   0.9529131985731273
F1 score:   0.8865044247787611


##### KDnugget

In [25]:
knn_kdnugget = KNeighborsClassifier(n_neighbors=9)
knn_kdnugget.fit(kdnugget_X_train, np.array(kdnugget_y_train).astype(int))
predict_y = knn_kdnugget.predict(kdnugget_X_test)
eval_model(np.array(kdnugget_y_test).astype(int), predict_y)


accuracy:   0.8737758433079434
Precision:   0.8380414312617702
Recall:   0.9368421052631579
F1 score:   0.8846918489065607


In [26]:
pickle.dump(knn, file = open("knn_isot.pickle", "wb"))
pickle.dump(knn_kdnugget, file = open("knn_kdnugget.pickle", "wb"))
    

## RandomForest 1
##### isot

In [27]:
rf1_clf = RandomForestClassifier(max_depth=40, n_estimators=400).fit(isot_X_train, np.array(isot_y_train).astype(int))
predict_y = rf1_clf.predict(isot_X_test)
eval_model(np.array(isot_y_test).astype(int), predict_y)


accuracy:   0.9909799554565701
Precision:   0.9918893129770993
Recall:   0.9888228299643281
F1 score:   0.9903536977491961


##### KDnugget

In [34]:
rf1_clf_kdnugget = RandomForestClassifier(n_estimators=150).fit(kdnugget_X_train, np.array(kdnugget_y_train).astype(int))
predict_y = rf1_clf_kdnugget.predict(kdnugget_X_test)
eval_model(np.array(kdnugget_y_test).astype(int), predict_y)


accuracy:   0.8846572361262242
Precision:   0.9002169197396963
Recall:   0.8736842105263158
F1 score:   0.8867521367521368


In [35]:
pickle.dump(rf1_clf, file = open("rf1_clf_isot.pickle", "wb"))
pickle.dump(rf1_clf_kdnugget, file = open("rf1_clf_kdnugget.pickle", "wb"))
    

In [30]:
isot_X_train_tf, isot_X_test_tf, isot_y_train_tf, isot_y_test_tf = train_test_split(isot_X_tf, isot_y, test_size=0.2, shuffle=True)
kdnugget_X_train_tf, kdnugget_X_test_tf, kdnugget_y_train_tf, kdnugget_y_test_tf = train_test_split(kdnugget_X_tf, kdnugget_y, test_size=0.2, shuffle=True)


## RandomForest 2
##### isot

In [31]:
rf2_clf = RandomForestClassifier(max_depth=40, n_estimators=300).fit(isot_X_train_tf, np.array(isot_y_train_tf).astype(int))
predict_y = rf2_clf.predict(isot_X_test_tf)
eval_model(np.array(isot_y_test_tf).astype(int), predict_y)


accuracy:   0.9928730512249443
Precision:   0.9948393150363594
Recall:   0.9901937893999533
F1 score:   0.9925111163117248


##### KDnugget

In [32]:
rf2_clf_kdnugget = RandomForestClassifier(n_estimators=200).fit(kdnugget_X_train_tf, np.array(kdnugget_y_train_tf).astype(int))
predict_y = rf2_clf_kdnugget.predict(kdnugget_X_test_tf)
eval_model(np.array(kdnugget_y_test_tf).astype(int), predict_y)


accuracy:   0.8955386289445049
Precision:   0.9029345372460497
Recall:   0.8830022075055187
F1 score:   0.8928571428571428


In [33]:
pickle.dump(rf2_clf, file = open("rf2_clf_isot.pickle", "wb"))
pickle.dump(rf2_clf_kdnugget, file = open("rf2_clf_kdnugget.pickle", "wb"))
        