### classify fake news articles

In [35]:
# imports
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from bayes_opt import BayesianOptimization
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from scipy.sparse import vstack

In [142]:
#Read the data
df=pd.read_csv('news.csv')
#Get shape and head
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [143]:
df.shape

(6335, 4)

In [144]:
#Get the labels
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [145]:
#Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2)

In [146]:
#Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
#Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [147]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 94.24%


In [148]:
#Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[588,  36],
       [ 37, 606]], dtype=int64)

In [124]:
TP =0
TN = 0
FP = 0
FN = 0
for i in range(len(y_pred)):
    if y_test.iloc[i] == 'FAKE' and y_pred[i] == 'FAKE':
        TN += 1
    elif y_test.iloc[i] == 'FAKE' and y_pred[i] == 'REAL':
        FP += 1
    elif y_test.iloc[i] == 'REAL' and y_pred[i] == 'FAKE':
        FN += 1
    elif y_test.iloc[i] == 'REAL' and y_pred[i] == 'REAL':
        TP += 1
print(f'TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}')

TP: 592, TN: 606, FP: 35, FN: 34


the rows are the observations and the columns are predictions. meaning that the top left corner would be the TN, the bottom right TP, the bottom left FN, and the top right FP

Using the same dataset, attempt classification using xgboost:

In [149]:
# convert the classes from text to binary
y_train_binary = [0 if i == 'FAKE' else 1 for i in y_train]
y_test_binary = [0 if i == 'FAKE' else 1 for i in y_test]

In [150]:
# combining train and test data
y_all_binary = y_train_binary + y_test_binary
tfidf_all = vstack((tfidf_train, tfidf_test))

In [67]:
# cross validation function for xgboost classifier, estimates the results of the classifier and calculates the mean cross validation score
def xgboost_cv(n_estimators, max_depth, gamma, subsample, data, targets):
    estimator = XGBClassifier(
        n_estimators=n_estimators,
        max_depth = max_depth,
        gamma = gamma,
        # min_child_weight=min_child_weight,
        subsample = subsample,
        random_state = 2,
    )
    cval = cross_val_score(estimator, data, targets, cv=5)
    return cval.mean()

In [68]:
# using bayesian optimizer, calculate the cross validation for different hyperparameter values, return the best values

# the black box function here is xgboost_crossval, and it is defined inside the optimize_xgboost function since it needs to use
# the data that is given to optimize_xgboost and the data has to be in the xgboost_crossval's scope
def optimize_xgboost(data, targets):
    def xgboost_crossval(n_estimators, max_depth, gamma, subsample):
        return xgboost_cv(
            n_estimators=int(n_estimators),
            max_depth = int(max_depth),
            gamma = gamma,
            # min_child_weight=min_child_weight,
            subsample=subsample,
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=xgboost_crossval,
        pbounds={
            "n_estimators": (100, 500),
            "max_depth": (6,15),
            "gamma": (0,10),
            # "min_child_weight": (0,10),
            "subsample": (0.8,1.0)
        },
        verbose=2
    )
    optimizer.maximize(n_iter=25, init_points=10)

    print("Final result:", optimizer.max)

In [136]:
# get the optimized hyperparameters for xgboost
print("--- Optimizing XGBoost ---")
# optimize_xgboost(tfidf_all, y_all_binary)
optimize_xgboost(tfidf_train, y_train_binary)

--- Optimizing XGBoost ---
|   iter    |  target   |   gamma   | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.5008   [0m | [0m1.75     [0m | [0m6.557    [0m | [0m426.1    [0m | [0m0.9976   [0m |
| [95m2        [0m | [95m0.5069   [0m | [95m2.099    [0m | [95m12.97    [0m | [95m204.8    [0m | [95m0.9605   [0m |
| [95m3        [0m | [95m0.5122   [0m | [95m3.879    [0m | [95m12.4     [0m | [95m390.7    [0m | [95m0.857    [0m |
| [0m4        [0m | [0m0.5022   [0m | [0m1.474    [0m | [0m11.24    [0m | [0m269.1    [0m | [0m0.8392   [0m |
| [0m5        [0m | [0m0.4988   [0m | [0m7.295    [0m | [0m6.172    [0m | [0m214.6    [0m | [0m0.9596   [0m |
| [0m6        [0m | [0m0.5012   [0m | [0m7.208    [0m | [0m6.359    [0m | [0m243.5    [0m | [0m0.9674   [0m |
| [0m7        [0m | [0m0.503    [0m | [0m2.955    [0m | [0m12.46    [0m |

KeyboardInterrupt: 

In [151]:
# using the optimized hyperparameters, run the xgbclassifier
# xg = XGBClassifier(gamma=0.21, max_depth=12, n_estimators=325, subsample=1)
xg = XGBClassifier(gamma=0, max_depth=6, n_estimators=158, subsample=1)

In [152]:
# fit the training data to the xgboost classifier
xg.fit(tfidf_train, y_train_binary)

In [153]:
# predict results for test data
y_pred = xg.predict(tfidf_test)

In [154]:
# get the accuracy score for test predictions
score=accuracy_score(y_test_binary,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.05%


In [155]:
#Build confusion matrix
confusion_matrix(y_test_binary,y_pred, labels=[0, 1])

array([[587,  37],
       [ 51, 592]], dtype=int64)

### classify spam text messages

using the same techniques as above, classify text messages as 'spam' or 'ham'

In [53]:
#Read the data
spam_df=pd.read_csv('spam.csv')
#Get shape and head
spam_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [54]:
spam_df.shape

(5572, 2)

In [56]:
#Get the labels
spam_labels=spam_df.Category
spam_labels.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object

In [58]:
#Split the dataset
spam_x_train,spam_x_test,spam_y_train,spam_y_test=train_test_split(spam_df['Message'], spam_labels, test_size=0.2)

In [59]:
#Initialize a TfidfVectorizer
spam_tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
#Fit and transform train set, transform test set
spam_tfidf_train=spam_tfidf_vectorizer.fit_transform(spam_x_train) 
spam_tfidf_test=spam_tfidf_vectorizer.transform(spam_x_test)

In [60]:
#Initialize a PassiveAggressiveClassifier
spam_pac=PassiveAggressiveClassifier(max_iter=50)
spam_pac.fit(spam_tfidf_train,spam_y_train)
#Predict on the test set and calculate accuracy
spam_y_pred=spam_pac.predict(spam_tfidf_test)
spam_score=accuracy_score(spam_y_test,spam_y_pred)
print(f'Accuracy: {round(spam_score*100,2)}%')

Accuracy: 98.48%


In [65]:
#Build confusion matrix
confusion_matrix(spam_y_test,spam_y_pred, labels=['spam','ham'])

array([[146,  13],
       [  4, 952]], dtype=int64)

classification of spam using xgboost:

In [66]:
# convert the classes from text to binary
spam_y_train_binary = [0 if i == 'spam' else 1 for i in spam_y_train]
spam_y_test_binary = [0 if i == 'spam' else 1 for i in spam_y_test]

In [71]:
# combining train and test data
spam_y_all_binary = spam_y_train_binary + spam_y_test_binary
spam_tfidf_all = vstack((spam_tfidf_train, spam_tfidf_test))

In [130]:
# get the optimized hyperparameters for xgboost
print("--- Optimizing XGBoost ---")
# optimize_xgboost(spam_tfidf_all, spam_y_all_binary)
optimize_xgboost(spam_tfidf_train, spam_y_train_binary)

--- Optimizing XGBoost ---
|   iter    |  target   |   gamma   | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.9632   [0m | [0m7.766    [0m | [0m10.87    [0m | [0m445.8    [0m | [0m0.8885   [0m |
| [95m2        [0m | [95m0.9729   [0m | [95m0.8967   [0m | [95m9.072    [0m | [95m199.7    [0m | [95m0.8071   [0m |
| [0m3        [0m | [0m0.9625   [0m | [0m8.498    [0m | [0m11.14    [0m | [0m187.8    [0m | [0m0.9007   [0m |
| [0m4        [0m | [0m0.9598   [0m | [0m8.761    [0m | [0m7.114    [0m | [0m457.2    [0m | [0m0.9123   [0m |
| [0m5        [0m | [0m0.9666   [0m | [0m4.898    [0m | [0m7.241    [0m | [0m437.5    [0m | [0m0.9566   [0m |
| [95m6        [0m | [95m0.9738   [0m | [95m0.608    [0m | [95m8.123    [0m | [95m246.4    [0m | [95m0.8747   [0m |
| [0m7        [0m | [0m0.9688   [0m | [0m4.174    [0m | [0m14.47    [0m |

In [131]:
# using the optimized hyperparameters, run the xgbclassifier
# spam_xg = XGBClassifier(gamma=0, max_depth=6, n_estimators=202, subsample=1)
spam_xg = XGBClassifier(gamma=0, max_depth=7, n_estimators=254, subsample=1)

In [132]:
# fit the training data to the xgboost classifier
spam_xg.fit(spam_tfidf_train, spam_y_train_binary)

In [133]:
# predict results for test data
spam_y_pred = spam_xg.predict(spam_tfidf_test)

In [134]:
# get the accuracy score for test predictions
spam_score=accuracy_score(spam_y_test_binary,spam_y_pred)
print(f'Accuracy: {round(spam_score*100,2)}%')

Accuracy: 97.49%


In [135]:
#Build confusion matrix
confusion_matrix(spam_y_test_binary,spam_y_pred, labels=[0, 1])

array([[134,  25],
       [  3, 953]], dtype=int64)