# Predicting News Reliability Using News Content

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm, naive_bayes, tree, ensemble, neighbors, linear_model
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, f1_score, precision_score, recall_score
import itertools
from IPython.display import display
from sklearn.dummy import DummyClassifier

def plot_cm(cm, classes, path,
                normalize=True,
                cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map, save it and show it.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, used for both axes.
    path : str
        File the figure is written to with `plt.savefig`.
    normalize : bool, default True
        If True, every row is divided by its row sum before plotting.
        Rows whose sum is zero are left as all zeros instead of NaN.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap passed to `plt.imshow`.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guarded division: a row with no samples would otherwise produce
        # NaN cells and a NaN colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the annotation
    # stays readable on the darker end of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.yticks(rotation=90, verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(path, bbox_inches='tight')
    plt.show()

# Font sizes shared by every figure in this notebook.
SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

# Apply the sizes through matplotlib's rc settings, one group at a time.
for rc_group, rc_options in (
    ('font', {'size': SMALL_SIZE}),          # default text size
    ('axes', {'titlesize': SMALL_SIZE}),     # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),    # x and y axis labels
    ('xtick', {'labelsize': SMALL_SIZE}),    # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),    # y tick labels
    ('legend', {'fontsize': MEDIUM_SIZE}),   # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),  # figure title
):
    plt.rc(rc_group, **rc_options)

# Input files, report class names and the seed reused by the cells below.
CONTENT_FEATURE_PATH = "feature/content-features.csv"
LABEL_PATH = "feature/labels.csv"
CLASSES = ['is_unreliable', 'is_reliable']
RSEED = 524



Data Preprocessing

In [None]:
def date_prep(fea_path, lab_path):
    """
    Load features and labels, split them 80/20 and standardize the features.

    The split is made on the raw features first; the scaler is then fitted
    on the training rows only and applied to both splits, so no statistics
    from the test rows influence the training data.

    Parameters
    ----------
    fea_path : str
        CSV file of feature columns.
    lab_path : str
        CSV file of labels. Assumed to be row-aligned with the feature
        file -- TODO confirm against the files themselves.

    Returns
    -------
    fea_names : pandas.Index
        Column names of the feature file.
    x_train, x_test : numpy.ndarray
        Standardized feature matrices.
    y_train, y_test : pandas.DataFrame
        Label rows matching x_train / x_test.
    """
    # Load data (read_csv already returns DataFrames)
    features = pd.read_csv(fea_path)
    labels = pd.read_csv(lab_path)

    # Peek at the raw values
    print(np.array(features)[:5,:5])

    # Divide the overall dataset as training data and testing data (0.8:0.2)
    x_train_raw, x_test_raw, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=RSEED)

    # Feature standardization: fit on the training split only, then apply
    # the same mean/std to the test split to avoid test-set leakage.
    scaler = preprocessing.StandardScaler().fit(x_train_raw)
    x_train = scaler.transform(x_train_raw)
    x_test = scaler.transform(x_test_raw)
    print(x_train[:5,:5])

    # Read feature names
    fea_names = features.columns
    print(fea_names[:5])

    return fea_names, x_train, x_test, y_train, y_test

Logistic Regression
    

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(
    CONTENT_FEATURE_PATH, LABEL_PATH
)

# Fit logistic regression with the notebook-wide seed.
model = linear_model.LogisticRegression(random_state=RSEED).fit(x_train, y_train)

# Score the held-out split.
y_pred = model.predict(x_test)
lr_report = classification_report(y_test, y_pred, target_names=CLASSES)

print('=== Logistic Regression ===')
print(lr_report)

# NOTE: an eli5 permutation-importance display and a normalized
# confusion-matrix plot (via plot_cm) were sketched here earlier and are
# currently disabled.



Naive Bayes
    

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(
    CONTENT_FEATURE_PATH, LABEL_PATH
)

'''
Determining parameters:
Cross validation based on grid search

res = []
for alpha in np.arange(0.1,1,0.1):
    for binarize in np.arange(0,1,0.1):
        model = naive_bayes.BernoulliNB(alpha=alpha, binarize=binarize)
        cv_results = cross_val_score(model, x_train, y_train, scoring='f1', cv=10)
        res.append([alpha, binarize, cv_results.mean()])
bst_params = max(res, key=lambda re: re[-1])
bst_alpha = bst_params[0]
bst_binarize = bst_params[1]
print('bst_params: [ %.1f, %.1f ]' %(bst_alpha,bst_binarize))
'''

# Hyper-parameters fixed by hand; the disabled grid search above shows how
# such values can be selected.
bst_alpha = 0.1
bst_binarize = 0
model = naive_bayes.BernoulliNB(
    alpha=bst_alpha,
    binarize=bst_binarize,
).fit(x_train, y_train)

# Score the held-out split.
y_pred = model.predict(x_test)
nb_report = classification_report(y_test, y_pred, target_names=CLASSES)

print('=== Naive Bayes ===')
print(nb_report)

K-Nearest Neighbors
    

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(
    CONTENT_FEATURE_PATH, LABEL_PATH
)

'''
Determining parameters:
Cross validation based on grid search

res = []
for n_neighbors in np.arange(1,10,1):
    model = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    cv_results = cross_val_score(model, x_train, y_train, scoring='f1', cv=10)
    res.append([n_neighbors, cv_results.mean()])
bst_params = max(res, key=lambda re: re[-1])
bst_n_neighbors = bst_params[0]
print('bst_params: [ %.1f ]' %(bst_n_neighbors))
'''

# Neighbour count fixed by hand; the disabled grid search above shows how
# such a value can be selected.
bst_n_neighbors = 3
model = neighbors.KNeighborsClassifier(
    n_neighbors=bst_n_neighbors,
).fit(x_train, y_train)

# Score the held-out split.
y_pred = model.predict(x_test)
knn_report = classification_report(y_test, y_pred, target_names=CLASSES)

print('=== KNN ===')
print(knn_report)


Random Forest

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

'''
Determining parameters:
Cross validation based on grid search

res = []
for n_estimators in np.arange(100,500,100):
    model = ensemble.RandomForestClassifier(n_estimators=n_estimators,random_state=RSEED)
    cv_results = cross_val_score(model, x_train, y_train, scoring='f1', cv=10)
    res.append([n_estimators, cv_results.mean()])
bst_params = max(res, key=lambda re: re[-1])
bst_n_estimators = bst_params[0]
print('bst_params: [ %.1f ]' %(bst_n_estimators))
'''

bst_n_estimators = 300
# Seed the forest (as the grid search above does) so the metrics and the
# importance ranking are the same on every run.
model = ensemble.RandomForestClassifier(n_estimators=bst_n_estimators, random_state=RSEED)
model.fit(x_train, y_train)

# Horizontal bar chart of the forest's feature importances, sorted
# ascending so the largest bar is drawn last.
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(5,6))
plt.barh(range(len(indices)), importances[indices], color='steelblue', align='center')
plt.yticks(range(len(indices)), [str(fea_names[i]) for i in indices])
plt.xlabel('Relative Importance')
plt.savefig('figure/fea_imp_rf.eps',bbox_inches='tight')
plt.show()

# Score the held-out split.
y_pred = model.predict(x_test)

print('=== Random Forest ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


Decision Tree

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

# Seed the tree so the reported metrics are the same on every run.
model = tree.DecisionTreeClassifier(random_state=RSEED)
model.fit(x_train, y_train)

# Score the held-out split.
y_pred = model.predict(x_test)

print('=== Decision Tree ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


SVM

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

# RBF-kernel SVM; max_iter caps the number of solver iterations.
kernel = 'rbf'
max_iter = 2000
model = svm.SVC(C=0.6, kernel=kernel, max_iter=max_iter, tol=0.01, random_state=RSEED)
model.fit(x_train, y_train)

# Mean permutation importance measured on the training split. The shuffles
# are seeded so the ranking is the same on every run.
importances = permutation_importance(model, x_train, y_train, random_state=RSEED)['importances_mean']
indices = np.argsort(importances)
plt.figure(figsize=(5,6))
plt.barh(range(len(indices)), importances[indices], color='steelblue', align='center')
plt.yticks(range(len(indices)), [str(fea_names[i]) for i in indices])
plt.xlabel('Relative Importance')
# NOTE(review): the other importance plots are saved under 'figure/';
# confirm that this separate output directory is intended.
plt.savefig('feature importance/fea_imp_svm.eps',bbox_inches='tight')
plt.show()

# Score the held-out split.
y_pred = model.predict(x_test)

print('=== SVM ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

XGBoost

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

# `random_state` is the scikit-learn-style seed parameter of XGBClassifier;
# it replaces the legacy `seed` keyword and matches the other models here.
model = XGBClassifier(max_depth=5, subsample=0.6, reg_lambda=0.6, random_state=RSEED)
model.fit(x_train, y_train)

# Horizontal bar chart of the model's feature importances, sorted
# ascending so the largest bar is drawn last.
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(5,6))
plt.barh(range(len(indices)), importances[indices], color='steelblue', align='center')
plt.yticks(range(len(indices)), [str(fea_names[i]) for i in indices])
plt.xlabel('Relative Importance')
plt.savefig('figure/fea_imp_xgb.eps',bbox_inches='tight')
plt.show()

# Score the held-out split.
y_pred = model.predict(x_test)

print('=== XGBoost ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

Performance of Random Classifier

In [None]:
# Rebuild the train/test split for this experiment.
fea_names, x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

# Baseline that predicts classes uniformly at random. It is seeded so the
# baseline numbers are the same on every run.
model = DummyClassifier(strategy="uniform", random_state=RSEED)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# Kept as a notebook-level variable; it is not used within this cell.
cm_rand = confusion_matrix(y_test, y_pred)

print('=== Random ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

