# Predicting News Reliablity Using News Content

In [91]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm, naive_bayes, tree, ensemble, neighbors, linear_model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, f1_score, precision_score, recall_score

CONTENT_FEATURE_PATH = "../feature/content-features.csv"
LIWC_FEATURE_PATH1 = "../feature/liwc_title.csv"
LIWC_FEATURE_PATH2 = "../feature/liwc_body.csv"
LABEL_PATH = "../feature/labels.csv"
CLASSES = ['is_reliable', 'is_unreliable']
RSEED = 46


Data Preprocessing

In [92]:
def date_prep(fea_path1, fea_path2, fea_path3, lab_path):
    # Load data
    fea_data1 = pd.read_csv(fea_path1)  
    fea_data2 = pd.read_csv(fea_path2)
    fea_data3 = pd.read_csv(fea_path3)
    lab_data = pd.read_csv(lab_path) 
    
      
    features = np.array(fea_data2)
    features += np.array(fea_data3)
    features = np.array(np.hstack((fea_data1,features)))
    labels = np.array(lab_data)
    
    # Feature standardization
    fea_scale = preprocessing.scale(features)
    
    # Divide the overall dataset as training data and testing data (0.8:0.2)
    x_train, x_test, y_train, y_test = train_test_split(fea_scale, labels, test_size=0.2, random_state=RSEED)

    return x_train, x_test, y_train, y_test

Logistic Regression
    

In [93]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LIWC_FEATURE_PATH1, LIWC_FEATURE_PATH2, LABEL_PATH)

model = linear_model.LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Logistic Regression ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Logistic Regression ===
               precision    recall  f1-score   support

  is_reliable       0.82      0.89      0.86       269
is_unreliable       0.74      0.63      0.68       137

     accuracy                           0.80       406
    macro avg       0.78      0.76      0.77       406
 weighted avg       0.80      0.80      0.80       406



  y = column_or_1d(y, warn=True)


Naive Bayes
    

In [94]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LIWC_FEATURE_PATH1, LIWC_FEATURE_PATH2, LABEL_PATH)

model = naive_bayes.BernoulliNB() 
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Naive Bayes ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Naive Bayes ===
               precision    recall  f1-score   support

  is_reliable       0.82      0.79      0.81       269
is_unreliable       0.62      0.67      0.64       137

     accuracy                           0.75       406
    macro avg       0.72      0.73      0.72       406
 weighted avg       0.75      0.75      0.75       406



  y = column_or_1d(y, warn=True)


K-Nearest Neighbors
    

In [95]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LIWC_FEATURE_PATH1, LIWC_FEATURE_PATH2, LABEL_PATH)

model = neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== KNN ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


  model.fit(x_train, y_train)


=== KNN ===
               precision    recall  f1-score   support

  is_reliable       0.73      0.86      0.79       269
is_unreliable       0.57      0.37      0.45       137

     accuracy                           0.69       406
    macro avg       0.65      0.62      0.62       406
 weighted avg       0.68      0.69      0.67       406



Random Forest

In [96]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LIWC_FEATURE_PATH1, LIWC_FEATURE_PATH2, LABEL_PATH)

model = ensemble.RandomForestClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Random Forest ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


  model.fit(x_train, y_train)


=== Random Forest ===
               precision    recall  f1-score   support

  is_reliable       0.91      0.97      0.94       269
is_unreliable       0.94      0.81      0.87       137

     accuracy                           0.92       406
    macro avg       0.93      0.89      0.91       406
 weighted avg       0.92      0.92      0.92       406



Decision Tree

In [97]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LIWC_FEATURE_PATH1, LIWC_FEATURE_PATH2, LABEL_PATH)

model = tree.DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Decision Tree ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))




=== Decision Tree ===
               precision    recall  f1-score   support

  is_reliable       0.86      0.90      0.88       269
is_unreliable       0.79      0.72      0.75       137

     accuracy                           0.84       406
    macro avg       0.82      0.81      0.82       406
 weighted avg       0.84      0.84      0.84       406



SVM

In [98]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LIWC_FEATURE_PATH1, LIWC_FEATURE_PATH2, LABEL_PATH)

model = svm.SVC()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== SVM ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

  y = column_or_1d(y, warn=True)


=== SVM ===
               precision    recall  f1-score   support

  is_reliable       0.78      0.94      0.85       269
is_unreliable       0.81      0.47      0.60       137

     accuracy                           0.79       406
    macro avg       0.80      0.71      0.73       406
 weighted avg       0.79      0.79      0.77       406



XGBoost

In [99]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LIWC_FEATURE_PATH1, LIWC_FEATURE_PATH2, LABEL_PATH)

model = XGBClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== XGBoost ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


=== XGBoost ===
               precision    recall  f1-score   support

  is_reliable       0.91      0.97      0.94       269
is_unreliable       0.94      0.82      0.87       137

     accuracy                           0.92       406
    macro avg       0.93      0.90      0.91       406
 weighted avg       0.92      0.92      0.92       406

