# Predicting News Reliablity Using News Attributes

In [72]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm, naive_bayes, tree, ensemble, neighbors, linear_model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, f1_score, precision_score, recall_score
from sklearn.dummy import DummyClassifier

CONTENT_FEATURE_PATH = "../feature/content-features.csv"
LABEL_PATH = "../feature/labels.csv"
CLASSES = ['is_reliable', 'is_unreliable']
RSEED = 46

Data Preprocessing

In [73]:
def date_prep(fea_path, lab_path):
    # Load data
    fea_data = pd.read_csv(fea_path)  
    lab_data = pd.read_csv(lab_path) 
    
    features = pd.DataFrame(fea_data)  
    labels = pd.DataFrame(lab_data)
    
    # Feature standardization
    fea_scale = preprocessing.scale(features)
    
    # Divide the overall dataset as training data and testing data (0.8:0.2)
    x_train, x_test, y_train, y_test = train_test_split(fea_scale, labels, test_size=0.2, random_state=RSEED)

    return x_train, x_test, y_train, y_test

Logistic Regression
    

In [74]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = linear_model.LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Logistic Regression ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Logistic Regression ===
               precision    recall  f1-score   support

  is_reliable       0.78      0.80      0.79       269
is_unreliable       0.58      0.56      0.57       137

     accuracy                           0.72       406
    macro avg       0.68      0.68      0.68       406
 weighted avg       0.71      0.72      0.72       406



  y = column_or_1d(y, warn=True)


Naive Bayes
    

In [75]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = naive_bayes.BernoulliNB() 
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Naive Bayes ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Naive Bayes ===
               precision    recall  f1-score   support

  is_reliable       0.78      0.78      0.78       269
is_unreliable       0.57      0.56      0.57       137

     accuracy                           0.71       406
    macro avg       0.67      0.67      0.67       406
 weighted avg       0.71      0.71      0.71       406



  y = column_or_1d(y, warn=True)


K-Nearest Neighbors
    

In [76]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== KNN ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


=== KNN ===
               precision    recall  f1-score   support

  is_reliable       0.77      0.83      0.80       269
is_unreliable       0.60      0.50      0.55       137

     accuracy                           0.72       406
    macro avg       0.68      0.67      0.67       406
 weighted avg       0.71      0.72      0.71       406



  model.fit(x_train, y_train)


Random Forest

In [77]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = ensemble.RandomForestClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Random Forest ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


  model.fit(x_train, y_train)


=== Random Forest ===
               precision    recall  f1-score   support

  is_reliable       0.88      0.93      0.90       269
is_unreliable       0.84      0.74      0.79       137

     accuracy                           0.87       406
    macro avg       0.86      0.84      0.85       406
 weighted avg       0.87      0.87      0.86       406



Decision Tree

In [78]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = tree.DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Decision Tree ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


=== Decision Tree ===
               precision    recall  f1-score   support

  is_reliable       0.87      0.84      0.86       269
is_unreliable       0.71      0.76      0.73       137

     accuracy                           0.82       406
    macro avg       0.79      0.80      0.80       406
 weighted avg       0.82      0.82      0.82       406





SVM

In [79]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = svm.SVC()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== SVM ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== SVM ===
               precision    recall  f1-score   support

  is_reliable       0.78      0.81      0.79       269
is_unreliable       0.59      0.55      0.57       137

     accuracy                           0.72       406
    macro avg       0.68      0.68      0.68       406
 weighted avg       0.71      0.72      0.72       406



  y = column_or_1d(y, warn=True)


XGBoost

In [80]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = XGBClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== XGBoost ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== XGBoost ===
               precision    recall  f1-score   support

  is_reliable       0.86      0.95      0.90       269
is_unreliable       0.87      0.71      0.78       137

     accuracy                           0.87       406
    macro avg       0.87      0.83      0.84       406
 weighted avg       0.87      0.87      0.86       406



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Performance of Random Classifier

In [81]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, LABEL_PATH)

model = DummyClassifier(strategy="uniform")
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Random ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


=== Random ===
               precision    recall  f1-score   support

  is_reliable       0.67      0.52      0.58       269
is_unreliable       0.34      0.49      0.40       137

     accuracy                           0.51       406
    macro avg       0.50      0.50      0.49       406
 weighted avg       0.56      0.51      0.52       406



