# Predicting News Reliablity Using All Attributes

In [44]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm, naive_bayes, tree, ensemble, neighbors, linear_model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, f1_score, precision_score, recall_score
from sklearn.dummy import DummyClassifier

CONTENT_FEATURE_PATH = "../feature/content-features.csv"
SOCIAL_FEATURE_PATH = "../feature/social-features.csv"
LABEL_PATH = "../feature/labels.csv"
CLASSES = ['is_reliable', 'is_unreliable']
RSEED = 46

Data Preprocessing

In [45]:
def date_prep(fea_path1, fea_path2, lab_path):
    # Load data
    fea_data1 = pd.read_csv(fea_path1)  
    fea_data2 = pd.read_csv(fea_path2)  
    lab_data = pd.read_csv(lab_path) 
    
    features = np.array(np.hstack((fea_data1,fea_data2)))
    labels = np.array(lab_data)
    
    # Feature standardization
    fea_scale = preprocessing.scale(features)
    
    # Divide the overall dataset as training data and testing data (0.8:0.2)
    x_train, x_test, y_train, y_test = train_test_split(fea_scale, labels, test_size=0.2, random_state=RSEED)

    return x_train, x_test, y_train, y_test

Logistic Regression
    

In [46]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = linear_model.LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Logistic Regression ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Logistic Regression ===
               precision    recall  f1-score   support

  is_reliable       0.79      0.84      0.82       269
is_unreliable       0.64      0.55      0.60       137

     accuracy                           0.75       406
    macro avg       0.72      0.70      0.71       406
 weighted avg       0.74      0.75      0.74       406



  y = column_or_1d(y, warn=True)


Naive Bayes
    

In [47]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = naive_bayes.BernoulliNB() 
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Naive Bayes ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Naive Bayes ===
               precision    recall  f1-score   support

  is_reliable       0.81      0.75      0.78       269
is_unreliable       0.57      0.65      0.61       137

     accuracy                           0.72       406
    macro avg       0.69      0.70      0.70       406
 weighted avg       0.73      0.72      0.72       406



  y = column_or_1d(y, warn=True)


K-Nearest Neighbors
    

In [48]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== KNN ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


=== KNN ===
               precision    recall  f1-score   support

  is_reliable       0.84      0.86      0.85       269
is_unreliable       0.71      0.69      0.70       137

     accuracy                           0.80       406
    macro avg       0.77      0.77      0.77       406
 weighted avg       0.80      0.80      0.80       406



  model.fit(x_train, y_train)


Random Forest

In [49]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = ensemble.RandomForestClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Random Forest ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


  model.fit(x_train, y_train)


=== Random Forest ===
               precision    recall  f1-score   support

  is_reliable       0.89      0.93      0.91       269
is_unreliable       0.85      0.77      0.80       137

     accuracy                           0.87       406
    macro avg       0.87      0.85      0.86       406
 weighted avg       0.87      0.87      0.87       406



Decision Tree

In [50]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = tree.DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Decision Tree ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


=== Decision Tree ===
               precision    recall  f1-score   support

  is_reliable       0.91      0.89      0.90       269
is_unreliable       0.79      0.83      0.81       137

     accuracy                           0.87       406
    macro avg       0.85      0.86      0.86       406
 weighted avg       0.87      0.87      0.87       406





SVM

In [51]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = svm.SVC()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== SVM ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== SVM ===
               precision    recall  f1-score   support

  is_reliable       0.83      0.82      0.82       269
is_unreliable       0.65      0.66      0.66       137

     accuracy                           0.77       406
    macro avg       0.74      0.74      0.74       406
 weighted avg       0.77      0.77      0.77       406



  y = column_or_1d(y, warn=True)


XGBoost

In [52]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = XGBClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== XGBoost ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


=== XGBoost ===
               precision    recall  f1-score   support

  is_reliable       0.89      0.93      0.91       269
is_unreliable       0.86      0.78      0.82       137

     accuracy                           0.88       406
    macro avg       0.87      0.86      0.86       406
 weighted avg       0.88      0.88      0.88       406



Performance of Random Classifier

In [53]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH, SOCIAL_FEATURE_PATH, LABEL_PATH)

model = DummyClassifier(strategy="uniform")
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Random ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))




=== Random ===
               precision    recall  f1-score   support

  is_reliable       0.64      0.45      0.53       269
is_unreliable       0.32      0.50      0.39       137

     accuracy                           0.47       406
    macro avg       0.48      0.47      0.46       406
 weighted avg       0.53      0.47      0.48       406



