# Predicting News Reliablity Using LIWC


In [82]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm, naive_bayes, tree, ensemble, neighbors, linear_model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, f1_score, precision_score, recall_score

CSV_FILE_DIR_HEAD = "/Volumes/MySSD/PycharmProjects/MIS-COV19/"
CONTENT_FEATURE_PATH1 = CSV_FILE_DIR_HEAD+"feature/liwc_title.csv"
CONTENT_FEATURE_PATH2 = CSV_FILE_DIR_HEAD+"feature/liwc_body.csv"
LABEL_PATH = CSV_FILE_DIR_HEAD+"feature/labels.csv"
CLASSES = ['is_reliable', 'is_unreliable']
RSEED = 46

Data Preprocessing

In [83]:
def date_prep(fea_path1, fea_path2, lab_path):
    # Load data
    fea_data1 = pd.read_csv(fea_path1)  
    fea_data2 = pd.read_csv(fea_path2)  
    lab_data = pd.read_csv(lab_path) 
    
    features = np.array(fea_data1)  
    features += np.array(fea_data2)
    labels = np.array(lab_data)
    
    # Feature standardization
    fea_scale = preprocessing.scale(features)
    
    # Divide the overall dataset as training data and testing data (0.8:0.2)
    x_train, x_test, y_train, y_test = train_test_split(fea_scale, labels, test_size=0.2, random_state=RSEED)

    return x_train, x_test, y_train, y_test


Logistic Regression
    

In [84]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH1, CONTENT_FEATURE_PATH2, LABEL_PATH)

model = linear_model.LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Logistic Regression ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Logistic Regression ===
               precision    recall  f1-score   support

  is_reliable       0.77      0.86      0.82       269
is_unreliable       0.65      0.50      0.57       137

     accuracy                           0.74       406
    macro avg       0.71      0.68      0.69       406
 weighted avg       0.73      0.74      0.73       406



  y = column_or_1d(y, warn=True)


Naive Bayes
    

In [85]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH1, CONTENT_FEATURE_PATH2, LABEL_PATH)

model = naive_bayes.BernoulliNB() 
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Naive Bayes ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

=== Naive Bayes ===
               precision    recall  f1-score   support

  is_reliable       0.79      0.75      0.77       269
is_unreliable       0.55      0.61      0.58       137

     accuracy                           0.70       406
    macro avg       0.67      0.68      0.67       406
 weighted avg       0.71      0.70      0.71       406



  y = column_or_1d(y, warn=True)


K-Nearest Neighbors
    

In [86]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH1, CONTENT_FEATURE_PATH2, LABEL_PATH)

model = neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== KNN ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


  model.fit(x_train, y_train)


=== KNN ===
               precision    recall  f1-score   support

  is_reliable       0.69      0.82      0.75       269
is_unreliable       0.44      0.28      0.34       137

     accuracy                           0.64       406
    macro avg       0.56      0.55      0.54       406
 weighted avg       0.60      0.64      0.61       406



Random Forest

In [87]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH1, CONTENT_FEATURE_PATH2, LABEL_PATH)

model = ensemble.RandomForestClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Random Forest ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


  model.fit(x_train, y_train)


=== Random Forest ===
               precision    recall  f1-score   support

  is_reliable       0.78      0.97      0.87       269
is_unreliable       0.88      0.47      0.62       137

     accuracy                           0.80       406
    macro avg       0.83      0.72      0.74       406
 weighted avg       0.82      0.80      0.78       406



Decision Tree

In [88]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH1, CONTENT_FEATURE_PATH2, LABEL_PATH)

model = tree.DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== Decision Tree ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))


=== Decision Tree ===
               precision    recall  f1-score   support

  is_reliable       0.76      0.76      0.76       269
is_unreliable       0.53      0.53      0.53       137

     accuracy                           0.68       406
    macro avg       0.64      0.64      0.64       406
 weighted avg       0.68      0.68      0.68       406



SVM

In [89]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH1, CONTENT_FEATURE_PATH2, LABEL_PATH)

model = svm.SVC()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== SVM ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

  y = column_or_1d(y, warn=True)


=== SVM ===
               precision    recall  f1-score   support

  is_reliable       0.75      0.94      0.84       269
is_unreliable       0.78      0.38      0.51       137

     accuracy                           0.75       406
    macro avg       0.76      0.66      0.67       406
 weighted avg       0.76      0.75      0.73       406



XGBoost

In [90]:
x_train, x_test, y_train, y_test = date_prep(CONTENT_FEATURE_PATH1, CONTENT_FEATURE_PATH2, LABEL_PATH)

model = XGBClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print('=== XGBoost ===')
print(classification_report(y_test, y_pred, target_names=CLASSES))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


=== XGBoost ===
               precision    recall  f1-score   support

  is_reliable       0.83      0.95      0.89       269
is_unreliable       0.86      0.62      0.72       137

     accuracy                           0.84       406
    macro avg       0.84      0.78      0.80       406
 weighted avg       0.84      0.84      0.83       406

