In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold

from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.metrics import f1_score, classification_report

## Importing data

In [2]:
df = pd.read_csv("../data/preprocessed_cleaned_body.csv")

In [3]:
df.head()

Unnamed: 0,Body,Bias,cleaned_body
0,Abortion rights advocates have asked the U.S. ...,1.67,abortion right advocate asked u supreme court ...
1,A federal appeals court rejected the most dire...,0.67,federal appeal court rejected direct constitut...
2,As part of the Trump administration's effort t...,-2.75,part trump administration effort slow migrant ...
3,"President Donald Trump and ""the Trump of the T...",-4.33,president donald trump trump tropic brazilian ...
4,"U.S Senator Elizabeth Warren, who is competing...",-10.0,u senator elizabeth warren competing democrati...


## First Model - Naive Bayes

In [4]:
# 1 represents most left, 3 represents most right
bins = [-41, -5, 5, 41]
names = ['1', '2', '3']

multi_df = df.loc[:]
multi_df['Category'] = pd.cut(multi_df['Bias'], bins, labels=names)

In [5]:
tfidf_transformer = TfidfVectorizer(max_features = 800)
tfidf = tfidf_transformer.fit_transform(multi_df['cleaned_body'])

X = pd.DataFrame(tfidf.toarray(), columns=tfidf_transformer.get_feature_names())
y = multi_df['Category']

In [6]:
col_names = X.columns

scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)
X = pd.DataFrame(scaled, columns=col_names)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [8]:
# Defining Model
mnb = MultinomialNB()
# Training Model
mnb.fit(X_train, y_train)
# Making Predictions
y_pred = mnb.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

Accuracy Score:  0.5775656324582339


In [9]:
y_probs = mnb.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.7642936349523176


## Second Model - Logistic Regression

In [10]:
# Defining Model
regressor = LogisticRegression()
# Training Model
regressor.fit(X_train, y_train)
# Making Predictions
y_pred2 = regressor.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred2))

Accuracy Score:  0.60381861575179


In [11]:
y_probs = regressor.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.7748667274504489


## Third model - SVM

In [12]:
# Defining Model
clf = SVC(probability=True)
# Training Model
clf.fit(X_train, y_train)
# Making Predictions
y_pred3 = clf.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred3))

Accuracy Score:  0.6181384248210023


In [13]:
y_probs = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8008702074588819


## Ensemble method

In [14]:
results = pd.DataFrame({'pred1': y_pred,
                        'pred2': y_pred2,
                        'pred3': y_pred3})

In [15]:
results['final'] = results.mode(axis=1)[0]
results.head()

Unnamed: 0,pred1,pred2,pred3,final
0,2,2,2,2
1,1,3,3,3
2,1,1,1,1
3,2,2,2,2
4,2,2,2,2


In [16]:
final_pred = results['final']
print("Accuracy Score: ", accuracy_score(y_test, final_pred))

Accuracy Score:  0.6229116945107399


In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.53      0.62      0.57       162
           2       0.62      0.66      0.64       167
           3       0.59      0.36      0.44        90

    accuracy                           0.58       419
   macro avg       0.58      0.54      0.55       419
weighted avg       0.58      0.58      0.57       419



In [18]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           1       0.59      0.65      0.62       162
           2       0.64      0.67      0.65       167
           3       0.56      0.39      0.46        90

    accuracy                           0.60       419
   macro avg       0.59      0.57      0.58       419
weighted avg       0.60      0.60      0.60       419



In [19]:
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           1       0.56      0.70      0.62       162
           2       0.66      0.73      0.70       167
           3       0.71      0.27      0.39        90

    accuracy                           0.62       419
   macro avg       0.64      0.56      0.57       419
weighted avg       0.63      0.62      0.60       419



In [20]:
print(classification_report(y_test, final_pred))

              precision    recall  f1-score   support

           1       0.57      0.70      0.63       162
           2       0.67      0.70      0.68       167
           3       0.66      0.34      0.45        90

    accuracy                           0.62       419
   macro avg       0.63      0.58      0.59       419
weighted avg       0.63      0.62      0.61       419



## Using VotingClassifier sklearn

In [21]:
estimators = [('svm', clf), 
              ('naivebayes', mnb), 
              ('logistic', regressor)]

ensemble = VotingClassifier(estimators, voting='soft')

kfold = KFold(n_splits=5)

results = cross_val_score(ensemble, X, y, cv=kfold, scoring='accuracy')

In [22]:
results.mean()

0.5773134328358209

In [23]:
estimators = [('svm', clf), 
              ('naivebayes', mnb), 
              ('logistic', regressor)]

# Defining Model
ensemble = VotingClassifier(estimators, voting='soft')
# Training Model
ensemble = ensemble.fit(X_train, y_train)
# Making Predictions
y_pred4 = ensemble.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred4))

Accuracy Score:  0.630071599045346


In [24]:
y_probs = ensemble.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.7954219818251015


### Using Different Data Features

### Header and number of links 

In [25]:
df_total_data = pd.read_csv('../data/final_data.csv')
df_total_data.head()

Unnamed: 0,Url,Author,Date,Header,Body,n_links,Source,Bias,Quality
0,https://abcnews.go.com/Politics/abortion-right...,Devin Dwyer,"Wed, 17 Apr 2019 10:14:00 GMT",Abortion rights group asks Supreme Court to st...,Abortion rights advocates have asked the U.S. ...,3.0,ABC,1.67,49.0
1,https://abcnews.go.com/Politics/appeals-court-...,Ali Dukakis,"Tue, 26 Feb 2019 09:05:00 GMT",Appeals court says special counsel Robert Muel...,A federal appeals court rejected the most dire...,2.0,ABC,0.67,51.67
2,https://abcnews.go.com/Politics/attorney-gener...,Luke Barr,"Wed, 17 Apr 2019 14:02:00 GMT",Attorney general orders some asylum seekers to...,As part of the Trump administration's effort t...,6.0,ABC,-2.75,43.5
3,https://abcnews.go.com/Politics/donald-trump-t...,Meridith McGraw,"Tue, 19 Mar 2019 12:44:00 GMT","Donald Trump and 'the Trump of the Tropics,' B...","President Donald Trump and ""the Trump of the T...",10.0,ABC,-4.33,52.67
4,https://abcnews.go.com/Politics/electoral-coll...,Matthew Dowd,"Tue, 19 Mar 2019 21:39:00 GMT",The Electoral College limits the campaign play...,"U.S Senator Elizabeth Warren, who is competing...",5.0,ABC,-10.0,32.0


In [50]:
tfidf = tfidf_transformer.fit_transform(df_total_data['Header'])
X = pd.DataFrame(tfidf.toarray(), columns=tfidf_transformer.get_feature_names())

for col in X.columns:
    if col.isalpha() == False:
        X = X.drop(col, axis=1)

X.append(df_total_data['n_links'])

#X[X.name.isalpha()]
#X = X.drop(name for name in X if name.isalpha())
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf.fit(X_train, y_train)
y_pred5 = clf.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred5))

Accuracy Score:  0.5107398568019093


In [51]:
y_probs = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.6855527364987314


### number of links

multiclass


In [39]:
y_train

1384    2
230     1
1039    2
443     1
1665    3
       ..
902     1
1074    2
1024    3
363     1
10      1
Name: Category, Length: 1256, dtype: category
Categories (3, object): [1 < 2 < 3]

In [45]:
clf.fit(X_train, y_train)
y_pred6 = clf.predict(X_test)
print ("Accuracy Score: ", accuracy_score(y_test, y_pred6))

ValueError: Expected 2D array, got 1D array instead:
array=[20.  5.  0. ...  5.  2. 19.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.