In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold

from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.metrics import f1_score, classification_report

## Importing data

In [2]:
df = pd.read_csv("../data/preprocessed_cleaned_body.csv")

In [3]:
df.head()

Unnamed: 0,Body,Bias,cleaned_body
0,Abortion rights advocates have asked the U.S. ...,1.67,abortion right advocate asked u supreme court ...
1,A federal appeals court rejected the most dire...,0.67,federal appeal court rejected direct constitut...
2,As part of the Trump administration's effort t...,-2.75,part trump administration effort slow migrant ...
3,"President Donald Trump and ""the Trump of the T...",-4.33,president donald trump trump tropic brazilian ...
4,"U.S Senator Elizabeth Warren, who is competing...",-10.0,u senator elizabeth warren competing democrati...


## First Model - Naive Bayes

In [13]:
# 1 represents most left, 3 represents most right
bins = [-41, -5, 5, 41]
names = ['1', '2', '3']

multi_df = df.loc[:]
multi_df['Category'] = pd.cut(multi_df['Bias'], bins, labels=names)

In [14]:
tfidf_transformer = TfidfVectorizer(max_features = 800)
tfidf = tfidf_transformer.fit_transform(multi_df['cleaned_body'])

X = pd.DataFrame(tfidf.toarray(), columns=tfidf_transformer.get_feature_names())
y = multi_df['Category']

In [15]:
col_names = X.columns

scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)
X = pd.DataFrame(scaled, columns=col_names)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [17]:
# Defining Model
mnb = MultinomialNB()
# Training Model
mnb.fit(X_train, y_train)
# Making Predictions
y_pred = mnb.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

Accuracy Score:  0.6133651551312649


In [19]:
y_probs = mnb.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.7867350368960674


## Second Model - Logistic Regression

In [21]:
# Defining Model
regressor = LogisticRegression()
# Training Model
regressor.fit(X_train, y_train)
# Making Predictions
y_pred2 = regressor.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred2))

Accuracy Score:  0.6276849642004774


In [29]:
y_probs = regressor.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.7955378394790632


## Third model - SVM

In [31]:
# Defining Model
clf = SVC(probability=True)
# Training Model
clf.fit(X_train, y_train)
# Making Predictions
y_pred3 = clf.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred3))

Accuracy Score:  0.6205250596658711


In [32]:
y_probs = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8277007189446803


## Ensemble method

In [42]:
results = pd.DataFrame({'pred1': y_pred,
                        'pred2': y_pred2,
                        'pred3': y_pred3})

In [58]:
results['final'] = results.mode(axis=1)[0]
results.head()

Unnamed: 0,pred1,pred2,pred3,final
0,2,2,2,2
1,2,2,2,2
2,2,1,2,2
3,1,2,1,1
4,2,2,2,2


In [62]:
final_pred = results['final']
print("Accuracy Score: ", accuracy_score(y_test, final_pred))

Accuracy Score:  0.6205250596658711


In [65]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.58      0.62      0.60       162
           2       0.67      0.69      0.68       165
           3       0.57      0.47      0.51        92

    accuracy                           0.61       419
   macro avg       0.60      0.59      0.60       419
weighted avg       0.61      0.61      0.61       419



In [66]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           1       0.62      0.64      0.63       162
           2       0.65      0.67      0.66       165
           3       0.59      0.52      0.55        92

    accuracy                           0.63       419
   macro avg       0.62      0.61      0.62       419
weighted avg       0.63      0.63      0.63       419



In [67]:
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           1       0.58      0.69      0.63       162
           2       0.63      0.75      0.68       165
           3       0.78      0.27      0.40        92

    accuracy                           0.62       419
   macro avg       0.67      0.57      0.57       419
weighted avg       0.65      0.62      0.60       419



In [68]:
print(classification_report(y_test, final_pred))

              precision    recall  f1-score   support

           1       0.59      0.65      0.62       162
           2       0.65      0.73      0.69       165
           3       0.62      0.36      0.46        92

    accuracy                           0.62       419
   macro avg       0.62      0.58      0.59       419
weighted avg       0.62      0.62      0.61       419



## Using VotingClassifier sklearn

In [89]:
estimators = [('svm', clf), 
              ('naivebayes', mnb), 
              ('logistic', regressor)]

ensemble = VotingClassifier(estimators, voting='soft')

kfold = KFold(n_splits=5)

results = cross_val_score(ensemble, X, y, cv=kfold, scoring='accuracy')

In [90]:
results.mean()

0.5791044776119403

In [87]:
estimators = [('svm', clf), 
              ('naivebayes', mnb), 
              ('logistic', regressor)]

# Defining Model
ensemble = VotingClassifier(estimators, voting='soft')
# Training Model
ensemble = ensemble.fit(X_train, y_train)
# Making Predictions
y_pred4 = ensemble.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred4))

Accuracy Score:  0.6276849642004774


In [88]:
y_probs = ensemble.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8227209697620325
