In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold

from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.metrics import f1_score, classification_report

In [2]:
import data_transformation as dt

['Questionable Sources' 'Least Biased' 'Left' 'Right'
 'Conspiracy-Pseudoscience' 'Right-Center' 'Left-Center' 'Satire'
 'Pro-Science']


In [3]:
# Read the main dataset from CSV into `final_data`.
final_data = pd.read_csv(filepath_or_buffer='../data/final_data.csv')

In [4]:
# Read the preprocessed table; the following cell takes its 'cleaned_body' column.
cleaned_df = pd.read_csv(filepath_or_buffer='../data/preprocessed_cleaned_body.csv')

In [5]:
# Attach 'cleaned_body' as the first column of `final_data`.
# pd.concat(axis=1) aligns rows on the index, so this assumes both CSVs list
# the same rows in the same order -- TODO confirm.
# Any existing 'cleaned_body' column is dropped first so the cell can be
# re-run safely: previously a second run added a duplicate column, after which
# final_data['cleaned_body'] returned a DataFrame instead of a Series.
final_data = pd.concat(
    [cleaned_df['cleaned_body'], final_data.drop(columns='cleaned_body', errors='ignore')],
    axis=1,
)

In [6]:
# Run the project-local helper from data_transformation over the frame.
# NOTE(review): presumably this is where the 'Bias Rating' column read by the
# cells below comes from -- confirm against dt.merge_ratings.
final_data = final_data.pipe(dt.merge_ratings)

### No Body Analysis (only MBFC Bias Rating)

In [7]:
# Working frame for the rating-only experiment: the article text, the
# continuous 'Bias' score (binned into the target below) and 'Bias Rating'
# (the only feature used in this section).
rating_only_columns = ['Body', 'Bias', 'Bias Rating']
df = pd.concat([final_data[column] for column in rating_only_columns], axis=1)
df

Unnamed: 0,Body,Bias,Bias Rating
0,Abortion rights advocates have asked the U.S. ...,1.67,-3.5
1,A federal appeals court rejected the most dire...,0.67,-3.5
2,As part of the Trump administration's effort t...,-2.75,-3.5
3,"President Donald Trump and ""the Trump of the T...",-4.33,-3.5
4,"U.S Senator Elizabeth Warren, who is competing...",-10.00,-3.5
...,...,...,...
1476,"Mr. Krueger, a labor economist who wrote paper...",-3.50,3.5
1477,Since January 2018 the FAA has been run by act...,2.75,3.5
1478,Things went downhill from there.\nAfter he ins...,1.75,3.5
1479,Qualcomm had claimed Apple was violating its p...,0.50,3.5


In [8]:
# Bucket the continuous 'Bias' score into three labelled classes.
# pd.cut uses right-closed intervals by default:
#   '1' -> (-41, -5], '2' -> (-5, 5], '3' -> (5, 41]
# Scores outside (-41, 41] get a missing Category.
names = ['1', '2', '3']
bins = [-41, -5, 5, 41]

# assign() returns a new frame, so `df` itself is left without the new column.
multi_df = df.assign(Category=lambda frame: pd.cut(frame['Bias'], bins, labels=names))

In [9]:
multi_df

Unnamed: 0,Body,Bias,Bias Rating,Category
0,Abortion rights advocates have asked the U.S. ...,1.67,-3.5,2
1,A federal appeals court rejected the most dire...,0.67,-3.5,2
2,As part of the Trump administration's effort t...,-2.75,-3.5,2
3,"President Donald Trump and ""the Trump of the T...",-4.33,-3.5,2
4,"U.S Senator Elizabeth Warren, who is competing...",-10.00,-3.5,1
...,...,...,...,...
1476,"Mr. Krueger, a labor economist who wrote paper...",-3.50,3.5,2
1477,Since January 2018 the FAA has been run by act...,2.75,3.5,2
1478,Things went downhill from there.\nAfter he ins...,1.75,3.5,2
1479,Qualcomm had claimed Apple was violating its p...,0.50,3.5,2


In [10]:
# Target: the three-way bias bucket.
y = multi_df['Category']
# Feature matrix: the single 'Bias Rating' column, kept as a one-column
# DataFrame so the estimators receive 2-D input.
X = multi_df['Bias Rating'].to_frame()

In [11]:
# Shuffle and hold out 25% of the rows (train_test_split's default share,
# spelled out here) for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=3)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Defining Model
regressor = LogisticRegression()
# Training Model
regressor.fit(X_train, y_train)
# Making Predictions
y_pred2 = regressor.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred2))

Accuracy Score:  0.6972477064220184


In [13]:
# Per-class probabilities for the held-out rows, summarised as the
# macro-averaged one-vs-one ROC AUC.
y_probs = regressor.predict_proba(X_test)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs, multi_class="ovo", average="macro")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8693187509675538


In [14]:
# Defining Model
clf = SVC(probability=True)
# Training Model
clf.fit(X_train, y_train)
# Making Predictions
y_pred3 = clf.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred3))

Accuracy Score:  0.6972477064220184


In [15]:
# Calibrated per-class probabilities from the SVC, summarised as the
# macro-averaged one-vs-one ROC AUC.
y_probs = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs, multi_class="ovo", average="macro")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8673631537249689


### Body and Bias Rating Analysis

In [16]:
# Working frame for the text + rating experiment: the preprocessed article
# text, the continuous 'Bias' score (binned into the target below) and
# 'Bias Rating'.
text_and_rating_columns = ['cleaned_body', 'Bias', 'Bias Rating']
df = pd.concat([final_data[column] for column in text_and_rating_columns], axis=1)
df

Unnamed: 0,cleaned_body,Bias,Bias Rating
0,abortion right advocate asked u supreme court ...,1.67,-3.5
1,federal appeal court rejected direct constitut...,0.67,-3.5
2,part trump administration effort slow migrant ...,-2.75,-3.5
3,president donald trump trump tropic brazilian ...,-4.33,-3.5
4,u senator elizabeth warren competing democrati...,-10.00,-3.5
...,...,...,...
1476,mr krueger labor economist wrote paper topic i...,-3.50,3.5
1477,since january faa ha run acting chief daniel e...,2.75,3.5
1478,thing went downhill installed decorative shinh...,1.75,3.5
1479,qualcomm claimed apple wa violating patent wit...,0.50,3.5


In [17]:
# Bucket the continuous 'Bias' score into three labelled classes.
# pd.cut uses right-closed intervals by default:
#   '1' -> (-41, -5], '2' -> (-5, 5], '3' -> (5, 41]
bins = [-41, -5, 5, 41]
names = ['1', '2', '3']

# Explicit copy (instead of df.loc[:]) so that adding the column is
# unambiguously confined to `multi_df`.
multi_df = df.copy()
multi_df['Category'] = pd.cut(multi_df['Bias'], bins, labels=names)

# Renumber the rows 0..n-1 so this frame lines up row-for-row with the TF-IDF
# matrix that the next cell builds from it and concatenates back on the index.
# drop=True discards the old index directly rather than materialising it as an
# 'index' column and deleting that column again.
multi_df = multi_df.reset_index(drop=True)

In [18]:
# TF-IDF features over the preprocessed article text, with the vocabulary
# capped at 800 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the vocabulary and IDF weights are learned from the test rows as
# well; fitting on the training rows only would avoid that leakage.
tfidf_transformer = TfidfVectorizer(max_features=800)
tfidf = tfidf_transformer.fit_transform(multi_df['cleaned_body'])

# Dense TF-IDF columns followed by the 'Bias Rating' feature.
# ignore_index=True relabels every column with an integer, so term names were
# never kept in X. The former `columns=tfidf_transformer.get_feature_names()`
# argument therefore had no effect on the result, and that method no longer
# exists in scikit-learn >= 1.2 (replaced by get_feature_names_out()), where
# it made this cell fail. It is dropped here; X is unchanged.
X = pd.concat([pd.DataFrame(tfidf.toarray()), multi_df['Bias Rating']], axis=1, ignore_index=True)
# Text-only variant: X = pd.DataFrame(tfidf.toarray())
y = multi_df['Category']

In [19]:
# Shuffle and hold out 25% of the rows (train_test_split's default share,
# spelled out here) for evaluation; same seed as the rating-only experiment.
split_parts = train_test_split(X, y, test_size=0.25, random_state=3)
X_train, X_test, y_train, y_test = split_parts

In [20]:
# Defining Model
regressor = LogisticRegression(max_iter=200)
# Training Model
regressor.fit(X_train, y_train)
# Making Predictions
y_pred2 = regressor.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred2))

Accuracy Score:  0.7706422018348624


In [21]:
# Per-class probabilities for the held-out rows, summarised as the
# macro-averaged one-vs-one ROC AUC.
y_probs = regressor.predict_proba(X_test)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs, multi_class="ovo", average="macro")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.9204129975503811


In [22]:
# Defining Model
clf = SVC(probability=True)
# Training Model
clf.fit(X_train, y_train)
# Making Predictions
y_pred3 = clf.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred3))

Accuracy Score:  0.6972477064220184


In [23]:
# Calibrated per-class probabilities from the SVC, summarised as the
# macro-averaged one-vs-one ROC AUC.
y_probs = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs, multi_class="ovo", average="macro")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8995398237609997
