In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold

from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.metrics import f1_score, classification_report

In [2]:
import data_transformation as dt

['Questionable Sources' 'Least Biased' 'Left' 'Right'
 'Conspiracy-Pseudoscience' 'Right-Center' 'Left-Center' 'Satire'
 'Pro-Science']


In [3]:
# Load the main dataset from the project's data directory (path is relative
# to the notebook's working directory).
final_data = pd.read_csv('../data/final_data.csv')

In [4]:
# Load the preprocessed text; only its 'cleaned_body' column is used later.
cleaned_df = pd.read_csv('../data/preprocessed_cleaned_body.csv')

In [5]:
# Attach the preprocessed text as the first column of final_data.
# pd.concat(axis=1) aligns rows on the index; this assumes both CSVs list the
# same articles in the same order -- TODO confirm.
# The guard keeps the cell idempotent: final_data is rebuilt from itself, so
# without it a second run would add a duplicate 'cleaned_body' column and
# final_data['cleaned_body'] would return a two-column frame.
if 'cleaned_body' not in final_data.columns:
    final_data = pd.concat([cleaned_df['cleaned_body'], final_data], axis=1)

In [6]:
# Pass the combined frame through the project helper dt.merge_ratings.
# NOTE(review): the helper's body is not visible here. Later outputs show 1259
# rows with index labels up to 1434, so it presumably filters rows and keeps
# the original index -- confirm against data_transformation.py.
# NOTE(review): final_data is reassigned from itself; check whether re-running
# this cell on an already-merged frame is safe.
final_data = dt.merge_ratings(final_data)

### No Body Analysis (only MBFC Bias Rating)

In [7]:
# Keep only the raw article text, the article-level bias score and the
# source-level bias rating, as an independent frame.
df = final_data[['Body', 'Bias', 'Bias Rating']].copy()
df

Unnamed: 0,Body,Bias,Bias Rating
0,Abortion rights advocates have asked the U.S. ...,1.67,-3.5
1,A federal appeals court rejected the most dire...,0.67,-3.5
2,As part of the Trump administration's effort t...,-2.75,-3.5
3,"President Donald Trump and ""the Trump of the T...",-4.33,-3.5
4,"U.S Senator Elizabeth Warren, who is competing...",-10.00,-3.5
...,...,...,...
1431,Drew Angerer/Getty Images\nDuring Sen. Elizabe...,-11.00,-7.0
1432,Alex Wong/Getty Images\nImmigration and Custom...,-6.33,-7.0
1433,Photofusion/Universal Images Group via Getty I...,-9.00,-7.0
1434,David McNew/Getty Images\nAround 15 minutes af...,-3.00,-7.0


In [8]:
# Bucket the continuous 'Bias' score into three ordered classes:
#   '1' -> (-41, -5],  '2' -> (-5, 5],  '3' -> (5, 41]
# pd.cut intervals are right-closed by default, so a score of exactly -41, or
# anything outside the outer edges, becomes NaN.
bins = [-41, -5, 5, 41]
names = ['1', '2', '3']

# Work on an explicit copy. `df.loc[:]` followed by a column assignment is the
# ambiguous view/copy pattern that raises SettingWithCopyWarning; a copy also
# leaves the `df` displayed in the previous cell untouched.
multi_df = df.copy()
multi_df['Category'] = pd.cut(multi_df['Bias'], bins, labels=names)

In [9]:
# Display the frame to check the new 'Category' labels against 'Bias'.
multi_df

Unnamed: 0,Body,Bias,Bias Rating,Category
0,Abortion rights advocates have asked the U.S. ...,1.67,-3.5,2
1,A federal appeals court rejected the most dire...,0.67,-3.5,2
2,As part of the Trump administration's effort t...,-2.75,-3.5,2
3,"President Donald Trump and ""the Trump of the T...",-4.33,-3.5,2
4,"U.S Senator Elizabeth Warren, who is competing...",-10.00,-3.5,1
...,...,...,...,...
1431,Drew Angerer/Getty Images\nDuring Sen. Elizabe...,-11.00,-7.0,1
1432,Alex Wong/Getty Images\nImmigration and Custom...,-6.33,-7.0,1
1433,Photofusion/Universal Images Group via Getty I...,-9.00,-7.0,1
1434,David McNew/Getty Images\nAround 15 minutes af...,-3.00,-7.0,2


In [10]:
# Single-feature design matrix (the source-level rating) and the binned
# article-level category as the target.
y = multi_df['Category']
X = multi_df['Bias Rating'].to_frame()

In [11]:
# Hold out a test split (train_test_split defaults to 25% test rows);
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [12]:
# Logistic-regression classifier on the rating-only feature.
# fit() returns the estimator itself, so construction and training are chained.
regressor = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred2 = regressor.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred2))

Accuracy Score:  0.6698412698412698


In [13]:
# Per-class probabilities for the held-out rows.
y_probs = regressor.predict_proba(X_test)

# Multiclass ROC AUC: one-vs-one pairs, macro-averaged.
roc_auc = roc_auc_score(
    y_test,
    y_probs,
    multi_class="ovo",
    average="macro",
)
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8667961349451353


In [14]:
# Support-vector classifier on the rating-only feature.
# probability=True makes SVC fit an internal cross-validated calibration on
# shuffled data. Without a fixed random_state the predict_proba output, and so
# the ROC AUC reported in the next cell, changes from run to run. The seed
# matches the one used for train_test_split.
clf = SVC(probability=True, random_state=3)
# Training Model
clf.fit(X_train, y_train)
# Making Predictions
y_pred3 = clf.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred3))

Accuracy Score:  0.6698412698412698


In [15]:
# Per-class probabilities from the SVC for the held-out rows.
y_probs = clf.predict_proba(X_test)

# Multiclass ROC AUC: one-vs-one pairs, macro-averaged.
roc_auc = roc_auc_score(
    y_test,
    y_probs,
    multi_class="ovo",
    average="macro",
)
print('ROC Score is: ', roc_auc)

ROC Score is:  0.851510783814021


### Body and Bias Rating Analysis

In [47]:
# Scaler used in the next cell to rescale 'Bias Rating'; MinMaxScaler's
# default output range is [0, 1].
scaler = MinMaxScaler()

In [48]:
# Rescale the source-level rating. The double brackets select a one-column
# DataFrame, which is the 2-D input the scaler expects. The result is wrapped
# in a fresh frame, so it carries a default 0..n-1 index rather than
# final_data's index.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test rows influence the min/max -- consider fitting on the training rows only.
scaled_ratings = pd.DataFrame(
    scaler.fit_transform(final_data[['Bias Rating']]),
    columns=['Bias Rating'],
)
scaled_ratings

Unnamed: 0,Bias Rating
0,0.25
1,0.25
2,0.25
3,0.25
4,0.25
...,...
1255,0.00
1256,0.00
1257,0.00
1258,0.00


In [49]:
# Same three-column view as the rating-only section, but with the
# preprocessed text in place of the raw body. The rating is still unscaled
# here; a later cell overwrites it with the min-max scaled values.
df = final_data[['cleaned_body', 'Bias', 'Bias Rating']].copy()
df

Unnamed: 0,cleaned_body,Bias,Bias Rating
0,abortion right advocate asked u supreme court ...,1.67,-3.5
1,federal appeal court rejected direct constitut...,0.67,-3.5
2,part trump administration effort slow migrant ...,-2.75,-3.5
3,president donald trump trump tropic brazilian ...,-4.33,-3.5
4,u senator elizabeth warren competing democrati...,-10.00,-3.5
...,...,...,...
1431,drew angerergetty image sen elizabeth warren’s...,-11.00,-7.0
1432,alex wonggetty image immigration custom enforc...,-6.33,-7.0
1433,photofusionuniversal image group via getty ima...,-9.00,-7.0
1434,david mcnewgetty image around minute mammal’s ...,-3.00,-7.0


In [57]:
# Bucket the continuous 'Bias' score into three ordered classes:
#   '1' -> (-41, -5],  '2' -> (-5, 5],  '3' -> (5, 41]
# pd.cut intervals are right-closed by default; values outside the outer
# edges become NaN.
bins = [-41, -5, 5, 41]
names = ['1', '2', '3']

# Explicit copy instead of `df.loc[:]`: avoids the ambiguous view/copy pattern
# (SettingWithCopyWarning) and leaves `df` untouched.
multi_df = df.copy()
multi_df['Category'] = pd.cut(multi_df['Bias'], bins, labels=names)

# Renumber rows 0..n-1 so the frame lines up with positionally indexed frames
# built later. drop=True discards the old index directly, replacing the
# reset_index() + drop('index') pair.
multi_df = multi_df.reset_index(drop=True)

In [60]:
# Replace the raw rating with its min-max scaled counterpart. The assignment
# aligns on the row index, which works because both frames carry a default
# 0..n-1 index at this point.
multi_df['Bias Rating'] = scaled_ratings['Bias Rating']
multi_df

Unnamed: 0,cleaned_body,Bias,Bias Rating,Category
0,abortion right advocate asked u supreme court ...,1.67,0.25,2
1,federal appeal court rejected direct constitut...,0.67,0.25,2
2,part trump administration effort slow migrant ...,-2.75,0.25,2
3,president donald trump trump tropic brazilian ...,-4.33,0.25,2
4,u senator elizabeth warren competing democrati...,-10.00,0.25,1
...,...,...,...,...
1255,drew angerergetty image sen elizabeth warren’s...,-11.00,0.00,1
1256,alex wonggetty image immigration custom enforc...,-6.33,0.00,1
1257,photofusionuniversal image group via getty ima...,-9.00,0.00,1
1258,david mcnewgetty image around minute mammal’s ...,-3.00,0.00,2


In [61]:
# Vectorise the cleaned text, capping the vocabulary at 800 terms.
# NOTE(review): the vectoriser is fitted on every row before the train/test
# split, so test documents influence the vocabulary and IDF weights --
# consider fitting on the training rows only.
tfidf_transformer = TfidfVectorizer(max_features = 800)
tfidf = tfidf_transformer.fit_transform(multi_df['cleaned_body'])

# Dense TF-IDF columns followed by the scaled rating as the last feature.
# ignore_index=True relabels the columns 0..n-1, so the term names were never
# kept. The old `columns=tfidf_transformer.get_feature_names()` argument is
# therefore dropped: that method was removed in scikit-learn 1.2 (replaced by
# get_feature_names_out) and made this cell fail on current releases without
# affecting X.
X = pd.concat([pd.DataFrame(tfidf.toarray()), multi_df['Bias Rating']], axis=1, ignore_index=True)
# Text-only variant (no source rating): X = pd.DataFrame(tfidf.toarray())
y = multi_df['Category']

In [62]:
# Hold out a test split (train_test_split defaults to 25% test rows);
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [67]:
# Logistic-regression classifier on TF-IDF + rating features, with the
# solver's iteration cap raised to 200.
# fit() returns the estimator itself, so construction and training are chained.
regressor = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred4 = regressor.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred4))

Accuracy Score:  0.7523809523809524


In [68]:
# Per-class probabilities for the held-out rows.
y_probs = regressor.predict_proba(X_test)

# Multiclass ROC AUC: one-vs-one pairs, macro-averaged.
roc_auc = roc_auc_score(
    y_test,
    y_probs,
    multi_class="ovo",
    average="macro",
)
print('ROC Score is: ', roc_auc)

ROC Score is:  0.9237513583951222


In [69]:
# Support-vector classifier on TF-IDF + rating features.
# probability=True makes SVC fit an internal cross-validated calibration on
# shuffled data. Without a fixed random_state the predict_proba output, and so
# the ROC AUC reported in the next cell, changes from run to run. The seed
# matches the one used for train_test_split.
clf = SVC(probability=True, random_state=3)
# Training Model
clf.fit(X_train, y_train)
# Making Predictions
y_pred5 = clf.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred5))

Accuracy Score:  0.7523809523809524


In [70]:
# Per-class probabilities from the SVC for the held-out rows.
y_probs = clf.predict_proba(X_test)

# Multiclass ROC AUC: one-vs-one pairs, macro-averaged.
roc_auc = roc_auc_score(
    y_test,
    y_probs,
    multi_class="ovo",
    average="macro",
)
print('ROC Score is: ', roc_auc)

ROC Score is:  0.9215649032373882


In [71]:
# Multinomial naive Bayes on TF-IDF + rating features.
# fit() returns the estimator itself, so construction and training are chained.
mnb = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred6 = mnb.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred6))

Accuracy Score:  0.6984126984126984


In [72]:
# Per-class probabilities from naive Bayes for the held-out rows.
y_probs = mnb.predict_proba(X_test)

# Multiclass ROC AUC: one-vs-one pairs, macro-averaged.
roc_auc = roc_auc_score(
    y_test,
    y_probs,
    multi_class="ovo",
    average="macro",
)
print('ROC Score is: ', roc_auc)

ROC Score is:  0.8721670048629745


### Just Word Analysis

In [25]:
# Bucket the continuous 'Bias' score into three ordered classes:
#   '1' -> (-41, -5],  '2' -> (-5, 5],  '3' -> (5, 41]
# pd.cut intervals are right-closed by default; values outside the outer
# edges become NaN.
bins = [-41, -5, 5, 41]
names = ['1', '2', '3']

# Explicit copy instead of `df.loc[:]`: avoids the ambiguous view/copy pattern
# (SettingWithCopyWarning) and leaves `df` untouched.
# NOTE(review): this cell's execution count (25) is lower than that of the cell
# defining the cleaned-text `df` (49); confirm it still produces the intended
# frame under Restart & Run All.
multi_df = df.copy()
multi_df['Category'] = pd.cut(multi_df['Bias'], bins, labels=names)

# Renumber rows 0..n-1. drop=True discards the old index directly, replacing
# the reset_index() + drop('index') pair.
multi_df = multi_df.reset_index(drop=True)