In [1]:
import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
import string

# libraries for POS tagging and identification
import spacy
nlp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from scipy import stats

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the labelled article dataset (path is relative to the notebook's folder).
df = pd.read_csv(filepath_or_buffer="../data/final_data.csv")

In [3]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,Url,Author,Date,Header,Body,n_links,Source,Bias,Quality
0,https://abcnews.go.com/Politics/abortion-right...,Devin Dwyer,"Wed, 17 Apr 2019 10:14:00 GMT",Abortion rights group asks Supreme Court to st...,Abortion rights advocates have asked the U.S. ...,3.0,ABC,1.67,49.0
1,https://abcnews.go.com/Politics/appeals-court-...,Ali Dukakis,"Tue, 26 Feb 2019 09:05:00 GMT",Appeals court says special counsel Robert Muel...,A federal appeals court rejected the most dire...,2.0,ABC,0.67,51.67
2,https://abcnews.go.com/Politics/attorney-gener...,Luke Barr,"Wed, 17 Apr 2019 14:02:00 GMT",Attorney general orders some asylum seekers to...,As part of the Trump administration's effort t...,6.0,ABC,-2.75,43.5
3,https://abcnews.go.com/Politics/donald-trump-t...,Meridith McGraw,"Tue, 19 Mar 2019 12:44:00 GMT","Donald Trump and 'the Trump of the Tropics,' B...","President Donald Trump and ""the Trump of the T...",10.0,ABC,-4.33,52.67
4,https://abcnews.go.com/Politics/electoral-coll...,Matthew Dowd,"Tue, 19 Mar 2019 21:39:00 GMT",The Electoral College limits the campaign play...,"U.S Senator Elizabeth Warren, who is competing...",5.0,ABC,-10.0,32.0


## Data Preprocessing

In [4]:
# Keep only the article body and the two label columns; metadata is not used as a feature here.
body_df = df.drop(
    ['Url', 'Author', 'Date', 'Header', 'Source', 'n_links'],
    axis='columns',
)

In [5]:
# Preview the reduced frame (body text plus labels).
body_df.head(n=5)

Unnamed: 0,Body,Bias,Quality
0,Abortion rights advocates have asked the U.S. ...,1.67,49.0
1,A federal appeals court rejected the most dire...,0.67,51.67
2,As part of the Trump administration's effort t...,-2.75,43.5
3,"President Donald Trump and ""the Trump of the T...",-4.33,52.67
4,"U.S Senator Elizabeth Warren, who is competing...",-10.0,32.0


In [21]:
def process_text(text):
    """Reduce an article body to a space-separated string of its adjectives and adverbs.

    Steps, in order:
      1. lowercase the text;
      2. strip ASCII punctuation (``string.punctuation``) and digit characters;
      3. POS-tag the result with the spaCy pipeline ``nlp`` and keep only
         tokens tagged ``ADJ`` or ``ADV``;
      4. lemmatize each kept word with NLTK's ``WordNetLemmatizer`` and drop
         lemmas that are English stop words.

    Note that ``lemmatize`` is called without a ``pos`` argument, so NLTK's
    default part of speech is used; adverbs are NOT reduced to adjectives
    (e.g. both ``absolute`` and ``absolutely`` survive as separate tokens).

    Parameters
    ----------
    text : str
        Raw article body. Non-string values (``None``, ``NaN`` from a missing
        CSV field) are treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filters or the input is not a string.
    """
    # Missing bodies come through pandas as NaN/None; return an empty document
    # instead of failing on ``.lower()``.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove punctuation and digits character by character.
    nopunc_digit = ''.join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )

    # Retain adjectives and adverbs only.
    doc = nlp(nopunc_digit)
    adj_adv_words = ' '.join(
        token.text for token in doc if token.pos_ in ('ADJ', 'ADV')
    )

    # Build the stop-word set once per call (it used to be rebuilt for every
    # single word) and lemmatize each word exactly once.
    stop_words = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    lemmas = (wnl.lemmatize(word) for word in adj_adv_words.split())

    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [22]:
# Run the text-cleaning pipeline over every article body.
body_df['cleaned_body'] = body_df['Body'].apply(process_text)

In [23]:
# Drop the raw text and the unused 'Quality' label now that 'cleaned_body' exists.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to
# re-run: a second execution no longer raises KeyError for already-removed columns.
body_df = body_df.drop(columns=['Body', 'Quality'], errors='ignore')

## Preparing Training and Testing Data

In [24]:
# Target: the bias score. Features: every remaining column except 'Bias'.
y = body_df['Bias']
X = body_df.loc[:, body_df.columns != 'Bias']

In [25]:
# Preview the cleaned text that will be vectorized.
X.head(n=5)

Unnamed: 0,cleaned_body
0,summarily controversial local temporarily repr...
1,federal direct constitutional special former s...
2,trump southern rather previous credible immedi...
3,brazilian nationalist first bilateral importan...
4,democratic national popular quickly many radic...


In [26]:
# Bag-of-words counts over the 800 most frequent terms of the cleaned text.
# The text is read from body_df rather than X so that this cell can be re-run
# after X has been replaced by the count matrix below.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split
# performed in a later cell, so held-out documents influence the feature set.
bow_transformer = CountVectorizer(max_features=800)
bow = bow_transformer.fit_transform(body_df['cleaned_body'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(bow_transformer, 'get_feature_names_out'):
    feature_names = bow_transformer.get_feature_names_out()
else:
    feature_names = bow_transformer.get_feature_names()

X = pd.DataFrame(bow.toarray(), columns=feature_names)

In [27]:
# Rescale every term-count column to the [0, 1] range, keeping the column labels.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# that happens in a later cell.
scaler = MinMaxScaler()
col_names = X.columns

scaled = scaler.fit(X).transform(X)
X = pd.DataFrame(data=scaled, columns=col_names)

In [28]:
# Preview the scaled bag-of-words features.
X.head(n=5)

Unnamed: 0,able,abroad,absolute,absolutely,academic,acceptable,accountable,accurate,active,actively,...,worse,worst,worth,wrong,yearold,yellow,yemeni,yet,young,younger
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0


In [29]:
# Hold out a test set; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

## Linear Regression

In [30]:
# Baseline: ordinary least squares on all bag-of-words features.
regressor = LinearRegression().fit(X_train, y_train)

# Predict the held-out bias scores.
y_pred = regressor.predict(X_test)

# Report the Pearson correlation between predictions and true scores.
print(stats.pearsonr(y_pred, y_test)[0])

0.10553457744430103


## Lasso Regression

In [31]:
# Feature selection: cross-validated Lasso over a log-spaced grid of 110 alphas.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running this cell.
lasso = LassoCV(
    alphas=np.logspace(-4, 1, 110),
    cv=5,
    max_iter=10000,
    normalize=True,
)
lasso.fit(X_train, y_train)

# Boolean mask of the features whose Lasso coefficient is non-zero.
coeffs = lasso.coef_ != 0
coeff_used = coeffs.sum()
print('Number of coefficients: ', coeff_used)

# Refit the existing `regressor` object on the selected columns only
# (this replaces the model fitted on all features in the previous section).
regressor.fit(X_train.loc[:, coeffs], y_train)
y_pred = regressor.predict(X_test.loc[:, coeffs])

# Pearson correlation between predictions and true scores on the held-out set.
score = stats.pearsonr(y_pred, y_test)[0]
print('Pearson coefficient: ', score)

Number of coefficients:  23
Pearson coefficient:  0.2418387989696557


In [32]:
# Names of the features kept by the Lasso selection step.
X_test.columns[coeffs]

Index(['alive', 'antisemitic', 'civil', 'conservative', 'corporate', 'correct',
       'creative', 'gross', 'healthy', 'illegal', 'indeed', 'international',
       'liberal', 'longstanding', 'much', 'obviously', 'organized', 'oval',
       'republican', 'semiautomatic', 'sure', 'undocumented', 'unpopular'],
      dtype='object')

In [None]:
## NOTES: 
# Filtering the text down to adjectives/adverbs (ADJ/ADV) made the model worse