## Importing Libraries

In [53]:
# Data Analysis and Processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean
from collections import Counter

# Natural Language Processing
### General
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
import string

### Sentiment Analysis
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Part of Speech Tagging
import spacy
nlp = spacy.load('en_core_web_sm')

# Modelling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Evaluation
from scipy import stats

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Function Definitions

In [2]:
def get_sentiment(text):
    # gets the compound score of the sentiment using the VADER lexicon
    sid = SentimentIntensityAnalyzer()
    
    results = sid.polarity_scores(text)
    sentiment = results['compound']
    return sentiment

In [3]:
def get_entities(text):
    # gets the entities from the sentence and returns a list of them
    doc = nlp(text)
    return list(doc.ents)

In [4]:
def extract_sentences(word, text):
    # extract all sentences in text in which word appears
    sentences = [sentence for sentence in text.split('.') if word in sentence]
    return sentences

In [68]:
def extract_get_sentiment(word, text):
    # returns aggregate of sentiment for all sentences that contains word in text
    text = text.lower()
    word = word.lower()
    
    sentiments = [get_sentiment(sentence) for sentence in extract_sentences(word, text)]
    
    if len(sentiments) > 1:
        return mean(sentiments)
    return 0

## NLP Use Example

In [6]:
sentence = 'Trump is a horrible president, Trump, Bernie Sanders are equally bad'

In [7]:
get_sentiment(sentence)

-0.7906

In [8]:
get_entities(sentence)

[Trump, Trump, Bernie Sanders]

In [49]:
extract_sentences('one', 'hello one who. There one')

['hello one who', ' There one']

In [60]:
extract_get_sentiment('trump', 'Trump is good. Trump sucks. Hello there')

0.039599999999999996

## Importing Data

In [10]:
#df = pd.read_csv("../data/final_data.csv")

## Data Preprocessing

In [12]:
body_df = df.drop(columns=['Url', 'Author', 'Date', 'Header', 'Source', 'n_links', 'Quality'])

In [13]:
body_df.head()

Unnamed: 0,Body,Bias
0,Abortion rights advocates have asked the U.S. ...,1.67
1,A federal appeals court rejected the most dire...,0.67
2,As part of the Trump administration's effort t...,-2.75
3,"President Donald Trump and ""the Trump of the T...",-4.33
4,"U.S Senator Elizabeth Warren, who is competing...",-10.0


In [123]:
def process_text(text):
    # 1. Lowercase text
    # 2. Removes punctuation
    # 3. Removes stopwords
    # 4. Lemmatizes remaining words

    text = text.lower()
    
    # removes punctuation
    #nopunc_digit = [char for char in text if char not in string.punctuation and not char.isdigit()]
    #nopunc_digit = ''.join(nopunc_digit)
    nopunc_digit = text
    
    # removes stopwords and lemmatizes remaining words
    wnl = WordNetLemmatizer()
    lemmatized = [wnl.lemmatize(word) for word in nopunc_digit.split() if not wnl.lemmatize(word) in set(stopwords.words('english'))]
    lemmatized = ' '.join(lemmatized)
    
    return lemmatized

In [124]:
body_df['cleaned_body'] = body_df['Body'].apply(lambda x: process_text(x))

## Saving processed df

In [17]:
#body_df.to_csv('../data/preprocess1.csv', index=False)

## Loading processed df

In [18]:
body_df = pd.read_csv('../data/preprocess1.csv')

## Extracting Entities

In [125]:
all_body = body_df['cleaned_body']

In [126]:
all_entities = []
for text in all_body:
    ents = get_entities(text)
    all_entities.append(ents)

In [127]:
flat_entities = [str(item) for sublist in all_entities for item in sublist]

In [128]:
ents_dict = Counter(flat_entities)
sorted_ents = {k: v for k, v in sorted(ents_dict.items(), reverse=True, key=lambda item: item[1])}

In [129]:
sorted_ents

{'one': 2298,
 'trump': 2104,
 'u.s.': 1606,
 'two': 1208,
 'cohen': 1179,
 'american': 1169,
 'first': 1109,
 'republican': 1039,
 'democrat': 965,
 'congress': 964,
 'tuesday': 954,
 'russian': 696,
 '2016': 608,
 'russia': 604,
 'white house': 601,
 'new york': 598,
 'donald trump': 583,
 'democratic': 535,
 'today': 515,
 'senate': 465,
 'three': 454,
 'washington': 453,
 'second': 448,
 'doe': 421,
 'america': 418,
 'last year': 407,
 'china': 398,
 'monday': 393,
 'fbi': 362,
 'house': 357,
 'united state': 355,
 '2020': 352,
 'north korea': 330,
 'wednesday': 329,
 'barr': 314,
 'last week': 306,
 '2018': 282,
 'california': 281,
 'kim': 263,
 '2017': 262,
 'muslim': 252,
 'new zealand': 252,
 'cnn': 235,
 'wa': 234,
 'mueller': 224,
 'united states': 219,
 'four': 218,
 'clinton': 214,
 'chinese': 211,
 'gop': 208,
 'florida': 205,
 'boeing': 205,
 'third': 201,
 'democrats': 200,
 'venezuela': 199,
 'thursday': 196,
 'omar': 191,
 'supreme court': 188,
 'michael cohen': 182,
 

In [130]:
unique_ents = list(sorted_ents.keys())

In [131]:
unique_ents[0:2]

['one', 'trump']

## Creating sentiment columns

In [132]:
for ent in unique_ents[0:50]:
    body_df[ent] = body_df['cleaned_body'].apply(lambda x: extract_get_sentiment(ent, x))

In [133]:
body_df.head()

Unnamed: 0,Body,Bias,cleaned_body,one,american,trump,two,republican,democrat,first,...,clinton,florida,u.s.,2016,2020,2018,2017,wa,mueller,united states
0,Abortion rights advocates have asked the U.S. ...,1.67,abortion right advocate asked u.s. supreme cou...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.401,0.0,0.0
1,A federal appeals court rejected the most dire...,0.67,federal appeal court rejected direct constitut...,0.171917,0.0,0.3592,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.56015,0.0,0.0,0.0,0.193687,0.196287,0.0
2,As part of the Trump administration's effort t...,-2.75,part trump administration's effort slow migran...,0.0,0.0,-0.224025,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,-0.20095,0.0,-0.195829,0.0,0.0
3,"President Donald Trump and ""the Trump of the T...",-4.33,"president donald trump ""the trump tropics,"" br...",0.5399,0.0,0.328425,0.2977,0.0,0.0,0.414833,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.385815,0.0,0.841
4,"U.S Senator Elizabeth Warren, who is competing...",-10.0,"u.s senator elizabeth warren, competing democr...",0.24794,0.5155,0.0,0.44895,0.0,0.37385,0.33525,...,0.0,0.0,0,0.0,0.4117,0.0,0.0,0.384791,0.0,0.0


## Saving processed df

In [72]:
#body_df.to_csv('../data/preprocess2.csv', index=False)

## Loading processed df

In [73]:
body_df = pd.read_csv('../data/preprocess2.csv')

## Preprocessing for Model

In [134]:
X = body_df.drop(columns=['Body', 'cleaned_body', 'Bias'])
y = body_df['Bias']

In [135]:
col_names = X.columns

scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)
X = pd.DataFrame(scaled, columns=col_names)

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Linear Regression

In [137]:
# Defining Model
regressor = LinearRegression()
# Training Model
regressor.fit(X_train, y_train)
# Making Predictions
y_pred = regressor.predict(X_test)
# Evaluating
print(stats.pearsonr(y_pred, y_test)[0])

0.006199810307246947


## Lasso Regression

In [138]:
# Defining Model
lasso = LassoCV(cv=5,
                alphas=np.logspace(-4,1,110),
                max_iter=10000,
                normalize=True)
# Training Model
lasso.fit(X_train, y_train)
# Number of features selected
coeffs = lasso.coef_!=0
coeff_used = np.sum(coeffs)
print('Number of coefficients: ', coeff_used)

# Fitting regressor
regressor.fit(X_train.loc[:, coeffs], y_train)
# Making predictions
y_pred = regressor.predict(X_test.loc[:, coeffs])
# Evaluating
score = stats.pearsonr(y_pred, y_test)[0]
print('Pearson coefficient: ', score)

Number of coefficients:  0


ValueError: Found array with 0 feature(s) (shape=(1256, 0)) while a minimum of 1 is required.

In [120]:
n = 336
print(body_df['Bias'][n])
extract_get_sentiment('left', body_df['Body'][n])

40.0


-0.12184285714285714

In [None]:
body_df[body_df['Bias'] == max(body_df['Bias'])]