## Importing Libraries

In [1]:
# Data Analysis and Processing
import numpy as np
import pandas as pd
from collections import Counter

# Natural Language Processing
### General
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
import string

### Sentiment Analysis
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Part of Speech Tagging
import spacy
nlp = spacy.load('en_core_web_sm')

# Modelling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Evaluation
from scipy import stats

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/chingyiie/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Function Definitions

In [2]:
def get_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The string to score.

    Returns:
        The ``'compound'`` entry of the dictionary produced by
        ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    # This function is applied once per row and per entity, so the analyzer is
    # built on first use and then reused instead of being rebuilt on every call.
    analyzer = getattr(get_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = analyzer

    return analyzer.polarity_scores(text)['compound']

In [27]:
def get_entities(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and return its entities.

    Returns:
        A new list holding every item of the processed document's ``ents``,
        in the order the pipeline reports them.
    """
    return [entity for entity in nlp(text).ents]

In [40]:
def extract_sentences(word, text):
    """Return the pieces of ``text`` that contain ``word``.

    ``text`` is split on '.' only, and a piece is kept when ``word`` occurs in
    it as a case-sensitive substring (so 'one' also matches 'money'). Pieces
    are returned in order and keep their surrounding whitespace.
    """
    return [piece for piece in text.split('.') if word in piece]

In [50]:
def extract_get_sentiment(word, text):
    """Score the sentiment of the parts of ``text`` that mention ``word``.

    Both arguments are lowercased, the matching sentences are selected with
    ``extract_sentences``, re-joined with '.', and scored in a single
    ``get_sentiment`` call.

    Returns:
        Whatever ``get_sentiment`` returns for the re-joined sentences.
    """
    # NOTE(review): scoring happens on the lowercased text; if the scorer is
    # sensitive to capitalisation this may change the result - confirm.
    haystack = text.lower()
    needle = word.lower()

    matching = extract_sentences(needle, haystack)
    return get_sentiment('.'.join(matching))

## NLP Use Example

In [4]:
# Example input: one name repeated plus a second, two-word name
sentence = 'Trump is a horrible president, Trump, Bernie Sanders are equally bad'

In [5]:
get_sentiment(sentence)

-0.7906

In [37]:
get_entities(sentence)

[Trump, Trump, Bernie Sanders]

In [39]:
extract_sentences('one', 'hello one who. There one')

['hello one who', ' There one']

## Importing Data

In [9]:
# Load the raw article data; the preprocessing cell below reads `df`, so it
# must be defined for the notebook to run top to bottom on a fresh kernel.
df = pd.read_csv("../data/final_data.csv")

## Data Preprocessing

In [10]:
# Keep everything except these metadata columns
body_df = df.drop(
    ['Url', 'Author', 'Date', 'Header', 'Source', 'n_links', 'Quality'],
    axis='columns',
)

In [11]:
body_df.head()

Unnamed: 0,Body,Bias
0,Abortion rights advocates have asked the U.S. ...,1.67
1,A federal appeals court rejected the most dire...,0.67
2,As part of the Trump administration's effort t...,-2.75
3,"President Donald Trump and ""the Trump of the T...",-4.33
4,"U.S Senator Elizabeth Warren, who is competing...",-10.0


In [12]:
def process_text(text):
    """Normalise a raw text string.

    Steps:
        1. Lowercase the text.
        2. Remove punctuation and digit characters.
        3. Lemmatize each whitespace-separated token.
        4. Discard tokens whose lemma is an English stopword.

    Args:
        text: The string to clean.

    Returns:
        The kept lemmas joined by single spaces.
    """
    text = text.lower()

    # Characters are deleted rather than replaced by a space, so "dog's"
    # becomes "dogs" and "3rd" becomes "rd".
    dropped = set(string.punctuation)
    cleaned = ''.join(ch for ch in text if ch not in dropped and not ch.isdigit())

    # Build the stopword set once per call instead of once per token, and
    # lemmatize each token a single time.
    stop_words = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    lemmas = (wnl.lemmatize(token) for token in cleaned.split())

    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [13]:
# Clean every article body and store the result next to the raw text
body_df['cleaned_body'] = body_df['Body'].apply(process_text)

## Saving processed df (cleaned text)

In [14]:
# Optional checkpoint: uncomment to write the preprocessed frame to preprocess1.csv
#body_df.to_csv('preprocess1.csv')

## Loading processed df

In [7]:
# Rebind body_df to the copy stored in preprocess1.csv
body_df = pd.read_csv('preprocess1.csv')

## Extracting Entities

In [8]:
# Cleaned article texts, used as the input for entity extraction
all_body = body_df['cleaned_body']

In [9]:
# One list of entities per article, in the same order as all_body
all_entities = [get_entities(article) for article in all_body]

In [10]:
# Flatten the per-article lists into one list, storing each entity as a plain string
flat_entities = [
    str(entity)
    for article_entities in all_entities
    for entity in article_entities
]

In [11]:
# Count how often each entity string occurs, then order from most to least frequent
# (entities with equal counts keep their first-seen order).
ents_dict = Counter(flat_entities)
sorted_ents = dict(ents_dict.most_common())

In [12]:
sorted_ents

{'one': 2111,
 'american': 1256,
 'trump': 1182,
 'two': 1158,
 'republican': 1127,
 'democrat': 1093,
 'first': 1006,
 'cohen': 965,
 'tuesday': 951,
 'congress': 925,
 'russian': 708,
 'new york': 634,
 'white house': 587,
 'russia': 584,
 'donald trump': 553,
 'democratic': 528,
 'today': 503,
 'united state': 491,
 'senate': 458,
 'million': 440,
 'doe': 436,
 'last year': 435,
 'three': 422,
 'washington': 419,
 'america': 413,
 'second': 400,
 'monday': 398,
 'fbi': 383,
 'china': 382,
 'wednesday': 343,
 'north korea': 330,
 'year': 323,
 'last week': 316,
 'house': 275,
 'muslim': 262,
 'february': 258,
 'new zealand': 257,
 'california': 255,
 'april': 254,
 'kim': 233,
 'u': 225,
 'cnn': 216,
 'barr': 215,
 'four': 205,
 'thursday': 204,
 'gop': 204,
 'march': 203,
 'chinese': 202,
 'clinton': 201,
 'florida': 195,
 'fox news': 188,
 'january': 187,
 'venezuela': 182,
 'michael cohen': 182,
 'israel': 169,
 'moscow': 163,
 'hillary clinton': 162,
 'yemen': 162,
 'iran': 162,


In [13]:
# Entity strings only, still ordered from most to least frequent
unique_ents = list(sorted_ents)

In [17]:
unique_ents[0:2]

['one', 'american']

## Creating sentiment columns

In [None]:
# Add one sentiment column per entity for the 50 most frequent entities.
# NOTE(review): the entities were extracted from `cleaned_body` but are searched
# for in the raw `Body` text, so lemmatized forms such as 'united state' only
# match as substrings - confirm this is intended.
# NOTE(review): rows that never mention the entity are scored on an empty
# string; presumably that yields a neutral score - verify.
for ent in unique_ents[:50]:
    body_df[ent] = body_df['Body'].apply(
        lambda body, ent=ent: extract_get_sentiment(ent, body)
    )

In [None]:
body_df.head()

## Saving processed df (with sentiment columns)

In [None]:
# Checkpoint the frame, including the new sentiment columns, to preprocess2.csv.
# NOTE(review): to_csv writes the index by default, so reloading adds an
# unnamed column - confirm whether index=False is wanted.
body_df.to_csv('preprocess2.csv')

## Loading processed df

## Regression Model