## Importing Libraries

In [1]:
# Data Analysis and Processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean
from collections import Counter

# Natural Language Processing
### General
import nltk

import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
import string

### Sentiment Analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Part of Speech Tagging
import spacy
nlp = spacy.load('en_core_web_sm')

# Modelling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Evaluation
from scipy import stats

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:777)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:777)>
[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:777)>


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

## Function Definitions

In [None]:
def get_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    The ``SentimentIntensityAnalyzer`` is created on the first call and then
    reused. This function is invoked once per sentence by
    ``extract_get_sentiment``, so building a new analyzer on every call is
    pure repeated work.

    Args:
        text: The string to score.

    Returns:
        The ``'compound'`` entry of ``polarity_scores(text)``.
    """
    analyzer = getattr(get_sentiment, '_analyzer', None)
    if analyzer is None:
        # Cache on the function object so every later call shares one analyzer.
        analyzer = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = analyzer

    return analyzer.polarity_scores(text)['compound']

In [None]:
def get_entities(text):
    """Run the module-level spaCy pipeline ``nlp`` on ``text`` and return the
    entities it finds (``doc.ents``) as a list."""
    return list(nlp(text).ents)

In [None]:
def extract_sentences(word, text):
    """Return the '.'-delimited pieces of ``text`` that contain ``word``.

    Matching is a plain, case-sensitive substring test. Pieces are returned
    exactly as produced by ``text.split('.')``, so leading or trailing
    whitespace is kept.
    """
    matches = []
    for piece in text.split('.'):
        if word in piece:
            matches.append(piece)
    return matches

In [None]:
def extract_get_sentiment(word, text):
    """Return the mean sentiment of the sentences in ``text`` that mention ``word``.

    Both arguments are lowercased first, so matching is case-insensitive.
    Sentences are found with ``extract_sentences`` and each is scored with
    ``get_sentiment``.

    Args:
        word: Substring to look for.
        text: Text to search; sentences are split on '.'.

    Returns:
        The mean of the matching sentences' scores, or 0 if no sentence
        contains ``word``.
    """
    word = word.lower()
    text = text.lower()

    sentiments = [get_sentiment(sentence) for sentence in extract_sentences(word, text)]

    # A single matching sentence is a valid sample. The previous `> 1` check
    # discarded it and reported 0. Only fall back to 0 when nothing matched.
    if sentiments:
        return mean(sentiments)
    return 0

## NLP Use Example

In [None]:
# Example sentence fed to the helper functions in the demo cells below.
sentence = 'Trump is a horrible president, Trump, Bernie Sanders are equally bad'

In [None]:
# VADER compound score for the example sentence.
get_sentiment(sentence)

In [None]:
# Entities the spaCy pipeline finds in the example sentence.
get_entities(sentence)

In [None]:
# The text is split on '.' only, so this yields ['hello one who', ' There one'].
extract_sentences('one', 'hello one who. There one')

In [None]:
# Aggregate sentiment over the sentences that mention 'trump'
# (both word and text are lowercased before matching).
extract_get_sentiment('trump', 'Trump is good. Trump sucks. Hello there')

## Importing Data

In [None]:
# Load the raw data. The preprocessing cell below reads `df`, so this load
# must run for the notebook to survive Restart & Run All.
df = pd.read_csv("../data/final_data.csv")

## Data Preprocessing

In [None]:
# Drop the columns that are not needed further on; all other columns are kept.
body_df = df.drop(['Url', 'Author', 'Date', 'Header', 'Source', 'n_links', 'Quality'], axis=1)

In [None]:
# Preview the first rows of the reduced frame.
body_df.head()

In [None]:
def process_text(text):
    """Lowercase ``text``, lemmatize its tokens and drop stopwords.

    Steps:
      1. Lowercase the text.
      2. Split on whitespace and lemmatize each token with WordNet.
      3. Drop tokens whose lemma is an English stopword.
      4. Join the remaining lemmas with single spaces.

    Punctuation and digits are NOT removed; that step was disabled in the
    original code.
    NOTE(review): presumably because ``extract_sentences`` splits on '.';
    confirm before re-enabling punctuation removal.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text as a single space-joined string.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))

    kept = []
    for token in text.lower().split():
        # Lemmatize once and reuse the result for both the filter and the output.
        lemma = lemmatizer.lemmatize(token)
        if lemma not in stop_words:
            kept.append(lemma)

    return ' '.join(kept)

In [None]:
# Clean every body text; the function is passed directly, no lambda wrapper needed.
body_df['cleaned_body'] = body_df['Body'].apply(process_text)

## Saving processed df

In [None]:
#body_df.to_csv('../data/preprocessed_cleaned_body.csv', index=False)

## Loading processed df

In [None]:
# Reload the cached CSV; this replaces whatever `body_df` is currently in memory.
body_df = pd.read_csv('../data/preprocessed_cleaned_body.csv')

## Extracting Entities

In [None]:
# Column of cleaned body texts that entity extraction iterates over.
all_body = body_df['cleaned_body']

In [None]:
# One list of entities per cleaned body text, in the same order as `all_body`.
all_entities = [get_entities(body) for body in all_body]

In [None]:
# Flatten the per-text entity lists into a single list of strings.
flat_entities = []
for doc_entities in all_entities:
    flat_entities.extend(str(entity) for entity in doc_entities)

In [None]:
# Count each entity string, then keep the counts in a dict ordered from most
# to least frequent (ties stay in first-seen order).
ents_dict = Counter(flat_entities)
sorted_ents = dict(ents_dict.most_common())

In [None]:
# Show every entity with its count, most frequent first.
sorted_ents

In [None]:
# Entity strings only, still ordered by descending frequency.
unique_ents = list(sorted_ents)

In [None]:
# The two most frequent entities.
unique_ents[:2]

## Creating sentiment columns

In [None]:
# Add one sentiment column per entity for the 50 most frequent entities; each
# value is the aggregate sentiment of the sentences in that row's cleaned body
# that mention the entity.
for entity_name in unique_ents[:50]:
    cleaned_bodies = body_df['cleaned_body']
    body_df[entity_name] = cleaned_bodies.apply(
        lambda body: extract_get_sentiment(entity_name, body)
    )

In [None]:
# Preview the frame with the per-entity sentiment columns added.
body_df.head()

## Saving processed df

In [None]:
#body_df.to_csv('../data/preprocessed_entity_sentiments.csv', index=False)

## Loading processed df

In [None]:
# Reload the cached CSV; this replaces whatever `body_df` is currently in memory.
# NOTE(review): presumably the file written by the commented-out to_csv above; confirm it is up to date.
body_df = pd.read_csv('../data/preprocessed_entity_sentiments.csv')

## Preprocessing for Model

In [None]:
# Target is the 'Bias' column; features are every column except the two text
# columns and the target itself.
y = body_df['Bias']
X = body_df.drop(['Body', 'cleaned_body', 'Bias'], axis=1)

In [None]:
col_names = X.columns

# Split BEFORE scaling so the scaler never sees the held-out rows. Fitting it
# on the full data leaks test-set min/max values into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Learn the min-max ranges from the training split only, then apply the same
# transform to both splits. The original index is kept so rows stay aligned
# with y_train / y_test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=col_names, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=col_names, index=X_test.index)

## Linear Regression

In [None]:
# Defining Model
regressor = LinearRegression()
# Training Model
regressor.fit(X_train, y_train)
# Making Predictions
y_pred = regressor.predict(X_test)
# Evaluating
print(stats.pearsonr(y_pred, y_test)[0])

## Lasso Regression

In [None]:
# Defining Model
lasso = LassoCV(cv=5,
                alphas=np.logspace(-4,1,110),
                max_iter=10000,
                normalize=True)
# Training Model
lasso.fit(X_train, y_train)
# Number of features selected
coeffs = lasso.coef_!=0
coeff_used = np.sum(coeffs)
print('Number of coefficients: ', coeff_used)

# Fitting regressor
regressor.fit(X_train.loc[:, coeffs], y_train)
# Making predictions
y_pred = regressor.predict(X_test.loc[:, coeffs])
# Evaluating
score = stats.pearsonr(y_pred, y_test)[0]
print('Pearson coefficient: ', score)

In [None]:
# Spot-check one row: print its Bias value, then show the aggregate sentiment
# of the sentences in its raw Body that contain 'left'.
n = 336  # index label of the row picked for manual inspection
print(body_df['Bias'][n])
extract_get_sentiment('left', body_df['Body'][n])

In [None]:
# Rows with the highest Bias value. Series.max() skips NaN, whereas the builtin
# max() over a Series can return NaN depending on element order.
body_df[body_df['Bias'] == body_df['Bias'].max()]