### NLTK sentiment analysis using nlp utils

In [38]:
# This file contains functions that helps with NLP processing                             
from statistics import mean

# Natural Language Processing                                                             
### General                                                                               
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/monicabellare/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/monicabellare/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
### Sentiment Analysis                                                                    
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Part of Speech Tagging                                                                
# import spacy                                                                            
# nlp = spacy.load('en_core_web_sm')  

def get_sentiment(text):
    # gets the compound score of the sentiment using the VADER lexicon                    
    sid = SentimentIntensityAnalyzer()

    results = sid.polarity_scores(text)
    sentiment = results['compound']
    return sentiment

def get_entities(text):
    # gets the entities from the sentence and returns a list of them                      
    doc = nlp(text)
    return list(doc.ents)

def extract_sentences(word, text):
    # extract all sentences in text in which word appears                                 
    sentences = [sentence for sentence in text.split('.') if word in sentence]
    return sentences

def extract_get_sentiment(word, text):
    # returns aggregate of sentiment for all sentences that contains word in text         
    text = text.lower()
    word = word.lower()

    sentiments = [get_sentiment(sentence) for sentence in extract_sentences(word, text)]

    if len(sentiments) > 1:
        return mean(sentiments)
    return 0

def process_text(text):
    # 1. Lowercase text                                                                   
    # 2. Removes punctuation                                                              
    # 3. Removes digits                                                                   
    # 4. Removes stopwords                                                                
    # 5. Lemmatizes remaining words                                                       

    text = text.lower()

    nopunc_digit = [char for char in text if char not in string.punctuation and not char.isdigit()]
    nopunc_digit = ''.join(nopunc_digit)

    wnl = WordNetLemmatizer()
    lemmatized = [wnl.lemmatize(word) for word in nopunc_digit.split() if not wnl.lemmatize(word) in set(stopwords.words('english'))]
    lemmatized = ' '.join(lemmatized)

    return lemmatized

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/monicabellare/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [41]:
df = pd.read_csv("../data/final_data.csv")

In [42]:
df.head()

Unnamed: 0,Url,Author,Date,Header,Body,n_links,Source,Bias,Quality
0,https://abcnews.go.com/Politics/abortion-right...,Devin Dwyer,"Wed, 17 Apr 2019 10:14:00 GMT",Abortion rights group asks Supreme Court to st...,Abortion rights advocates have asked the U.S. ...,3.0,ABC,1.67,49.0
1,https://abcnews.go.com/Politics/appeals-court-...,Ali Dukakis,"Tue, 26 Feb 2019 09:05:00 GMT",Appeals court says special counsel Robert Muel...,A federal appeals court rejected the most dire...,2.0,ABC,0.67,51.67
2,https://abcnews.go.com/Politics/attorney-gener...,Luke Barr,"Wed, 17 Apr 2019 14:02:00 GMT",Attorney general orders some asylum seekers to...,As part of the Trump administration's effort t...,6.0,ABC,-2.75,43.5
3,https://abcnews.go.com/Politics/donald-trump-t...,Meridith McGraw,"Tue, 19 Mar 2019 12:44:00 GMT","Donald Trump and 'the Trump of the Tropics,' B...","President Donald Trump and ""the Trump of the T...",10.0,ABC,-4.33,52.67
4,https://abcnews.go.com/Politics/electoral-coll...,Matthew Dowd,"Tue, 19 Mar 2019 21:39:00 GMT",The Electoral College limits the campaign play...,"U.S Senator Elizabeth Warren, who is competing...",5.0,ABC,-10.0,32.0


In [43]:
x = df.drop(columns=['Url','Author','Date','Body','n_links','Source','Bias','Quality'])

In [44]:
print(x)

                                                 Header
0     Abortion rights group asks Supreme Court to st...
1     Appeals court says special counsel Robert Muel...
2     Attorney general orders some asylum seekers to...
3     Donald Trump and 'the Trump of the Tropics,' B...
4     The Electoral College limits the campaign play...
...                                                 ...
1670  12 French churches attacked before Notre Dame ...
1671  DOJ sued for details of payments to Christophe...
1672  Fox News stars pull plug on history of church ...
1673    Major U.S. bank shuts down 'alt-right' accounts
1674    Schiff launches next front in war against Trump

[1675 rows x 1 columns]


In [45]:
header_sent = []

In [46]:
print(header_sent)

[]


In [47]:
for i in range(0,1674):
    header_sent.append(get_sentiment(str(x.loc[[i]])))

In [48]:
print(header_sent)

[0.5574, 0.4019, 0.0, 0.0, 0.0, 0.0258, 0.0, 0.0, 0.0, 0.0, 0.0, -0.4404, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2263, 0.0, -0.5423, 0.0, 0.0772, 0.0, -0.4404, -0.6249, 0.0, -0.4404, 0.4588, 0.0, 0.2732, -0.1531, -0.4939, 0.0, 0.0, 0.0, -0.4215, 0.0, 0.0, 0.0, 0.2023, 0.0, 0.0, 0.2023, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3612, -0.2732, 0.0, 0.0, -0.4215, 0.0, -0.5106, 0.0, -0.4767, 0.0, 0.0516, 0.0, -0.2263, 0.0, 0.128, -0.6124, 0.0, 0.5267, 0.2023, -0.5106, -0.2023, 0.0, 0.2263, 0.0, -0.4939, 0.0, 0.0, 0.0, -0.4019, 0.0, 0.0, 0.0, 0.0, 0.5106, 0.0, 0.0, -0.6249, -0.3818, 0.0, 0.0, 0.0, -0.7096, -0.5106, 0.5574, 0.4019, 0.0, -0.5994, 0.4939, 0.0, 0.0, 0.3612, -0.2732, 0.0, -0.4215, 0.0, -0.5106, 0.0, -0.4767, 0.0, 0.0516, 0.0, -0.2263, 0.0, 0.128, -0.6124, 0.0, 0.5267, 0.2023, -0.5106, -0.2023, 0.0, 0.2263, 0.0, -0.4939, 0.0, 0.0, 0.0, -0.4019, 0.0, 0.0, 0.0, 0.0, 0.5106, 0.0, 0.0, -0.6249, 0.0, 0.0, 0.0, 0.0, -0.7096, -0.5106, 0.5574, 0.4019, 0.6369, -0.5994, 0.4939, 0.0, 0.0, -0.5423, 0.4404, 0.0, 

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold

from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.metrics import f1_score, classification_report

In [62]:
dfnew = df.rename(columns={'Bias': 'Inital_Bias'})

In [68]:
print(list(header_sent))

[0.5574, 0.4019, 0.0, 0.0, 0.0, 0.0258, 0.0, 0.0, 0.0, 0.0, 0.0, -0.4404, 0.0, 0.0, 0.0, 0.0, 0.0, -0.2263, 0.0, -0.5423, 0.0, 0.0772, 0.0, -0.4404, -0.6249, 0.0, -0.4404, 0.4588, 0.0, 0.2732, -0.1531, -0.4939, 0.0, 0.0, 0.0, -0.4215, 0.0, 0.0, 0.0, 0.2023, 0.0, 0.0, 0.2023, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3612, -0.2732, 0.0, 0.0, -0.4215, 0.0, -0.5106, 0.0, -0.4767, 0.0, 0.0516, 0.0, -0.2263, 0.0, 0.128, -0.6124, 0.0, 0.5267, 0.2023, -0.5106, -0.2023, 0.0, 0.2263, 0.0, -0.4939, 0.0, 0.0, 0.0, -0.4019, 0.0, 0.0, 0.0, 0.0, 0.5106, 0.0, 0.0, -0.6249, -0.3818, 0.0, 0.0, 0.0, -0.7096, -0.5106, 0.5574, 0.4019, 0.0, -0.5994, 0.4939, 0.0, 0.0, 0.3612, -0.2732, 0.0, -0.4215, 0.0, -0.5106, 0.0, -0.4767, 0.0, 0.0516, 0.0, -0.2263, 0.0, 0.128, -0.6124, 0.0, 0.5267, 0.2023, -0.5106, -0.2023, 0.0, 0.2263, 0.0, -0.4939, 0.0, 0.0, 0.0, -0.4019, 0.0, 0.0, 0.0, 0.0, 0.5106, 0.0, 0.0, -0.6249, 0.0, 0.0, 0.0, 0.0, -0.7096, -0.5106, 0.5574, 0.4019, 0.6369, -0.5994, 0.4939, 0.0, 0.0, -0.5423, 0.4404, 0.0, 

In [101]:
for i in range(0,1674):
    dfnew['Header_Sent'][i] = header_sent[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [102]:
dfnew['Header_Sent'][1674] = 0.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [117]:
# 1 represents most left, 3 represents most right
bins = [-1, -0.5, 0.5, 1]
names = ['1', '2', '3']

multi_df = dfnew.loc[:]
multi_df['Category'] = pd.cut(multi_df['Header_Sent'], bins, labels=names)

In [118]:
tfidf_transformer = TfidfVectorizer(max_features = 800)
tfidf = tfidf_transformer.fit_transform(multi_df['Header'])

X = pd.DataFrame(tfidf.toarray(), columns=tfidf_transformer.get_feature_names())
y = multi_df['Category']

In [119]:
col_names = X.columns

scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)
X = pd.DataFrame(scaled, columns=col_names)

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

# Naive Bayes

In [124]:
# Defining Model
mnb = MultinomialNB()
# Training Model
mnb.fit(X_train, y_train)
# Making Predictions
y_pred = mnb.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

Accuracy Score:  0.8448687350835322


In [125]:
y_probs = mnb.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.6528944911297853


# Logistic Regression

In [126]:
# Defining Model
regressor = LogisticRegression()
# Training Model
regressor.fit(X_train, y_train)
# Making Predictions
y_pred2 = regressor.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred2))

Accuracy Score:  0.8568019093078759


In [127]:
y_probs = regressor.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.6947742865262047


# SVM

In [128]:
# Defining Model
clf = SVC(probability=True)
# Training Model
clf.fit(X_train, y_train)
# Making Predictions
y_pred3 = clf.predict(X_test)
# Evaluating
print("Accuracy Score: ", accuracy_score(y_test, y_pred3))

Accuracy Score:  0.8520286396181385


In [129]:
y_probs = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_probs, average="macro", multi_class="ovo")
print('ROC Score is: ', roc_auc)

ROC Score is:  0.7005750669833151


# Ensemble

In [130]:
results = pd.DataFrame({'pred1': y_pred,
                        'pred2': y_pred2,
                        'pred3': y_pred3})

In [131]:
results['final'] = results.mode(axis=1)[0]
results.head()

Unnamed: 0,pred1,pred2,pred3,final
0,2,2,2,2
1,2,2,2,2
2,2,2,2,2
3,2,2,2,2
4,2,2,2,2


In [132]:
final_pred = results['final']
print("Accuracy Score: ", accuracy_score(y_test, final_pred))

Accuracy Score:  0.8544152744630071


In [133]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.54      0.15      0.24        46
           2       0.87      0.96      0.91       357
           3       0.30      0.19      0.23        16

    accuracy                           0.84       419
   macro avg       0.57      0.43      0.46       419
weighted avg       0.81      0.84      0.81       419



In [134]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           1       0.60      0.13      0.21        46
           2       0.87      0.98      0.92       357
           3       0.50      0.25      0.33        16

    accuracy                           0.86       419
   macro avg       0.66      0.45      0.49       419
weighted avg       0.83      0.86      0.82       419

