# Sentiment Analysis 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Custom Code imports
# NOTE(review): the project's Code directory is prepended to sys.path through a
# machine-specific absolute path; the SentimentAnalysisUtils import below only
# resolves where that directory exists (or the module is otherwise importable).
import sys
sys.path.insert(0, '/Users/briankalinowski/PycharmProjects/FakeNewsChallenge/Code')
import SentimentAnalysisUtils as sentUtils

## Fake & Real News Data 

In [2]:
# Load the news articles CSV and preview the first rows
news_csv_path = '/Users/briankalinowski/Desktop/Data/news_content_lemma.csv'
news_df = pd.read_csv(news_csv_path)
news_df.head()

Unnamed: 0,title,text,tokenized_headline,tokenized_content,type,valid_score
0,Muslims BUSTED They Stole Millions In Govt Ben...,Print They should pay all the back all the mon...,muslims bust steal millions in govt benefit,print should pay all the back all the money pl...,bias,0
1,Re Why Did Attorney General Loretta Lynch Plea...,Why Did Attorney General Loretta Lynch Plead T...,re why do attorney general loretta lynch plead...,why do attorney general loretta lynch plead th...,bias,0
2,BREAKING Weiner Cooperating With FBI On Hillar...,Red State Fox News Sunday reported this mornin...,break weiner cooperate with fbi on hillary ema...,red state fox news sunday report this morning ...,bias,0
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,pin drop speech by father of daughter kidnappe...,email kayla mueller be a prisoner and torture ...,bias,0
4,FANTASTIC! TRUMPS 7 POINT PLAN To Reform Healt...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,fantastic trump 7 point plan to reform healthc...,email healthcare reform to make america great ...,bias,0


## Perform Sentiment Scoring

- `SentimentIntensityAnalyzer()` returns a dict of sentiment scores for each article.


- Sentiment Scoring features are: `neg`, `neu`, `pos`, `compound`


- The `sentiment_score` feature is derived from the `compound` score, a metric computed by summing all of the lexicon ratings and normalizing the result to lie between -1 (most extreme negative) and +1 (most extreme positive).


    - Positive sentiment: (sentiment_score = 1), (compound score >= 0.05)
    - Neutral sentiment: (sentiment_score = 0), (compound score > -0.05) and (compound score < 0.05)
    - Negative sentiment: (sentiment_score = -1), (compound score <= -0.05)

In [3]:
# Score the `tokenized_content` text of every article with the VADER helper
news_sentiment_df = sentUtils.get_sentiment_vader_scores(news_df, 'tokenized_content')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/briankalinowski/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Keep only the sentiment scoring features together with the `valid_score` label
sentiment_columns = ['sentiment_score', 'neg', 'neu', 'pos', 'compound', 'valid_score']
news_sentiment_df = news_sentiment_df[sentiment_columns]
news_sentiment_df.head()

Unnamed: 0,sentiment_score,neg,neu,pos,compound,valid_score
0,-1,0.123,0.764,0.113,-0.2263,0
1,-1,0.071,0.874,0.055,-0.7533,0
2,1,0.017,0.9,0.083,0.9041,0
3,1,0.253,0.472,0.275,0.095,0
4,1,0.08,0.765,0.154,0.9799,0


## Random Forest 1: Just Sentiment Scoring Features

In [5]:
# Train/Test Split: half of the rows are held out, with a fixed random_state
sentiment_features = news_sentiment_df.drop(columns=['valid_score'])
sentiment_labels = news_sentiment_df['valid_score']
x_train, x_test, y_train, y_test = train_test_split(
    sentiment_features,
    sentiment_labels,
    test_size=0.5,
    random_state=21,
)

In [6]:
# Hyper-parameter grid searched for the sentiment-only random forest
rf_params = {
    'n_estimators': [100, 200, 300],
    'min_samples_split': [2, 4, 8, 10, 12, 15],
}

# The Random Forest 2 section of this notebook unpacks TWO return values
# (predictions, class probabilities) from this same helper. Binding the result
# to a single name here would leave `rf_sent_predict` holding a tuple on a
# fresh Restart & Run All, which breaks the report cells that follow.
# NOTE(review): confirm against the current run_random_forest_grid_search signature.
rf_sent_predict, rf_sent_probs = sentUtils.run_random_forest_grid_search(x_train, y_train, rf_params, x_test)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done  85 out of  90 | elapsed:   49.9s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   54.5s finished


Best Training Parameters: {'min_samples_split': 10, 'n_estimators': 300}
Best Training Score: 0.7921146953405018


In [7]:
# Per-class precision / recall / F1 of the sentiment-only forest on the test split
sentUtils.format_classification_report(y_test, rf_sent_predict)

Unnamed: 0,precision,recall,f1-score,support
fake,0.817213,0.706847,0.758034,5857.0
real,0.786655,0.872399,0.827311,7257.0
accuracy,0.79846,0.79846,0.79846,0.79846
macro avg,0.801934,0.789623,0.792672,13114.0
weighted avg,0.800303,0.79846,0.79637,13114.0


In [8]:
# Confusion matrix (true vs. predicted fake/real) for the sentiment-only forest
sentUtils.format_confusion_matrix(y_test, rf_sent_predict)

Unnamed: 0,Predict_Fake,Predict_Real,True_Totals
True_Fake,4140,1717,5857
True_Real,926,6331,7257


## Random Forest 2: Sentiment and LDA 

- First, we create a word-count matrix and then apply the LDA transformation to it. This gives us, for each document, the probability of belonging to each of the LDA topics.



- Next we will combine the LDA topic probabilities with the sentiment scoring data from the previous model. 

In [5]:
# Build the word-count matrix from the tokenized article text
vectorized_tokens = sentUtils.get_count_vectorizer_matrix(news_df, 'tokenized_content')

# Transform the counts into the LDA topic DataFrame
news_lda_topics = sentUtils.get_lda_transformed_topics(vectorized_tokens)

# Place the topic columns side by side with the sentiment scoring columns
feature_frames = [news_lda_topics, news_sentiment_df]
news_sentiment_lda_df = pd.concat(feature_frames, axis='columns')
news_sentiment_lda_df.head()

Count Vectorizer Shape: (26227, 27906) 

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,sentiment_score,neg,neu,pos,compound,valid_score
0,0.002704,0.002704,0.002704,0.002703,0.002703,0.002703,0.57111,0.002703,0.002704,0.407262,-1,0.123,0.764,0.113,-0.2263,0
1,0.000633,0.000633,0.000633,0.653693,0.174755,0.000633,0.071526,0.096227,0.000633,0.000633,-1,0.071,0.874,0.055,-0.7533,0
2,0.067823,0.000991,0.00099,0.806241,0.030974,0.00099,0.089019,0.00099,0.00099,0.00099,1,0.017,0.9,0.083,0.9041,0
3,0.662286,0.054681,0.004002,0.065395,0.004,0.004,0.143301,0.054334,0.004,0.004001,1,0.253,0.472,0.275,0.095,0
4,0.000578,0.000578,0.000578,0.000579,0.000578,0.000578,0.177604,0.183452,0.000578,0.634896,1,0.08,0.765,0.154,0.9799,0


In [6]:
# Train/Test Split: same 50% hold-out and random_state as the sentiment-only model
lda_sentiment_features = news_sentiment_lda_df.drop(columns=['valid_score'])
lda_sentiment_labels = news_sentiment_lda_df['valid_score']
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    lda_sentiment_features,
    lda_sentiment_labels,
    test_size=0.5,
    random_state=21,
)

In [7]:
# Hyper-parameter grid for the sentiment + LDA random forest
rf_params = dict(
    n_estimators=[100, 200, 300, 400],
    min_samples_split=[2, 4, 8, 10, 12, 15],
)

# Run the grid-search helper; it hands back the test-set predictions and a
# second array that is later saved as the fake/real probability columns.
rf_lda_predict, rf_lda_probs = sentUtils.run_random_forest_grid_search(
    x_train_2, y_train_2, rf_params, x_test_2
)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 118 out of 120 | elapsed:  2.7min remaining:    2.7s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.8min finished


Best Training Parameters: {'min_samples_split': 4, 'n_estimators': 300}
Best Training Score: 0.897963852665294


In [15]:
# Confusion matrix (true vs. predicted fake/real) for the sentiment + LDA forest
sentUtils.format_confusion_matrix(y_test_2, rf_lda_predict)

Unnamed: 0,Predict_Fake,Predict_Real,True_Totals
True_Fake,4934,923,5857
True_Real,353,6904,7257


In [16]:
# Per-class precision / recall / F1 of the sentiment + LDA forest on the test split
sentUtils.format_classification_report(y_test_2, rf_lda_predict)

Unnamed: 0,precision,recall,f1-score,support
fake,0.933232,0.842411,0.885499,5857.0
real,0.882075,0.951357,0.915407,7257.0
accuracy,0.902699,0.902699,0.902699,0.902699
macro avg,0.907654,0.896884,0.900453,13114.0
weighted avg,0.904923,0.902699,0.902049,13114.0


## Save RF2 Results 

In [31]:
# Assemble the RF2 test-set results on a COPY of the feature matrix.
# Assigning `x_test_2` directly only aliases it, so the columns added below
# would also be added to `x_test_2` itself (polluting the features for any
# re-run and potentially raising SettingWithCopyWarning).
sentiment_rf_test = x_test_2.copy()
sentiment_rf_test['type'] = news_df.reindex(x_test_2.index)['type']
sentiment_rf_test['valid_score'] = y_test_2
sentiment_rf_test['valid_prediction'] = rf_lda_predict

# Drop the shuffled original index so rows align positionally with the
# freshly built probability frame concatenated below.
sentiment_rf_test = sentiment_rf_test.reset_index(drop=True)

rf_probs_df = pd.DataFrame(rf_lda_probs, columns=['fake_prob', 'real_prob'])
sentiment_rf_test = pd.concat([sentiment_rf_test, rf_probs_df], axis=1)

# Absolute gap between the two class probabilities, computed column-wise
# rather than with a row-by-row apply.
sentiment_rf_test['score_abs'] = (sentiment_rf_test['fake_prob'] - sentiment_rf_test['real_prob']).abs()

In [32]:
# Preview the assembled results: features, labels, predictions and probabilities
sentiment_rf_test.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,neg,neu,pos,compound,valid_score,valid_prediction,type,fake_prob,real_prob,score_abs
0,0.079851,0.000215,0.047413,0.000215,0.000215,0.000215,0.012786,0.000215,0.613935,0.244941,...,0.083,0.761,0.156,0.9948,0,1,bs,0.498889,0.501111,0.002222
1,0.195934,0.000249,0.090281,0.148315,0.006419,0.143316,0.351773,0.000249,0.063216,0.000249,...,0.168,0.751,0.081,-0.9978,1,1,National Review,0.080611,0.919389,0.838778
2,0.028693,0.000244,0.016483,0.000244,0.103363,0.000244,0.089566,0.204958,0.013935,0.542269,...,0.061,0.828,0.11,0.988,1,1,National Review,0.031333,0.968667,0.937333
3,0.002632,0.002632,0.002632,0.002632,0.06661,0.002632,0.002632,0.002632,0.002633,0.912334,...,0.052,0.896,0.053,0.0258,0,0,bs,0.993333,0.006667,0.986667
4,0.138528,0.243296,0.021948,0.353679,0.103796,0.037006,0.000188,0.101182,0.000188,0.000188,...,0.086,0.778,0.136,0.9913,1,0,Vox,0.501651,0.498349,0.003302


In [33]:
# Persist the RF2 test-set results. `index` is documented as a bool, so the
# index is omitted with an explicit False rather than None.
sentiment_rf_test.to_csv('/Users/briankalinowski/Desktop/Data/sentiment_lda_test.csv', header=True, index=False)