# How do news channels affect your opinions?

This is a short case study on whether the content and sentiment of the news channels on different topics actually affect the way that you think


In [1]:
import csv
import pandas as pd
import nltk
from textblob import TextBlob 

In [4]:
# Joining the three acticles datasets
article1 = pd.read_csv("../all-the-news/articles1.csv")
article2 = pd.read_csv("../all-the-news/articles2.csv")
article3 = pd.read_csv("../all-the-news/articles3.csv")
article = article1.append(article2)
article = article.append(article3)
print(article.shape)
article.head()

(142570, 10)


Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [5]:
# The survey dataset
survey = pd.read_csv("../PRRI-2017-Kids-Wellbeing-Survey/PRRI-2017-Kids-Wellbeing-Survey.csv")
print(survey.shape)
survey.head()

(3455, 152)


Unnamed: 0.1,Unnamed: 0,CaseId,WEIGHT,SAMPLE,Q1,Q2,Q3A,Q3B,Q3C,Q3D,...,METRO,INTERNET,HOUSING,PHONESERVICE,HHSIZE,HH01,HH25,HH612,HH1317,HH18OV
0,1,4682,0.897008,Southwest,Behind us,Worse off,"No, have not","No, have not","No, have not","No, have not",...,Metro Area,Internet Household,Owned or being bought by you or someone in you...,Landline telephone only,1,0,0,0,0,1
1,2,3904,0.191948,South,Ahead of us,Better off,"Yes, have","Yes, have","No, have not","No, have not",...,Metro Area,Internet Household,Rented for cash,Cellphone only,3,1,0,0,0,2
2,3,3959,0.761054,South,Behind us,Worse off,"Yes, have","No, have not","No, have not","No, have not",...,Metro Area,Non-internet household,Owned or being bought by you or someone in you...,"Have cellphone, but mostly use landline",2,0,0,0,0,2
3,4,1295,0.439389,South,Ahead of us,Better off,"No, have not","Yes, have","No, have not","Yes, have",...,Metro Area,Internet Household,Owned or being bought by you or someone in you...,"Have cellphone, but mostly use landline",2,0,0,0,0,2
4,5,6991,1.047701,South,Ahead of us,Better off,"Yes, have","Yes, have","Yes, have","Yes, have",...,Metro Area,Internet Household,Rented for cash,Cellphone only,4,0,0,1,0,3


For the analysis, we will be taking only 4 news networks which are common to both the options provided in the survey and news article data set that we have. The 4 news agencies are 

- Fox News
- CNN 
- NPR
- New York Times 

In [82]:
common_news = ['Fox News', 'CNN', 'NPR', 'New York Times']
print("News Articles available")
print(list(set(article["publication"])))
selected_art = article[article["publication"].isin(common_news)]
selected_art.shape

fox_art = selected_art[selected_art["publication"] == 'Fox News']
cnn_art = selected_art[selected_art["publication"] == 'CNN']
npr_art = selected_art[selected_art["publication"] == 'NPR']
nyt_art = selected_art[selected_art["publication"] == 'New York Times']

common_news_sur = ['Fox News', 'CNN', 'NPR', 'The New York Times']
print("Survey options")
print(list(set(survey["Q23"])))
selected_survey = survey[survey["Q23"].isin(common_news_sur)]
selected_survey.shape

News Articles available
['Atlantic', 'Fox News', 'Talking Points Memo', 'Guardian', 'Reuters', 'Business Insider', 'CNN', 'Breitbart', 'Vox', 'Washington Post', 'Buzzfeed News', 'National Review', 'New York Post', 'New York Times', 'NPR']
Survey options
['Fox News', 'NPR', 'CNN', "Don't know?/Refused", 'Local television news/local newspapers', 'MSNBC', 'The major broadcast networks (ABC, NBC, CBS)', 'USA Today', 'The New York Times', 'The Wall Street Journal']


(1794, 152)

In [51]:
def analyse_sentiment(dataframe):
    t= 0
    nk = 0
    nm = 0
    pk = 0
    pm = 0
    for index, item in dataframe.iterrows():
        t_analysis = TextBlob(item["title"] + item["content"]).sentiment.polarity
        t = t + t_analysis
        if t_analysis > 0 :
            pk = pk + 1
            pm = pm + t_analysis
        
        elif t_analysis < 0 :
            nk = nk + 1
            nm = nm + t_analysis
    
    if(pk!= 0):
        print("Positive  = " + str(pm / pk) + "  ")
    if(nk!= 0):
        print("Negetive  = " + str(nm / nk) + "  ")
    if(pk!= 0 or nk!= 0):
        print("total  = " + str(t /(pk + nk)) + "  ")
    return(pk + nk) 
    

## 1 - Voting Patterns

We will be analysing and running the Sentiment analysis on the news articles containing the news on Donald Trump and Hillary Clinton and check the sentiment of the news agencies articles them. 

Note that the sentient is biased on the positive side so we will use sentiment comparision for both rather than the absolute sentiment scores.


In [36]:
#Voting Patterns
print("Fox")
trump_fox = fox_art[fox_art["title"].str.contains("Trump", case = False) ]
hilary_fox = fox_art[fox_art["title"].str.contains("Hillary", case = False) | fox_art["title"].str.contains("Clinton", case = False)]

analyse_sentiment(trump_fox)
analyse_sentiment(hilary_fox)

print("CNN")
trump_cnn = cnn_art[cnn_art["title"].str.contains("Trump", case = False)]
hilary_cnn = cnn_art[cnn_art["title"].str.contains("Hillary", case = False) | cnn_art["title"].str.contains("Clinton", case = False)]

analyse_sentiment(trump_cnn)
analyse_sentiment(hilary_cnn)

print("NPR")
trump_npr = npr_art[npr_art["title"].str.contains("Trump", case = False)]
hilary_npr = npr_art[npr_art["title"].str.contains("Hillary", case = False) | npr_art["title"].str.contains("Clinton", case = False)]

analyse_sentiment(trump_npr)
analyse_sentiment(hilary_npr)

print("NYT")
trump_nyt = nyt_art[nyt_art["title"].str.contains("Trump", case = False)]
hilary_nyt = nyt_art[nyt_art["title"].str.contains("Hillary", case = False) | nyt_art["title"].str.contains("Clinton", case = False)]

analyse_sentiment(trump_nyt)
analyse_sentiment(hilary_nyt)

Fox
Positive  = 0.11588111430623887  
Negetive  = -0.03644874196916791  
total  = 0.10270167000072081  
Positive  = 0.10705484161520282  
Negetive  = -0.030142738076002307  
total  = 0.09762611594217471  
CNN
Positive  = 0.0935515679290175  
Negetive  = -0.04618497872869567  
total  = 0.078270344675581  
Positive  = 0.09343842466576323  
Negetive  = -0.03724936373147333  
total  = 0.08138919594828749  
NPR
Positive  = 0.09822699432400782  
Negetive  = -0.045050000024738926  
total  = 0.0878321787280255  
Positive  = 0.11383502305630028  
Negetive  = -0.05452261809714072  
total  = 0.10353977161885966  
NYT
Positive  = 0.08927368699091766  
Negetive  = -0.026483396008473947  
total  = 0.08535678065472513  
Positive  = 0.09893440410138893  
Negetive  = -0.03359150548392078  
total  = 0.09175791080254544  


277

In [16]:
selected_survey_vote = selected_survey[~selected_survey["Q24"].isnull()][["CaseId","Q23", "Q24"]]
for channel in common_news_sur:
    k = selected_survey_vote[selected_survey_vote["Q23"] == channel][["CaseId", "Q24"]].groupby(["Q24"]).agg('count')
    trump_score = k.CaseId["Donald Trump, the Republican"]
    hilary_score = k.CaseId["Hillary Clinton, the Democrat"]
    print(channel)
    print("Trump = " + str(trump_score/(trump_score + hilary_score)))
    print("Clinton = " + str(hilary_score/(trump_score + hilary_score)))

Fox News
Trump = 0.9160714285714285
Clinton = 0.08392857142857142
CNN
Trump = 0.15172413793103448
Clinton = 0.8482758620689655
NPR
Trump = 0.12771739130434784
Clinton = 0.8722826086956522
The New York Times
Trump = 0.10294117647058823
Clinton = 0.8970588235294118


## 2- Illegal Immigration

The news corpus dataset that we have has very less news articles for NPR and New york Times with news about Illegal immigration. So the anaysis had to be done by taking only CNN and Fox News into consideration  

In [60]:
analyse_sentiment(fox_art[fox_art["title"].str.contains("illegal immigrant", case = False) | fox_art["title"].str.contains("illegal immigration", case = False)])
analyse_sentiment(cnn_art[cnn_art["title"].str.contains("illegal immigrant", case = False) | cnn_art["title"].str.contains("illegal immigration", case = False)])


Positive  = 0.046535345976162094  
Negetive  = -0.05414902209062642  
total  = -0.014993990064653117  
Positive  = 0.12489024836922441  
Negetive  = -0.018732997421522013  
total  = 0.07701583310564226  


3

In [65]:
selected_survey_immi = selected_survey[~selected_survey["Q7C"].isnull()][["CaseId","Q23", "Q7C"]]
selected_survey_immi.shape
selected_survey_immi[selected_survey_immi["Q23"] == "Fox News"][["CaseId", "Q7C"]].groupby(["Q7C"]).agg('count')

Unnamed: 0_level_0,CaseId
Q7C,Unnamed: 1_level_1
Don't know?/Refused,5
Major problem,421
Minor problem,210
Not a problem,79


In [66]:
selected_survey_immi = selected_survey[~selected_survey["Q7C"].isnull()][["CaseId","Q23", "Q7C"]]
selected_survey_immi.shape
selected_survey_immi[selected_survey_immi["Q23"] == "CNN"][["CaseId", "Q7C"]].groupby(["Q7C"]).agg('count')

Unnamed: 0_level_0,CaseId
Q7C,Unnamed: 1_level_1
Don't know?/Refused,2
Major problem,148
Minor problem,178
Not a problem,88


In [117]:
analyse_sentiment(fox_art[fox_art["title"].str.contains("obamacare", case = False)  | fox_art["title"].str.contains("Affordable Care", case = False)])
analyse_sentiment(cnn_art[cnn_art["title"].str.contains("obamacare", case = False)  | cnn_art["title"].str.contains("Affordable Care", case = False)])
analyse_sentiment(nyt_art[nyt_art["title"].str.contains("obamacare", case = False)  | nyt_art["title"].str.contains("Affordable Care", case = False)])
analyse_sentiment(npr_art[npr_art["title"].str.contains("obamacare", case = False)  | npr_art["title"].str.contains("Affordable Care", case = False)])


Positive  = 0.10240710794472911  
Negetive  = -0.04718081125961028  
total  = 0.074705641425407  
Positive  = 0.10284318666988382  
Negetive  = -0.01882057320055057  
total  = 0.09235493150863948  
Positive  = 0.10248405244500486  
total  = 0.10248405244500486  
Positive  = 0.09862146672427456  
Negetive  = -0.05039500797894654  
total  = 0.08524819335347265  


78

In [91]:
selected_survey_health = selected_survey[~selected_survey["Q8C"].isnull()][["CaseId","Q23", "Q8C"]]
selected_survey_health[selected_survey_health["Q23"] == "Fox News"][["CaseId", "Q8C"]].groupby(["Q8C"]).agg('count')
selected_survey_health = selected_survey[~selected_survey["Q8C"].isnull()][["CaseId","Q23", "Q8C"]]
selected_survey_health[selected_survey_health["Q23"] == "Fox News"][["CaseId", "Q8C"]].groupby(["Q8C"]).agg('count')
selected_survey_health = selected_survey[~selected_survey["Q8C"].isnull()][["CaseId","Q23", "Q8C"]]
selected_survey_health[selected_survey_health["Q23"] == "Fox News"][["CaseId", "Q8C"]].groupby(["Q8C"]).agg('count')


Unnamed: 0_level_0,CaseId
Q8C,Unnamed: 1_level_1
Completely agree,159
Completely disagree,54
Don't know?/Refused,5
Mostly agree,342
Mostly disagree,155


In [103]:
# Better Off Worse off
analyse_sentiment(fox_art)
analyse_sentiment(cnn_art)
analyse_sentiment(npr_art)
analyse_sentiment(nyt_art)

Positive  = 0.10258332055770698  
Negetive  = -0.05220591333251923  
total  = 0.0740150771599946  
Positive  = 0.10044889655882205  
Negetive  = -0.04651104718679257  
total  = 0.07740343265624716  
Positive  = 0.1056654389302441  
Negetive  = -0.03988751554096138  
total  = 0.09409164121551256  
Positive  = 0.09060268510038491  
Negetive  = -0.031677040353807306  
total  = 0.08182700315057752  


0.08182700315057752

In [112]:
selected_survey = survey[survey["Q23"].isin(common_news_sur) & ~survey["Q2"].isnull()][["CaseId","Q23", "Q2"]]
selected_survey.shape

print(selected_survey[selected_survey["Q23"] == "Fox News"][["CaseId", "Q2"]].groupby(["Q2"]).agg('count'))
print(selected_survey[selected_survey["Q23"] == "CNN"][["CaseId", "Q2"]].groupby(["Q2"]).agg('count'))
print(selected_survey[selected_survey["Q23"] == "NPR"][["CaseId", "Q2"]].groupby(["Q2"]).agg('count'))
print(selected_survey[selected_survey["Q23"] == "The New York Times"][["CaseId", "Q2"]].groupby(["Q2"]).agg('count'))


                     CaseId
Q2                         
Better off              371
Don't know?/Refused       7
Worse off               337
                     CaseId
Q2                         
Better off              188
Don't know?/Refused       6
Worse off               222
                     CaseId
Q2                         
Better off              225
Don't know?/Refused       1
Worse off               255
                     CaseId
Q2                         
Better off               85
Don't know?/Refused       3
Worse off                94


### Acknowledgements

- The "All-the-news" dataset was taken from the dataset published by Andrew Thompson on Kaggle https://www.kaggle.com/snapcrack/all-the-news/home
- The "PRRI-2017-Kids-Wellbeing-Survey" Dataset was taken from PRRI's data vault https://www.prri.org/data-vault/ 