In [1]:
import sys, os
import glob
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import csv
import sqlite3
import sklearn
import datetime
import json
from string import punctuation
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib


# csv file is large so the field_size_limit must be increased
# maxsize can't be converted to long because it is too large so try except decreases size until it works]

maxInt = sys.maxsize
decrement = True

while decrement:
    # decrease the maxInt value by factor 10 
    # as long as the OverflowError occurs.

    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True

In [2]:
def load_csv(filename):
    """Loads csv and saves data in a list of dictionaries"""
    csv_list = []
    with open(filename, encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            csv_list.append({"author": row["author"], "date": row["published"],"title": row["title"], "text":row["text"],"site_url": row["site_url"], "type": row["type"]})
    return csv_list

original_news_data = load_csv("fake.csv")
news_data = load_csv("fake.csv")
word_news_data = load_csv("fake.csv") 

def remove_if_null(data):
    """Function that removes all values if one in that dictionary is missing
    *not sure if I'll use this*
    """
    to_remove = []
    for dicts in data:
        for key, value in dicts.items():
            if(value == None):
                to_remove.append(dicts)
    for dicts in to_remove:
        data.remove(dicts)
        
def load_domains(file):
    """loads the json file of domains"""
    with open(file, 'r') as line:
        domain = json.load(line)
    return domain
        
domains = load_domains("sources.json")
  
def fix_unknown_types(data, domains):
    """replaces the unknown-bs types with the type associated with the url
    in the json file"""
    for keys, values in domains.items():
        for dicts in data:
            if(dicts["type"] == 'bs'):
                if(dicts["site_url"] == keys):
                    dicts["type"] = values["type"]
    return data
                    
news_data = fix_unknown_types(news_data, domains)

def totals(data, sum_type):
    """Returns a descending sorted tuple that holds the counts of the type of data that is passed in"""
    sum_dict = {}
    for dicts in data:
        if dicts[sum_type] in sum_dict.keys():
            sum_dict[dicts[sum_type]] += 1
        else:
            sum_dict[dicts[sum_type]] = 1
    
    sum_dict = sorted(sum_dict.items(), key=lambda t : t[1], reverse=True)
    return sum_dict

#url_sums = totals(news_data, "site_url")

#print(url_sums[:300])

def date_time(data, news_type):
    """Takes data and data type and creates a list of datetime objects for graphing number of articles over time
    valid types are: bias, bs, all...
    """
    date_list_raw = []
    date_list_cleaned = []
    
    # places dates in first, then loops through them again to clean them
    if(news_type == "all"):
        for dicts in data:
            # must replace the : because theres no native +00:00 timezone directive in strptime, only +0000
            x = dicts["date"].replace(":","")
            y = x[17:-5]
            x = x.replace(y, "")
            date_list_raw.append(x)
    else:
        for dicts in data:
            if(dicts["type"] == news_type):
                x = dicts["date"].replace(":","")
                y = x[17:-5]
                x = x.replace(y, "")
                date_list_raw.append(x)
                
    for item in date_list_raw:  
        date_list_cleaned.append(datetime.datetime.strptime(item, "%Y-%m-%dT%H%M%S%z"))
    
    return date_list_cleaned

In [3]:
to_ignore = 'in they you a not that who but were we from be their has your their than them with how the to this are as of – and if or his her an have is on what no he she by for'

def clean_words(data):
    """This function prepares the words in the 'text' and 'title' part of the data for a total word count analysis"""
    exclude = set(punctuation)
    for dicts in data:
        text_replace = [ char for char in dicts["text"] if char not in exclude ]
        title_replace = [ char for char in dicts["title"] if char not in exclude ]
        # to rejoin the individual characters
        text_replace = "".join(text_replace)
        title_replace = "".join(title_replace)
        # replaces the text in form 
        dicts["text"] = text_replace.lower()
        dicts["title"] = title_replace.lower()
    return data

def count_title_words(data):
    """Counts the total number of words in the tile part of data
    returns a sorted dictionary of words as keys and counts as values"""
    title_dict = {}
    for dicts in data:
        list_words = dicts['title'].strip().split()
        for word in list_words:
            if word in title_dict.keys() and word not in to_ignore:
                title_dict[word] += 1
            elif word != None and word not in to_ignore:
                title_dict.update({word:1})
    title_dict = sorted(title_dict.items(), key=lambda t : t[1], reverse=True)
    return title_dict

word_news_data = clean_words(word_news_data)

title_counts = count_title_words(word_news_data)
text_counts = count_text_words(word_news_data)
print("Title Counts")
for word in title_counts[:20]:
    print(word[0] + ": ", word[1])
    

NameError: name 'count_text_words' is not defined

In [None]:
full_date = date_time(news_data, "all")
import matplotlib
import matplotlib.pyplot as plt

font = {'size':16}
matplotlib.rc('font', **font)
matplotlib.rc('figure', figsize=(10, 10))

plt.hist(full_date,bins=30)
plt.xlabel("Date of Articles")
plt.ylabel("Number of Articles")
plt.title("Histogram of All Articles Over Time")
plt.savefig('all_articles.png')
plt.show()

types = ['bs', 'bias','conspiracy', 'hate','satire','state','junksci','fake']

x_label = "Date of Articles"
y_label = "Number of Articles"
title = "Histogram of Articles Over Time: "
ext = ".png"

all_data = []
all_data.append(full_date)
for each in types:
    fig_name = each + ".png"
    title = "Histogram of Articles Over Time: "
    dates = date_time(news_data, each)
    all_data.append(dates)
    plt.hist(dates,bins=30)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    title = title + each
    plt.title(title)
    plt.savefig(fig_name)
    plt.show()


In [None]:
def get_sentiment_data(filename):    
    emotions_dict = {'joy': 1,'anger': 2,'fear': 3,'sadness': 4,'guilt': 5,'shame':6,'disgust':7}
    csv_list = []
    list_1 = []
    list_2 = []
    
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            list_1.append(emotions_dict[row['emotion']])
            list_2.append(row['text'])
            
    csv_list.append(list_1)
    csv_list.append(list_2)
    return csv_list

sentiment = get_sentiment_data("isear.csv")

    
def add_sentiment(data, training_data):
    """Adds Naive Bayes sentiment analysis on the title of 
    each article, as well as the text. Returns a list of dictionaries with new
    sentiment key/value pair. 
    
    Training data is a list of lists
    """
    emotions_dict = {'joy': 1,'anger': 2,'fear': 3,'sadness': 4,'guilt': 5,'shame':6,'disgust':7}
    
    sentiment = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),])
    sentiment.fit(training_data[1],training_data[0])
    
    for dicts in data:
        to_predict_text = []
        to_predict_text.append(dicts['text'])
        to_predict_title = []
        to_predict_title.append(dicts['title'])
        predicted_text = sentiment.predict(to_predict_text)
        predicted_title = sentiment.predict(to_predict_title)
        for keys, values in emotions_dict.items():
            if values == predicted_text:
                dicts["text_sentiment"] = keys
            if values == predicted_title:
                dicts["title_sentiment"] = keys
    return data
        
new_data = add_sentiment(word_news_data, sentiment)

title_sentiment = totals(new_data, 'title_sentiment')
text_sentiment = totals(new_data, 'text_sentiment')


In [None]:
title_emotion = []
title_totals = []
title_sum = 0
for each in title_sentiment:
    title_emotion.append(each[0])
    title_totals.append(each[1]/12999)
    
text_emotion = []
text_totals = []
text_sum = 0
for each in text_sentiment:
    text_emotion.append(each[0])
    text_totals.append(each[1]/12999)
print(text_totals)
print(title_totals)
    
    
y_pos = np.arange(len(title_emotion))
 
plt.bar(y_pos, title_totals, align='center', alpha=0.5)
plt.xticks(y_pos, title_emotion)
plt.ylabel('Percent of Emotion')
plt.title('Emotions Classified to Article Title')
plt.savefig("title_emotions.png")
plt.show()

y_pos = np.arange(len(text_emotion))
 
plt.bar(y_pos, text_totals, align='center', alpha=0.5)
plt.xticks(y_pos, text_emotion)
plt.ylabel('Percent of Emotion')
plt.title('Emotions Classified to Article Text')
plt.savefig("text_emotions.png")
plt.show()