In [1]:
# pip install syllables

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict, stopwords
from textblob import TextBlob
import syllables

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/wanxin23/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wanxin23/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
#Read raw csv
fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

# Add a new column called Class with a value of Fake = 1 , True = 0
fake_df['class'] = 1
real_df['class'] = 0

# Merge the two csv [True and Fake]
merged_df = pd.concat([fake_df, real_df], ignore_index=True)
merged_df.to_csv("merged_news.csv", index=False)
data = pd.read_csv("merged_news.csv")

data = data.iloc[0:500,:]

In [24]:
# test based functions 

def word_count(text):
    text = str(text).lower()
    text = text.replace("\r\n", ' ')
    if text == "no title":
        return 0
    else:
        return len(str(text).split(' '))

def sentence_count(text):
    sentences = nltk.sent_tokenize(text)
    return len(sentences)

def average_word_length(text):
    words = text.split()
    count = 0
    for word in words:
        count += len(word)
    if len(words) == 0:
        return 0
    else:
        return count / len(words)
        
def punctuation_count(text):
    count = 0
    for word in text:
        if word in string.punctuation:
            count += 1
    return count

def stopword_count(text):
    stopword = stopwords.words('english')
    count = 0
    for word in text.split():
        if word in stopword:
            count += 1
    return count

In [25]:
#change data type to string
data['text'] = data['text'].astype(str)
data['title'] = data['title'].astype(str)

data['text_word_count'] = data['text'].apply(word_count)
data['title_word_count'] = data['title'].apply(word_count)

data['text_sentence_count'] = data['text'].apply(sentence_count)
data['title_sentence_count'] = data['title'].apply(sentence_count)

data['text_average_word_length'] = data['text'].apply(average_word_length)
data['title_average_word_length'] = data['title'].apply(average_word_length)

data['text_punctuation_count'] = data['text'].apply(punctuation_count)
data['title_punctuation_count'] = data['title'].apply(punctuation_count)

data['text_stopwords_count'] = data['text'].apply(stopword_count)
data['title_stopwords_count'] = data['title'].apply(stopword_count)

In [29]:
#Calculating number of syllables in a word
def nsyl(word):
    return syllables.estimate(word) 

#Calculating number of syllables in a text 
def syllables_text(text):
    syllable_count = sum(list(map(lambda w: nsyl(w), word_tokenize(text))))
    return syllable_count

data['syllables'] = data['text'].apply(syllables_text)

Unnamed: 0,title,text,subject,date,class,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,syllables
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,516,13,26,1,4.804040,5.583333,121,1,186,1,870
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,309,9,11,1,5.213115,7.625000,39,0,119,0,565
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,600,16,25,1,5.168966,5.000000,148,0,209,0,1048
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,475,15,15,1,5.180180,4.571429,118,2,160,0,805
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,434,12,19,1,4.554762,5.363636,40,0,195,0,688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,RNC Staffer Assigned To Finding Positive Stor...,Imagine for a moment that your job was to scou...,News,"August 25, 2017",1,336,17,11,1,4.756757,4.812500,38,0,149,0,580
496,"Trump Wants It Both Ways, But You Can’t Pass ...",Everybody s gotten used to Donald Trump contra...,News,"August 25, 2017",1,681,20,28,1,4.803653,4.000000,125,1,284,0,1147
497,"REPORT: Mueller Hot On Trump’s Trail, Has Bui...",Donald Trump is very afraid of Special Counsel...,News,"August 25, 2017",1,384,14,13,1,4.772727,5.692308,37,2,163,0,650
498,House Dem Wants GOP On Record: Stop Gov’t Spe...,"Early next month, the GOP-controlled House of ...",News,"August 24, 2017",1,441,13,12,1,5.168203,4.916667,46,1,182,0,797


In [33]:
# Flesch-Kincaid Readability Metric
def flesch_formula(word_count, sent_count, syllable_count):
    if sent_count == 0:
        return 0
    else:
        return 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count
    
# Get flesch readability
data['flesch_readability'] = data.apply(lambda n: flesch_formula(n['text_word_count'],n['text_sentence_count'],n['syllables']),axis=1)
data['flesch_readability'] = (data['flesch_readability'] - data['flesch_readability'].mean()) / data['flesch_readability'].std()

In [31]:
# Calculate subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
  
# Calculate polarity 
def getPolarity(text):
    return TextBlob(text).sentiment.polarity
  
#Get subjectivity and polarity
data['subjectivity'] = data['text'].apply(getSubjectivity)
data['polarity'] = data['text'].apply(getPolarity)

In [34]:
data

Unnamed: 0,title,text,subject,date,class,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,syllables,flesch_readability,subjectivity,polarity
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,516,13,26,1,4.804040,5.583333,121,1,186,1,870,0.935789,0.599895,0.082132
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,309,9,11,1,5.213115,7.625000,39,0,119,0,565,-0.439548,0.334098,-0.005004
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,600,16,25,1,5.168966,5.000000,148,0,209,0,1048,0.306351,0.541969,-0.012345
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,475,15,15,1,5.180180,4.571429,118,2,160,0,805,0.078113,0.394086,-0.023118
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,434,12,19,1,4.554762,5.363636,40,0,195,0,688,1.305330,0.495222,-0.011722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,RNC Staffer Assigned To Finding Positive Stor...,Imagine for a moment that your job was to scou...,News,"August 25, 2017",1,336,17,11,1,4.756757,4.812500,38,0,149,0,580,-0.024470,0.453976,0.189337
496,"Trump Wants It Both Ways, But You Can’t Pass ...",Everybody s gotten used to Donald Trump contra...,News,"August 25, 2017",1,681,20,28,1,4.803653,4.000000,125,1,284,0,1147,0.639842,0.485850,0.175516
497,"REPORT: Mueller Hot On Trump’s Trail, Has Bui...",Donald Trump is very afraid of Special Counsel...,News,"August 25, 2017",1,384,14,13,1,4.772727,5.692308,37,2,163,0,650,0.235176,0.484611,0.117287
498,House Dem Wants GOP On Record: Stop Gov’t Spe...,"Early next month, the GOP-controlled House of ...",News,"August 24, 2017",1,441,13,12,1,5.168203,4.916667,46,1,182,0,797,-0.910621,0.434319,0.096057
