In [4]:
# pip install syllables

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from textblob import TextBlob
import syllables

In [39]:
# Load the two raw article dumps and label each row: Fake = 1, True = 0
fake_df = pd.read_csv("Fake.csv").assign(**{'class': 1})
real_df = pd.read_csv("True.csv").assign(**{'class': 0})

# Stack both sources (fake rows first) into one frame and persist it to disk
merged_df = pd.concat([fake_df, real_df], ignore_index=True)
merged_df.to_csv("merged_news.csv", index=False)

In [50]:
# Reload the merged file and parse the dates; unparseable dates become NaT
data = pd.read_csv("merged_news.csv")
data['date'] = pd.to_datetime(data['date'], errors='coerce')
# Drop rows with any missing value (this includes the NaT dates produced above).
# DataFrame.drop returns a new frame, so its result must be assigned; otherwise
# the 'subject' column is only hidden in the cell output and never removed.
data = data.dropna().drop(columns=['subject'])
data

Unnamed: 0,title,text,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,2017-12-31,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,2017-12-31,1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",2017-12-30,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",2017-12-29,1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,2017-12-25,1
...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,2017-08-22,0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",2017-08-22,0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,2017-08-22,0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,2017-08-22,0


In [60]:
# Write the newly cleaned dataframe to a new file, then reload it from disk
data.to_csv("merged_cleaned.csv", index=False) # Final Dataset
news = pd.read_csv('merged_cleaned.csv')

# Keep only the first 500 rows. .copy() makes `news` an independent frame, so the
# column assignments made on it later do not raise SettingWithCopyWarning.
# NOTE(review): merged_news.csv is written with the fake articles first, so this
# slice presumably contains only class == 1 rows - confirm that is intended.
news = news.iloc[0:500, :].copy()

In [52]:
#Calculating number of words and sentences
def not_punctuation(w):
    """Return True when token `w` is a word, i.e. holds at least one letter or digit.

    Tokens made up solely of symbols (".", "``", "''", "...", "--") are treated
    as punctuation regardless of their length, and single digits count as words.
    """
    return any(ch.isalnum() for ch in w)


def get_word_count(text):
    """Return the number of non-punctuation tokens produced by word_tokenize(text)."""
    return sum(1 for token in word_tokenize(text) if not_punctuation(token))


def get_sent_count(text):
    """Return the number of sentences produced by sent_tokenize(text)."""
    return len(sent_tokenize(text))

# Syllable counting is delegated to the `syllables` package
def nsyl(word):
    """Return the estimated number of syllables in `word`."""
    estimated = syllables.estimate(word)
    return estimated

In [53]:
# Getting count of words, sentences, syllable
def text_statistics(text):
    """Return (word_count, sent_count, syllable_count) for `text`.

    The text is tokenized once and punctuation tokens are discarded before
    counting, so the syllable total covers exactly the tokens that make up
    word_count instead of also counting punctuation marks.
    """
    words = [token for token in word_tokenize(text) if not_punctuation(token)]
    word_count = len(words)
    sent_count = get_sent_count(text)
    syllable_count = sum(nsyl(word) for word in words)
    return word_count, sent_count, syllable_count

In [58]:
# Calculate Flesch Reading Ease score
def flesch_formula(word_count, sent_count, syllable_count):
    """Return the Flesch Reading Ease score for the given text counts.

    score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)

    Args:
        word_count: number of words in the text.
        sent_count: number of sentences in the text.
        syllable_count: number of syllables in the text.

    Returns:
        The score as a float, or 0 when the text has no sentences or no words,
        because the formula is undefined in that case.
    """
    # Both counts are used as divisors; guard them to avoid ZeroDivisionError
    # on texts that are empty or consist only of punctuation.
    if sent_count == 0 or word_count == 0:
        return 0
    return 206.835 - 1.015*word_count/sent_count - 84.6*syllable_count/word_count
    
def flesch(text):
    """Return the readability score of `text`.

    Gathers the word, sentence and syllable counts through text_statistics
    and passes them, in that order, to flesch_formula.
    """
    counts = text_statistics(text)
    return flesch_formula(*counts)

In [64]:
# Score every article: cast the text column to str, then apply flesch to each row
article_text = news['text'].astype(str)
news['text'] = article_text
news['flesch_readability'] = article_text.apply(flesch)

In [61]:
# Subjectivity component of the TextBlob sentiment
def getSubjectivity(text):
    """Return the subjectivity value of the TextBlob sentiment of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
  
# Polarity component of the TextBlob sentiment
def getPolarity(text):
    """Return the polarity value of the TextBlob sentiment of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
  
# Get subjectivity and polarity.
# Each article is analysed once and both scores are read from the same
# sentiment result, instead of running the TextBlob analysis twice per text.
sentiments = [TextBlob(text).sentiment for text in news['text']]
# Lists are assigned positionally, so row order is preserved whatever the index
news['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
news['polarity'] = [sentiment.polarity for sentiment in sentiments]