## Libraries Used

In [1]:
import csv
from collections import Counter
from collections import defaultdict
import pandas as pd
import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.util import ngrams

## Reading .csv files

In [2]:
# Reading True.csv
data_true = pd.read_csv('True.csv', error_bad_lines=False);
data_true_text = data_true[['text']]
data_true_text['index'] = data_true_text.index
documents_true = data_true_text
print(list(data_true))
print(len(documents_true))

# Reading Fake.csv
data_fake = pd.read_csv('Fake.csv', error_bad_lines=False);
data_fake_text = data_fake[['text']]
data_fake_text['index'] = data_fake_text.index
documents_fake = data_fake_text
print(list(data_fake))
print(len(documents_fake))

['title', 'text']
4
['title', 'text']
6


In [3]:
# First 10 true news articles
documents_true[:10].style.hide_index()

text,index
I have a very pretty sister,0
I have a very pretty mother,1
I have a handsome father,2
I have a handsome brother,3


In [4]:
# First 10 fake news articles
documents_fake[:10].style.hide_index()

text,index
I am a strong boy,0
I am a strong girl,1
I am a weak boy,2
I am a weak entity,3
,4
I am proud of my nation,5


## Data Preprocessing

In [5]:
wnlem = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

#### Lemmatize Example

In [6]:
past_future_forms = ['had', 'gone', 'gave', 'handled', 'died', 'cared', 'done', 'wants']
present_form = [wnlem.lemmatize(pas_fut, pos='v') for pas_fut in past_future_forms]
pd.DataFrame(data = {'Past/Future forms': past_future_forms, 'Present form': present_form}).style.hide_index()

Past/Future forms,Present form
had,have
gone,go
gave,give
handled,handle
died,die
cared,care
done,do
wants,want


#### Stemmer Example

In [7]:
original_words = ["Ram's", "Sham's", 'caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles}).style.hide_index()

original word,stemmed
Ram's,ram
Sham's,sham
caresses,caress
flies,fli
dies,die
mules,mule
denied,deni
died,die
agreed,agre
owned,own


#### Tokenize and Removing Stop Words Example

In [8]:
example_sent = '"This is a sample sentence, showing off the stop words filteration."'
print(example_sent)
print('\nThe stop words in nltk are as follows:\n',stop_words)
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
print('\nThe word tokens in example sentence are:',word_tokens)
print('\nThe filtered sentence corresponding to example sentence is:',filtered_sentence)

"This is a sample sentence, showing off the stop words filteration."

The stop words in nltk are as follows:
 {"it's", 'once', 'not', 't', 'again', 'such', 'an', 'do', "weren't", 'when', 'him', 'doing', 'or', 'because', 'further', 'its', 'both', 'is', 'haven', 'as', "she's", 'has', 'does', 'if', 'isn', 'with', 'own', 'won', 'which', 'no', 'the', 'there', 'them', 'down', "that'll", 'in', 'are', "don't", 'be', 'some', "needn't", 'herself', 'yourself', 'of', 'yourselves', 'y', 'needn', 'having', "should've", 'they', 'your', 'didn', 'itself', 'she', "shouldn't", 'theirs', 'my', 'd', 'at', 'their', 'we', 'our', 're', "you'd", 'during', 'you', 'out', 'just', 'here', 'to', 'from', 'ain', 'shan', 'am', 'himself', 'ma', 'he', 'm', 'that', 'and', 'before', 'ours', 'i', "hasn't", "wasn't", 'above', 'whom', 'then', 'where', 'very', "mustn't", 'each', 'should', "couldn't", 'off', 'myself', 'why', 'ourselves', 'mightn', 'through', 'been', 'shouldn', 'yours', 'being', 'over', "haven't", 'mustn', 'did

### Lemmatization, Stemming, Tokenization and Removing Stop Words

In [9]:
def lemmatize_stemming(text):
    return stemmer.stem(wnlem.lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    try:
        text = text[:300]   # For summarisation of news articles
        for token in word_tokenize(text):
            if token not in stop_words and len(token) > 2:
                result.append(lemmatize_stemming(token))
    except:
        pass  # for empty entries
    return result

### Preprocessing on Sample Document(s)

In [10]:
# Sample document from True.csv
doc_sample_true = documents_true[documents_true['index'] == 2].values[0][0]

print('original true sample document: ')
words_true = []
try:
    for word in doc_sample_true.split(' '):
        words_true.append(word)
except:
    pass
print(words_true)
print('\n\n preprocessed true sample document: ')
print(preprocess(doc_sample_true))

# Sample document from Fake.csv
doc_sample_fake = documents_fake[documents_fake['index'] == 4].values[0][0]

print('\n\n original fake sample document: ')
words_fake = []
try:
    for word in doc_sample_fake.split(' '):
        words_fake.append(word)
except:
    pass
print(words_fake)
print('\n\n preprocessed fake sample document: ')
print(preprocess(doc_sample_fake))

original true sample document: 
['I', 'have', 'a', 'handsome', 'father']


 preprocessed true sample document: 
['handsom', 'father']


 original fake sample document: 
[]


 preprocessed fake sample document: 
[]


In [11]:
# Preprocessing for true news articles
processed_true_docs = documents_true['text'].map(preprocess)

# Preprocessing for fake news articles
processed_fake_docs = documents_fake['text'].map(preprocess)

In [12]:
# First 10 preprocessed true news articles
print(processed_true_docs[:10])

# First 10 preprocessed fake news articles
processed_fake_docs[:10]

0      [pretti, sister]
1      [pretti, mother]
2     [handsom, father]
3    [handsom, brother]
Name: text, dtype: object


0      [strong, boy]
1     [strong, girl]
2        [weak, boy]
3     [weak, entiti]
4                 []
5    [proud, nation]
Name: text, dtype: object

# Case Study Unigram Word Frequecies and Document Frequencies

## Word Frequency

In [13]:
# Calculating Word Frequency for words from true news articles
words =[]
with open('True.csv',encoding = "utf8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        csv_words = preprocess(row[1])
        for i in csv_words:
            words.append(i)
            
words_counted = []

for i in words:
    x = words.count(i)
    words_counted.append((i,x))
words_counted = set(words_counted)
words_counted = list(words_counted)
words_counted.sort(key = lambda x:x[1], reverse = True)
val = [x for (i,x) in words_counted]
print('{} words were found in the corpus'.format(sum(val)))
print("The unique identified words with their corresponding word frequencies for top 50 words from true news articles are given below:")
N = 0
for (i,x) in words_counted:
    print('\t',i,'  ({})'.format(x))
    N+=1
    if N == 50:
        break

# Calculating Word Frequency for words from fake news articles
words =[]
with open('Fake.csv',encoding = "utf8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        csv_words = preprocess(row[1])
        for i in csv_words:
            words.append(i)
            
words_counted = []

for i in words:
    x = words.count(i)
    words_counted.append((i,x))
words_counted = set(words_counted)
words_counted = list(words_counted)
words_counted.sort(key = lambda x:x[1], reverse = True)
val = [x for (i,x) in words_counted]
print('\n \n{} words were found in the corpus'.format(sum(val)))
print("The unique identified words with their corresponding word frequencies for top 50 words from fake news articles are given below:")
N = 0
for (i,x) in words_counted:
    print('\t',i,'  ({})'.format(x))
    N+=1
    if N == 50:
        break

8 words were found in the corpus
The unique identified words with their corresponding word frequencies for top 50 words from true news articles are given below:
	 handsom   (2)
	 pretti   (2)
	 brother   (1)
	 father   (1)
	 mother   (1)
	 sister   (1)

 
10 words were found in the corpus
The unique identified words with their corresponding word frequencies for top 50 words from fake news articles are given below:
	 weak   (2)
	 strong   (2)
	 boy   (2)
	 entiti   (1)
	 proud   (1)
	 girl   (1)
	 nation   (1)


## Cumulative Term Frequency and Document Frequency

In [14]:
# Calculation of Term Frequencies and Document Frequecies for words from true news articles
with open('True.csv',encoding = "utf8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    words_true_ctf_dict = {}
    words_true_df_dict = {}
    l = 0
    for row in reader:
        l += 1
        csv_words = preprocess(row[1])
        csv_words_tf = []
        for i in csv_words:
            x = csv_words.count(i)
            csv_words_tf.append((i,x/float(len(csv_words))))
        csv_words_tf = set(csv_words_tf)
        for (i,x) in csv_words_tf:
            words_true_ctf_dict[i] = words_true_ctf_dict.get(i,0) + x
            words_true_df_dict[i] = words_true_df_dict.get(i,0) + 1
    for k, v in words_true_df_dict.items():
        words_true_df_dict[k] = v/(float(l))
    words_true_ctf_lst = [(k, v) for k, v in words_true_ctf_dict.items()]
    ctf_true_lst = words_true_ctf_lst[:35]
    words_true_ctf_lst.sort(key = lambda x:x[1], reverse = True)
    ctf_true_lst.sort(key = lambda x:x[1], reverse = True)
    words_true_df_lst = [(k, v) for k, v in words_true_df_dict.items()]
    df_true_lst = words_true_df_lst[:35]
    words_true_df_lst.sort(key = lambda x:x[1], reverse = True)
    df_true_lst.sort(key = lambda x:x[1], reverse = True)
    ctf_true = pd.DataFrame({'Word': [k for (k,v) in ctf_true_lst], 'Cumulative Term Frequency': [v for (k,v) in ctf_true_lst], 'Document Frequency': [words_true_df_dict[k] for (k,v) in ctf_true_lst]})
    df_true = pd.DataFrame({'Word': [k for (k,v) in df_true_lst], 'Cumulative Term Frequency': [words_true_ctf_dict[k] for (k,v) in df_true_lst], 'Document Frequency': [v for (k,v) in df_true_lst]})
    
# Calculation of Term Frequencies and Document Frequecies for words from fake news articles
with open('Fake.csv',encoding = "utf8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    words_fake_ctf_dict = {}
    words_fake_df_dict = {}
    l = 0
    for row in reader:
        l += 1
        csv_words = preprocess(row[1])
        csv_words_tf = []
        for i in csv_words:
            x = csv_words.count(i)
            csv_words_tf.append((i,x/float(len(csv_words))))
        csv_words_tf = set(csv_words_tf)
        for (i,x) in csv_words_tf:
            words_fake_ctf_dict[i] = words_fake_ctf_dict.get(i,0) + x
            words_fake_df_dict[i] = words_fake_df_dict.get(i,0) + 1
    for k, v in words_fake_df_dict.items():
        words_fake_df_dict[k] = v/(float(l))
    words_fake_ctf_lst = [(k, v) for k, v in words_fake_ctf_dict.items()]
    ctf_fake_lst = words_fake_ctf_lst[:35]
    words_fake_ctf_lst.sort(key = lambda x:x[1], reverse = True)
    ctf_fake_lst.sort(key = lambda x:x[1], reverse = True)
    words_fake_df_lst = [(k, v) for k, v in words_fake_df_dict.items()]
    df_fake_lst = words_fake_df_lst[:35]
    words_fake_df_lst.sort(key = lambda x:x[1], reverse = True)
    df_fake_lst.sort(key = lambda x:x[1], reverse = True)
    ctf_fake = pd.DataFrame({'Word': [k for (k,v) in ctf_fake_lst], 'Cumulative Term Frequency': [v for (k,v) in ctf_fake_lst], 'Document Frequency': [words_fake_df_dict[k] for (k,v) in ctf_fake_lst]})
    df_fake = pd.DataFrame({'Word': [k for (k,v) in df_fake_lst], 'Cumulative Term Frequency': [words_fake_ctf_dict[k] for (k,v) in df_fake_lst], 'Document Frequency': [v for (k,v) in df_fake_lst]})

In [15]:
# df1.style.apply(lambda x: ['background: red' if x < 0 else "background: green" for x in df1], axis = 0)
print("The cumulative Term Frequency in Descending Order for true news articles is given by:")
ctf_true.style.hide_index()

The cumulative Term Frequency in Descending Order for true news articles is given by:


Word,Cumulative Term Frequency,Document Frequency
pretti,1.0,0.5
handsom,1.0,0.5
sister,0.5,0.25
mother,0.5,0.25
father,0.5,0.25
brother,0.5,0.25


In [16]:
print("The Document Frequency in Descending Order for true news articles is given by:")
df_true.style.hide_index()

The Document Frequency in Descending Order for true news articles is given by:


Word,Cumulative Term Frequency,Document Frequency
pretti,1.0,0.5
handsom,1.0,0.5
sister,0.5,0.25
mother,0.5,0.25
father,0.5,0.25
brother,0.5,0.25


In [17]:
print("The cumulative Term Frequency in Descending Order for fake news articles is given by:")
ctf_fake.style.hide_index()

The cumulative Term Frequency in Descending Order for fake news articles is given by:


Word,Cumulative Term Frequency,Document Frequency
boy,1.0,0.333333
strong,1.0,0.333333
weak,1.0,0.333333
girl,0.5,0.166667
entiti,0.5,0.166667
proud,0.5,0.166667
nation,0.5,0.166667


In [18]:
print("The Document Frequency in Descending Order for fake news articles is given by:")
df_fake.style.hide_index()

The Document Frequency in Descending Order for fake news articles is given by:


Word,Cumulative Term Frequency,Document Frequency
boy,1.0,0.333333
strong,1.0,0.333333
weak,1.0,0.333333
girl,0.5,0.166667
entiti,0.5,0.166667
proud,0.5,0.166667
nation,0.5,0.166667


# Net Cumulative Term Frequency and Net Document Frequency

In [19]:
words_net_ctf_dict = {}
words_net_df_dict = {}
for k, v in words_true_ctf_dict.items():
    words_net_ctf_dict[k] = words_net_ctf_dict.get(k,0) + words_true_ctf_dict[k]
for k, v in words_fake_ctf_dict.items():
    words_net_ctf_dict[k] = words_net_ctf_dict.get(k,0) - words_fake_ctf_dict[k]
for k, v in words_true_df_dict.items():
    words_net_df_dict[k] = words_net_df_dict.get(k,0) + words_true_df_dict[k]
for k, v in words_fake_df_dict.items():
    words_net_df_dict[k] = words_net_df_dict.get(k,0) - words_fake_df_dict[k]
des_nctf_lst = [(k, v) for k, v in words_net_ctf_dict.items()][:35]
des_nctf_lst.sort(key = lambda x:x[1], reverse = True)
des_ndf_lst = [(k, v) for k, v in words_net_df_dict.items()][:35]
des_ndf_lst.sort(key = lambda x:x[1], reverse = True)
des_net_ctf = pd.DataFrame({'Word': [k for (k,v) in des_nctf_lst], 'Net Cumulative Term Frequency': [v for (k,v) in des_nctf_lst], 'Net Document Frequency': [words_net_df_dict[k] for (k,v) in des_nctf_lst]})
des_net_df = pd.DataFrame({'Word': [k for (k,v) in des_ndf_lst], 'Net Cumulative Term Frequency': [words_net_ctf_dict[k] for (k,v) in des_ndf_lst], 'Net Document Frequency': [v for (k,v) in des_ndf_lst]})

In [20]:
print("The Net cumulative Term Frequency in Descending Order is given by:")
des_net_ctf.style.hide_index()

The Net cumulative Term Frequency in Descending Order is given by:


Word,Net Cumulative Term Frequency,Net Document Frequency
pretti,1.0,0.5
handsom,1.0,0.5
sister,0.5,0.25
mother,0.5,0.25
father,0.5,0.25
brother,0.5,0.25
girl,-0.5,-0.166667
entiti,-0.5,-0.166667
proud,-0.5,-0.166667
nation,-0.5,-0.166667


In [21]:
print("The Net Document Frequency in Descending Order is given by:")
des_net_df.style.hide_index()

The Net Document Frequency in Descending Order is given by:


Word,Net Cumulative Term Frequency,Net Document Frequency
pretti,1.0,0.5
handsom,1.0,0.5
sister,0.5,0.25
mother,0.5,0.25
father,0.5,0.25
brother,0.5,0.25
girl,-0.5,-0.166667
entiti,-0.5,-0.166667
proud,-0.5,-0.166667
nation,-0.5,-0.166667


In [22]:
asc_nctf_lst = [(k, v) for k, v in words_net_ctf_dict.items()][-35:]
asc_nctf_lst.sort(key = lambda x:x[1], reverse = True)
asc_ndf_lst = [(k, v) for k, v in words_net_df_dict.items()][-35:]
asc_ndf_lst.sort(key = lambda x:x[1], reverse = True)
asc_net_ctf = pd.DataFrame({'Word': [k for (k,v) in asc_nctf_lst], 'Net Cumulative Term Frequency': [v for (k,v) in asc_nctf_lst], 'Net Document Frequency': [words_net_df_dict[k] for (k,v) in asc_nctf_lst]})
asc_net_df = pd.DataFrame({'Word': [k for (k,v) in asc_ndf_lst], 'Net Cumulative Term Frequency': [words_net_ctf_dict[k] for (k,v) in asc_ndf_lst], 'Net Document Frequency': [v for (k,v) in asc_ndf_lst]})

In [23]:
print("The Net cumulative Term Frequency in Ascending Order is given by:")
asc_net_ctf.style.hide_index()

The Net cumulative Term Frequency in Ascending Order is given by:


Word,Net Cumulative Term Frequency,Net Document Frequency
pretti,1.0,0.5
handsom,1.0,0.5
sister,0.5,0.25
mother,0.5,0.25
father,0.5,0.25
brother,0.5,0.25
girl,-0.5,-0.166667
entiti,-0.5,-0.166667
proud,-0.5,-0.166667
nation,-0.5,-0.166667


In [24]:
print("The Net Document Frequency in Ascending Order is given by:")
asc_net_df.style.hide_index()

The Net Document Frequency in Ascending Order is given by:


Word,Net Cumulative Term Frequency,Net Document Frequency
pretti,1.0,0.5
handsom,1.0,0.5
sister,0.5,0.25
mother,0.5,0.25
father,0.5,0.25
brother,0.5,0.25
girl,-0.5,-0.166667
entiti,-0.5,-0.166667
proud,-0.5,-0.166667
nation,-0.5,-0.166667
