In [40]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata
import en_core_web_sm
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [41]:
#not a standard Python module. Taken from DJ's Github repository
#must load from same directory as code or else will not work
from contractions import CONTRACTION_MAP


In [42]:
pwd

'C:\\Users\\adamj\\Documents\\Github\\Springboard\\NLP Projects\\CSV Files'

In [43]:
#load tokenizer
nlp = en_core_web_sm.load()
#nlp = spacy.load('en_core', parse = True, tag=True, entity=True) (DJ'S code)
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True) (DJ's code)
tokenizer = ToktokTokenizer()
##needed to use the comment out line below to get the stopwords
#nltk.download('stopwords') 
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [44]:
#function to remove accented characters
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
#test the function with example
remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [45]:
#function to expand contractions

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
#example
expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

In [46]:
#function to remove special characters

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, ' ', text)
    return text
#example
remove_special_characters("Well this was fun! What do you-think? 123#@!", 
                          remove_digits=True)

'Well this was fun  What do you think        '

In [47]:
#function to remove stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#example
remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

In [48]:
#function to keep mid-sentence words capitalized
def keep_caps(text):
    processed_sentence = ' '.join(
        [word.lower() if not word.isupper() else word for word in sentence.split()])

In [49]:
#Bringing it all together — Building a Text Normalizer (comment out html stripping; not needed)
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        #if html_stripping:
        #    doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            print(doc)
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = ' '.join([doc.lower() if not doc.isupper() else doc for doc in doc.split()])
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        #if text_lemmatization:
        #    doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [50]:
#test function on sample corpus
corpus = ['The sky is BLUE and beautiful.',
          '         Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is      quick!'    
]

In [51]:
normalize_corpus(corpus)

The sky is BLUE and beautiful.
         Love this blue and beautiful sky!
The quick brown fox jumps over the lazy dog.
A king's breakfast has sausages, ham, bacon, eggs, toast and beans
I love green eggs, ham, sausages and bacon!
The brown fox is quick and the blue dog is lazy!
The sky is very blue and the sky is very beautiful today
The dog is lazy but the brown fox is      quick!


['sky BLUE beautiful',
 'love blue beautiful sky',
 'quick brown fox jumps lazy dog',
 'A kings breakfast sausages ham bacon eggs toast beans',
 'I love green eggs ham sausages bacon',
 'brown fox quick blue dog lazy',
 'sky blue sky beautiful today',
 'dog lazy brown fox quick']

In [52]:
os.chdir('C:/Users/adamj/Documents/Github/Springboard/NLP Projects/CSV Files')

In [53]:
os.listdir()

['augmented.csv',
 'Fake.csv',
 'Fake_True_news.csv',
 'Goodreads_preprocessed.csv',
 'Goodreads_preprocessed2.csv',
 'True.csv']

In [54]:
Fake_news=open('Fake.csv','r',encoding='utf8')
Fake_news_csv=pd.read_csv(Fake_news)

In [55]:
True_news=open('True.csv','r',encoding='utf8')
True_news_csv=pd.read_csv(True_news)

In [56]:
pd.set_option('display.max_columns', None)
print(Fake_news_csv.info())
print(Fake_news_csv.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
title      23481 non-null object
text       23481 non-null object
subject    23481 non-null object
date       23481 non-null object
dtypes: object(4)
memory usage: 733.9+ KB
None
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   
5   Racist Alabama Cops Brutalize Black Boy While...   
6   Fresh Off The Golf Course, Trump Lashes Out A...   
7   Trump Said Some INSANELY Racist Stuff Inside ...   
8   Former CIA Director Slams Trump Over UN Bully...   
9   WATCH: Brand-New Pro-Trump Ad Features So Muc...   

                                                text subject  \
0  Donald Trump just couldn t wi

In [57]:
pd.set_option('display.max_columns', None)
print(True_news_csv.info())
print(True_news_csv.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
title      21417 non-null object
text       21417 non-null object
subject    21417 non-null object
date       21417 non-null object
dtypes: object(4)
memory usage: 669.4+ KB
None
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   
5  White House, Congress prepare for talks on spe...   
6  Trump says Russia probe will be fair, but time...   
7  Factbox: Trump on Twitter (Dec 29) - Approval ...   
8         Trump on Twitter (Dec 28) - Global Warming   
9  Alabama official to certify Senator-elect Jone...   

                                                text       subject  \
0  WASHINGTON (Reuters) - 

### Next, we prepare the datasets for merging (vertically). We will create a column called "real/fake" for both datasets. Then for the "True" dataset, we will insert a '0' value in all the rows. In the "Fake" dataset, we will insert a '1' value in all the rows. Then we will merge.

In [58]:
#create empty columns in 'True' and 'Fake' dataframes
True_news_csv["real/fake"] = 0
Fake_news_csv["real/fake"] = 1

In [59]:
#check work
print(Fake_news_csv.info())
print(Fake_news_csv.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 5 columns):
title        23481 non-null object
text         23481 non-null object
subject      23481 non-null object
date         23481 non-null object
real/fake    23481 non-null int64
dtypes: int64(1), object(4)
memory usage: 917.4+ KB
None
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   
5   Racist Alabama Cops Brutalize Black Boy While...   
6   Fresh Off The Golf Course, Trump Lashes Out A...   
7   Trump Said Some INSANELY Racist Stuff Inside ...   
8   Former CIA Director Slams Trump Over UN Bully...   
9   WATCH: Brand-New Pro-Trump Ad Features So Muc...   

                                            

In [60]:
print(True_news_csv.info())
print(True_news_csv.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 5 columns):
title        21417 non-null object
text         21417 non-null object
subject      21417 non-null object
date         21417 non-null object
real/fake    21417 non-null int64
dtypes: int64(1), object(4)
memory usage: 836.7+ KB
None
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   
5  White House, Congress prepare for talks on spe...   
6  Trump says Russia probe will be fair, but time...   
7  Factbox: Trump on Twitter (Dec 29) - Approval ...   
8         Trump on Twitter (Dec 28) - Global Warming   
9  Alabama official to certify Senator-elect Jone...   

                                            

In [61]:
#merge vertically
Fake_True_news = pd.concat([Fake_news_csv, True_news_csv], ignore_index=True)

In [62]:
print(Fake_True_news.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
title        44898 non-null object
text         44898 non-null object
subject      44898 non-null object
date         44898 non-null object
real/fake    44898 non-null int64
dtypes: int64(1), object(4)
memory usage: 1.7+ MB
None


In [63]:
#shuffle all rows in dataframe since we have all Fake News on top, and all True News on the bottom
pd.set_option('display.max_columns', None)
Fake_True_news_shuffled = Fake_True_news.sample(frac=1).reset_index(drop=True)
print(Fake_True_news_shuffled.head(10))

                                               title  \
0  WHOA! 8 ACTUAL QUOTES FROM HILLARY That Prove ...   
1  PULITZER PRIZE WINNING AUTHOR TONI MORRISON: “...   
2  Prosecutors will not pursue Bridgegate charges...   
3  Pope to meet head of Myanmar army, Rohingya re...   
4  U.S. aerospace industry urges Trump to help Ex...   
5  DEMOCRATS CAUGHT Paying Halfway House Patients...   
6  U.S. defense elite rally behind Trump's unusua...   
7  Vatican prepared in case of Barcelona-style at...   
8  BENGHAZI SPOKESLIAR SUSAN RICE TELLS CNN: ‘We ...   
9  Trump disbands business councils after CEOs qu...   

                                                text       subject  \
0  Hillary came out with a heavily edited TV ad y...     left-news   
1  The recipient of the Presidential Medal of Fre...     left-news   
2  (Reuters) - New Jersey prosecutors on Friday s...  politicsNews   
3  VATICAN CITY (Reuters) - Pope Francis will mee...     worldnews   
4  WASHINGTON (Reuters) - The chi

### Explore the data. 

In [64]:
#convert 'date' column from object to datetime
from datetime import datetime

Fake_True_news_shuffled["date"] = datetime.strptime('December 31, 2017', '%B %d, %Y')

In [71]:
print(Fake_True_news_shuffled.head())

                                               title  \
0  WHOA! 8 ACTUAL QUOTES FROM HILLARY That Prove ...   
1  PULITZER PRIZE WINNING AUTHOR TONI MORRISON: “...   
2  Prosecutors will not pursue Bridgegate charges...   
3  Pope to meet head of Myanmar army, Rohingya re...   
4  U.S. aerospace industry urges Trump to help Ex...   

                                                text       subject       date  \
0  Hillary came out with a heavily edited TV ad y...     left-news 2017-12-31   
1  The recipient of the Presidential Medal of Fre...     left-news 2017-12-31   
2  (Reuters) - New Jersey prosecutors on Friday s...  politicsNews 2017-12-31   
3  VATICAN CITY (Reuters) - Pope Francis will mee...     worldnews 2017-12-31   
4  WASHINGTON (Reuters) - The chief executive of ...  politicsNews 2017-12-31   

   real/fake  
0          1  
1          1  
2          0  
3          0  
4          0  


In [66]:
# Get the number of fake and real news articles
Fake_True_news_shuffled['real/fake'].groupby(Fake_True_news_shuffled['real/fake'],).count()

real/fake
0    21417
1    23481
Name: real/fake, dtype: int64

In [67]:
#get the table of counts for the subject
Fake_True_news_shuffled['subject'].groupby(Fake_True_news_shuffled['subject']).count()

subject
Government News     1570
Middle-east          778
News                9050
US_News              783
left-news           4459
politics            6841
politicsNews       11272
worldnews          10145
Name: subject, dtype: int64

In [68]:
#make a 2x2 table of 'real/fake' and 'subject'
tab = pd.crosstab(Fake_True_news_shuffled['real/fake'], Fake_True_news_shuffled['subject'])
table = sm.stats.Table(tab)
print(table.table_orig)

subject    Government News  Middle-east  News  US_News  left-news  politics  \
real/fake                                                                     
0                        0            0     0        0          0         0   
1                     1570          778  9050      783       4459      6841   

subject    politicsNews  worldnews  
real/fake                           
0                 11272      10145  
1                     0          0  


### Good to know. We won't even include that column in the training or test since it's a dead giveaway.

In [72]:
Fake_True_news_shuffled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
title        44898 non-null object
text         44898 non-null object
subject      44898 non-null object
date         44898 non-null datetime64[ns]
real/fake    44898 non-null int64
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 1.7+ MB


### Next, we want to prepare the text for analysis. There are no missing values in any columns, which is excellent. But should we not activate some of the functions? Let's look at the text of the articles again.

In [None]:
print(Fake_True_news_shuffled['text'].head(30))
print(Fake_True_news_shuffled['title'].head(30))
print(Fake_True_news_shuffled['title'].head(30))
print(Fake_True_news_shuffled['text'].head(30))

### We notice that the 'fake' articles and titles have words with all CAPS in them. This could be significant. We will alter the normalize_corpus function to keep those in all CAPS (already done as of 9/17/2020).

In [None]:
%%time

#prepare text for analysis
Fake_True_news_shuffled['title_nlp']=normalize_corpus(Fake_True_news_shuffled['title'])
Fake_True_news_shuffled['text_nlp']=normalize_corpus(Fake_True_news_shuffled['text'])

In [74]:
Fake_True_news_shuffled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 7 columns):
title        44898 non-null object
text         44898 non-null object
subject      44898 non-null object
date         44898 non-null datetime64[ns]
real/fake    44898 non-null int64
title_nlp    44898 non-null object
text_nlp     44898 non-null object
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 2.4+ MB


In [75]:
# show a sample difference
Fake_True_news_shuffled.iloc[1:10][['title', 'title_nlp']].to_dict()

{'title': {1: 'PULITZER PRIZE WINNING AUTHOR TONI MORRISON: “I want to see a cop shoot a white unarmed teenager in the back”',
  2: 'Prosecutors will not pursue Bridgegate charges against New Jersey governor',
  3: 'Pope to meet head of Myanmar army, Rohingya refugees: Vatican',
  4: 'U.S. aerospace industry urges Trump to help Ex-Im Bank resume work',
  5: 'DEMOCRATS CAUGHT Paying Halfway House Patients $300 To Vote For Hillary [VIDEO]',
  6: "U.S. defense elite rally behind Trump's unusual Pentagon pick",
  7: 'Vatican prepared in case of Barcelona-style attack: Swiss Guard chief',
  8: 'BENGHAZI SPOKESLIAR SUSAN RICE TELLS CNN: ‘We should expect’ Iran To Use Funds It Gets For Terrorist Operations [VIDEO]',
  9: 'Trump disbands business councils after CEOs quit in protest'},
 'title_nlp': {1: 'PULITZER PRIZE WINNING AUTHOR TONI MORRISON I want see cop shoot white unarmed teenager back',
  2: 'prosecutors not pursue bridgegate charges new jersey governor',
  3: 'pope meet head myanmar

In [None]:
#make a wordcloud to visualize the differences between real and fake news
Fake_news_nlp=Fake_True_news[Fake_True_news['real/fake']==1]
text = Fake_news_nlp['title_nlp'].values
wordcloud = WordCloud(
    width = 1500,
    height = 1000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (30, 20),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
#make a wordcloud to visualize the differences between real and fake news
text = Fake_news_nlp['text_nlp'].values
wordcloud = WordCloud(
    width = 1500,
    height = 1000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (30, 20),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
#make a wordcloud to visualize the differences between real and fake news
True_news_nlp=Fake_True_news[Fake_True_news['real/fake']==0]
text = True_news_nlp['title_nlp'].values
wordcloud = WordCloud(
    width = 1500,
    height = 1000,
    background_color = 'white',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (30, 20),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
#make a wordcloud to visualize the differences between real and fake news
True_news_nlp=Fake_True_news[Fake_True_news['real/fake']==0]
text = True_news_nlp['text_nlp'].values
wordcloud = WordCloud(
    width = 1500,
    height = 1000,
    background_color = 'white',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (30, 20),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [76]:
#export to csv the merged and text pre-processed data set
Fake_True_news_shuffled.to_csv("Fake_True_news_shuffled.csv",index=False)