In [8]:
import string
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Load the raw news dataset from CSV into a DataFrame and display it
df = pd.read_csv(filepath_or_buffer='./dataset/fake_real_news/news.csv')
df

Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0
...,...,...,...
44273,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1
44274,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0
44275,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0
44276,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(44278, 3)

# Text Processing

### Remove URLs

In [4]:
# Remove urls
def remove_urls(column):
    """Return a copy of a text column with every ``http...`` URL stripped out.

    Missing values are replaced by an empty string first. Casting them with
    ``str()`` would turn them into the literal text ``'nan'``, which would
    later be tokenized as if it were a real word.

    Parameters
    ----------
    column : pd.Series
        Column holding raw text (may contain missing or non-string values).

    Returns
    -------
    pd.Series
        String column, same index, with URLs removed.
    """
    as_text = column.fillna('').astype(str)
    # Same pattern as before: 'http' followed by any run of non-whitespace.
    return as_text.str.replace(r'http\S+', '', regex=True)


df['text'] = remove_urls(df['text'])
df['title'] = remove_urls(df['title'])
(df['text'], df['title'])

(0        Donald Trump just couldn t wish all Americans ...
 1        House Intelligence Committee Chairman Devin Nu...
 2        On Friday, it was revealed that former Milwauk...
 3        On Christmas day, Donald Trump announced that ...
 4        Pope Francis used his annual Christmas Day mes...
                                ...                        
 44273    The State Department told the Republican Natio...
 44274    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
 44275     Anti-Trump Protesters Are Tools of the Oligar...
 44276    ADDIS ABABA, Ethiopia —President Obama convene...
 44277    Jeb Bush Is Suddenly Attacking Trump. Here's W...
 Name: text, Length: 44278, dtype: object,
 0         Donald Trump Sends Out Embarrassing New Year’...
 1         Drunk Bragging Trump Staffer Started Russian ...
 2         Sheriff David Clarke Becomes An Internet Joke...
 3         Trump Is So Obsessed He Even Has Obama’s Name...
 4         Pope Francis Just Called Out Donald Trump Dur.

I kept the "VIDEO" marker in the text because it could be a useful signal: a reference to an external resource may indicate that the article is based on real facts.

### Tokenize

In [5]:
def tokenize(text):
    """Split a string into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


# Tokenize the article body first, then the headline.
for column in ('text', 'title'):
    df[column] = df[column].apply(tokenize)

(df['text'], df['title'])

(0        [Donald, Trump, just, couldn, t, wish, all, Am...
 1        [House, Intelligence, Committee, Chairman, Dev...
 2        [On, Friday, ,, it, was, revealed, that, forme...
 3        [On, Christmas, day, ,, Donald, Trump, announc...
 4        [Pope, Francis, used, his, annual, Christmas, ...
                                ...                        
 44273    [The, State, Department, told, the, Republican...
 44274    [The, ‘, P, ’, in, PBS, Should, Stand, for, ‘,...
 44275    [Anti-Trump, Protesters, Are, Tools, of, the, ...
 44276    [ADDIS, ABABA, ,, Ethiopia, —President, Obama,...
 44277    [Jeb, Bush, Is, Suddenly, Attacking, Trump, .,...
 Name: text, Length: 44278, dtype: object,
 0        [Donald, Trump, Sends, Out, Embarrassing, New,...
 1        [Drunk, Bragging, Trump, Staffer, Started, Rus...
 2        [Sheriff, David, Clarke, Becomes, An, Internet...
 3        [Trump, Is, So, Obsessed, He, Even, Has, Obama...
 4        [Pope, Francis, Just, Called, Out, Donald, Tru.

### Stopwords and Punctuation

In [6]:
stop_words = set(stopwords.words('english'))
punctuation = list(string.punctuation)
# Add punctuation and specific characters to the stopwords
stop_words.update(punctuation)
# Pass a list: set.update() on a bare string would split it into characters.
stop_words.update(['‘', '’'])
# NOTE(review): the curly double quotes were disabled in the original and are
# left that way here; confirm whether they should be filtered as well.
# stop_words.update(['“', '”'])

# Characters that count as punctuation when a token is built only from them.
punctuation_chars = set(punctuation) | {'‘', '’'}


def clean(text):
    """Drop stopwords and punctuation tokens from a list of tokens.

    A token is removed when its stripped, lower-cased form is in
    ``stop_words``, or when it consists solely of punctuation characters.
    The second rule catches the multi-character punctuation tokens the
    tokenizer can emit (e.g. ``'...'``, ``'--'``, or the two-character
    quote tokens), which never match the single-character stop-set entries.

    Parameters
    ----------
    text : list of str
        Tokens of one document.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing.
    """
    kept = []
    for w in text:
        token = w.strip().lower()
        if token in stop_words:
            continue
        # all() is True for an empty string, so blank tokens are dropped too.
        if all(ch in punctuation_chars for ch in token):
            continue
        kept.append(w)
    return kept


df['text'] = df['text'].apply(clean)
df['title'] = df['title'].apply(clean)

(df['text'], df['title'])

(0        [Donald, Trump, wish, Americans, Happy, New, Y...
 1        [House, Intelligence, Committee, Chairman, Dev...
 2        [Friday, revealed, former, Milwaukee, Sheriff,...
 3        [Christmas, day, Donald, Trump, announced, wou...
 4        [Pope, Francis, used, annual, Christmas, Day, ...
                                ...                        
 44273    [State, Department, told, Republican, National...
 44274    [P, PBS, Stand, Plutocratic, Pentagon, Posted,...
 44275    [Anti-Trump, Protesters, Tools, Oligarchy, Re...
 44276    [ADDIS, ABABA, Ethiopia, —President, Obama, co...
 44277    [Jeb, Bush, Suddenly, Attacking, Trump, 's, Ma...
 Name: text, Length: 44278, dtype: object,
 0        [Donald, Trump, Sends, Embarrassing, New, Year...
 1        [Drunk, Bragging, Trump, Staffer, Started, Rus...
 2        [Sheriff, David, Clarke, Becomes, Internet, Jo...
 3        [Trump, Obsessed, Even, Obama, Name, Coded, We...
 4        [Pope, Francis, Called, Donald, Trump, Christm.

## Export the processed dataset

In [7]:
# Write the processed frame to CSV without the row index.
# NOTE(review): 'text' and 'title' hold token lists at this point; CSV has no
# list type, so they are presumably stored as their string form -- confirm that
# whatever reads this file parses them back into lists.
df.to_csv(path_or_buf='./dataset/news_processed.csv', index=False)