In [1]:
import pandas as pd
import re
from pathlib import Path

from tqdm import tqdm
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/aleksei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

The dataset reading is taken from https://www.kaggle.com/code/therealsampat/fake-news-detection.

In [2]:
df_fake = pd.read_csv('../data/Fake.csv')
df_true = pd.read_csv('../data/True.csv')

df_fake['class'] = 0
df_true['class'] = 1

In [3]:
df_fake = df_fake.drop_duplicates('text')
df_true = df_true.drop_duplicates('text')

In [4]:
df_fake.shape, df_true.shape

((17455, 5), (21192, 5))

In [5]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.sample(10)

Unnamed: 0,title,text,subject,date,class
15529,Bodies of Argentine men killed in New York att...,BUENOS AIRES (Reuters) - The bodies of five Ar...,worldnews,"November 6, 2017",1
6117,White House defends statement on audience for ...,WASHINGTON (Reuters) - White House spokesman S...,politicsNews,"January 23, 2017",1
11248,New York's Cuomo proposes $145.3 billion budge...,"ALBANY, N.Y. (Reuters) - New York Governor And...",politicsNews,"January 13, 2016",1
14400,Macron says not in French interests for German...,PARIS (Reuters) - President Emmanuel Macron sa...,worldnews,"November 20, 2017",1
11634,"Myanmar says still working with U.N., wants a ...",YANGON (Reuters) - Myanmar wants to continue w...,worldnews,"December 21, 2017",1
3931,WATCH: Bill Maher Hilariously Thanks Trump Fo...,Bill Maher surprisingly thanked Donald Trump o...,News,"November 5, 2016",0
10677,Spy agencies say Clinton emails closely matche...,WASHINGTON (Reuters) - U.S. spy agencies have ...,politicsNews,"February 24, 2016",1
10452,Missouri Democrats filibuster proposed gay dis...,(Reuters) - Democratic senators in Missouri we...,politicsNews,"March 8, 2016",1
2944,WATCH: Trump Press Sec. Says Hispanics Aren’t...,"For the first time since 1988, Latinos will be...",News,"January 19, 2017",0
15529,OBAMA’S REVEALING LETTER TO BROTHER TELLS WHY ...,"Well, it s not like we didn t know this but it...",politics,"Jun 28, 2015",0


In [6]:
df = df_merge.drop(['title', 'subject', 'date'], axis=1)
df.isnull().sum()

text     0
class    0
dtype: int64

In [7]:
df = df.sample(frac=1)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [8]:
df['part'] = 'train'
n = len(df)
train_n = int(n * 0.8)
val_n = int(n * 0.1)
test_n = n - train_n - val_n
df.loc[(train_n < df.index) & (df.index < train_n + val_n), 'part'] = 'val'
df.loc[train_n + val_n <= df.index, 'part'] = 'test'

In [9]:
df = df.sample(frac=1)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [10]:
df.head(10)

Unnamed: 0,text,class,part
0,WASHINGTON (Reuters) - Backers of a U.S.-Russi...,1,train
1,DHAKA (Reuters) - Bangladesh and Myanmar have ...,1,train
2,Fox News went completely predictable in openin...,0,train
3,"Ben Carson, the head of the Department of Hous...",0,train
4,San Francisco 49ers quarterback Colin Kaeperni...,0,val
5,(Reuters) - Kentucky state Representative Dan ...,1,train
6,Why would this actor go to a place where viole...,0,train
7,Conservative filmmaker Dennis Michael Lynch ha...,0,train
8,Wait until you read this woman s biography and...,0,val
9,"Meanwhile, back at CNN Russia Russia Russia!Th...",0,train


In [11]:
len(df[df['part'] == 'train']), len(df[df['part'] == 'val']), len(df[df['part'] == 'test'])

(30918, 3863, 3866)

In [12]:
df.head(10).to_csv('../data/sample.csv', index=None)

In [13]:
for index, row in df.iterrows():
    if index > 10:
        break
    print(index, row['text'], row['class'], '\n')

0 WASHINGTON (Reuters) - Backers of a U.S.-Russian plan to build nuclear reactors across the Middle East bragged after the U.S. election they had backing from Donald Trump s national security adviser Michael Flynn for a project that required lifting sanctions on Russia, documents reviewed by Reuters show. The documents, which have not previously been made public, reveal new aspects of the plan, including the proposed involvement of a Russian company currently under U.S. sanctions to manufacture nuclear equipment. That company, major engineering and construction firm OMZ OAO, declined to comment. The documents do not show whether Flynn, a retired Army lieutenant general, took concrete steps to push the proposal with Trump and his aides. But they do show that Washington-based nuclear power consultancy ACU Strategic Partners believed that both Flynn, who had worked as an adviser to the firm as late as mid-2016, and Trump were firmly in its corner.  Donald Trump s election as president is 

In [14]:
example_text = df.iloc[df[df['text'] == 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '].index, 0].values[0]
print(example_text)

Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit 


We need to take care about names (i.e. @jamiedupree should be treated as separate token), hash tags (#Inauguration is one token here). Also let's say that we want to keep web sites as one token (pic.twitter.com/APVtyyYote or https://t.co/1dvY5lxdKo).

In [15]:
def split_into_sentences(text):
    # so the website will not split into two separate sentences by comma:
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
    sentences = sentence_endings.split(text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

sentences = split_into_sentences(example_text)
for sentence in sentences:
    print(sentence)

Boos and chants of  Lock her up!
were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.
#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit


In [16]:
def split_into_words(sentences):
    # regular expression to match complex URLs, simple URLs, hashtags, Twitter handles, and words
    word_pattern = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|-?\w+\'?\w*')
    tokenized_sentences = []
    for sentence in sentences:
        words = word_pattern.findall(sentence)
        tokenized_sentences.append(words)
    return tokenized_sentences

tokenized = split_into_words(sentences)
for tokens in tokenized:
    print(tokens)

['Boos', 'and', 'chants', 'of', 'Lock', 'her', 'up']
['were', 'heard', 'in', 'the', 'crowd', 'assembled', 'at', 'the', 'West', 'Front', 'of', 'the', 'U', 'S', 'Capitol', 'Friday', 'morning', 'when', 'defeated', 'Democratic', 'Party', 'presidential', 'nominee', 'Hillary', 'Clinton', 'was', 'introduced', 'at', 'the', 'inaugural', 'ceremony', 'for', 'President', '-elect', 'Donald', 'Trump']
['#InaugurationDay', 'Lock', 'her', 'up', 'pic.twitter.com/APVtyyYote', 'Bill', 'Simms', '@Mittens1245', 'January', '20', '2017The', 'crowd', 'on', 'the', 'mall', 'booed', 'when', 'the', 'jumbotron', 'showed', 'a', 'close', '-up', 'shot', 'of', 'Hillary', 'Clinton', 'at', '#Inauguration', 'https://t.co/1dvY5lxdKo', 'gpbnews', '@gpbnews', 'January', '20', '2017Some', 'in', 'crowd', 'chanting', 'LOCK', 'HER', 'UP', 'as', 'Hillary', 'Clinton', 'arrives', 'Jamie', 'Dupree', '@jamiedupree', 'January', '20', '2017Via', 'Gateway', 'Pundit']


In [17]:
def stem_words(tokenized_sentences):
    stemmer = SnowballStemmer("english")
    stemmed_sentences = []
    for tokens in tokenized_sentences:
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        stemmed_sentences.append(stemmed_tokens)
    return stemmed_sentences

stemmed = stem_words(tokenized)
for s in stemmed:
    print(s)

['boo', 'and', 'chant', 'of', 'lock', 'her', 'up']
['were', 'heard', 'in', 'the', 'crowd', 'assembl', 'at', 'the', 'west', 'front', 'of', 'the', 'u', 's', 'capitol', 'friday', 'morn', 'when', 'defeat', 'democrat', 'parti', 'presidenti', 'nomine', 'hillari', 'clinton', 'was', 'introduc', 'at', 'the', 'inaugur', 'ceremoni', 'for', 'presid', '-elect', 'donald', 'trump']
['#inaugurationday', 'lock', 'her', 'up', 'pic.twitter.com/apvtyyyot', 'bill', 'simm', '@mittens1245', 'januari', '20', '2017the', 'crowd', 'on', 'the', 'mall', 'boo', 'when', 'the', 'jumbotron', 'show', 'a', 'close', '-up', 'shot', 'of', 'hillari', 'clinton', 'at', '#inaugur', 'https://t.co/1dvy5lxdko', 'gpbnew', '@gpbnew', 'januari', '20', '2017some', 'in', 'crowd', 'chant', 'lock', 'her', 'up', 'as', 'hillari', 'clinton', 'arriv', 'jami', 'dupre', '@jamiedupre', 'januari', '20', '2017via', 'gateway', 'pundit']


In [18]:
def lemmatize_tokens(tokenized_sentences):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentences = []
    for tokens in tokenized_sentences:
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        lemmatized_sentences.append(lemmatized_tokens)
    return lemmatized_sentences

lemmatized = lemmatize_tokens(tokenized)
for l in lemmatized:
    print(l)

['Boos', 'and', 'chant', 'of', 'Lock', 'her', 'up']
['were', 'heard', 'in', 'the', 'crowd', 'assembled', 'at', 'the', 'West', 'Front', 'of', 'the', 'U', 'S', 'Capitol', 'Friday', 'morning', 'when', 'defeated', 'Democratic', 'Party', 'presidential', 'nominee', 'Hillary', 'Clinton', 'wa', 'introduced', 'at', 'the', 'inaugural', 'ceremony', 'for', 'President', '-elect', 'Donald', 'Trump']
['#InaugurationDay', 'Lock', 'her', 'up', 'pic.twitter.com/APVtyyYote', 'Bill', 'Simms', '@Mittens1245', 'January', '20', '2017The', 'crowd', 'on', 'the', 'mall', 'booed', 'when', 'the', 'jumbotron', 'showed', 'a', 'close', '-up', 'shot', 'of', 'Hillary', 'Clinton', 'at', '#Inauguration', 'https://t.co/1dvY5lxdKo', 'gpbnews', '@gpbnews', 'January', '20', '2017Some', 'in', 'crowd', 'chanting', 'LOCK', 'HER', 'UP', 'a', 'Hillary', 'Clinton', 'arrives', 'Jamie', 'Dupree', '@jamiedupree', 'January', '20', '2017Via', 'Gateway', 'Pundit']


In [19]:
def process_text(text):
    sentences = split_into_sentences(text)
    tokenized = split_into_words(sentences)
    stemmed = stem_words(tokenized)
    lemmatized = lemmatize_tokens(tokenized)
    data = []
    for i in range(len(tokenized)):
        for j in range(len(tokenized[i])):
            row = {
                'Token': tokenized[i][j],
                'Stem': stemmed[i][j],
                'Lemma': lemmatized[i][j]
            }
            data.append(row)
    df = pd.DataFrame(data)
    return df

In [20]:
df.head()

Unnamed: 0,text,class,part
0,WASHINGTON (Reuters) - Backers of a U.S.-Russi...,1,train
1,DHAKA (Reuters) - Bangladesh and Myanmar have ...,1,train
2,Fox News went completely predictable in openin...,0,train
3,"Ben Carson, the head of the Department of Hous...",0,train
4,San Francisco 49ers quarterback Colin Kaeperni...,0,val


In [21]:
def write_dataset(df, part):
    for index, row in tqdm(df.iterrows(), total=len(df)):
        text = row['text']
        class_ = 'true' if row['class'] else 'fake'
        dir = f'../assets/annotated-corpus/{part}/{class_}'
        Path(dir).mkdir(parents=True, exist_ok=True)
        path = dir / Path(str(index) + '.tsv')
        df_ = process_text(text)
        df_.to_csv(path, index=None, sep='\t')

In [22]:
%%time
for part in ['train', 'val', 'test']:
    write_dataset(df[df['part'] == part], part)

  0%|          | 0/30918 [00:00<?, ?it/s]

100%|██████████| 30918/30918 [04:00<00:00, 128.77it/s]
100%|██████████| 3863/3863 [00:32<00:00, 119.87it/s]
100%|██████████| 3866/3866 [00:36<00:00, 105.87it/s]

CPU times: user 4min 41s, sys: 15.3 s, total: 4min 56s
Wall time: 5min 8s



