# Laboratory work #1 (text segmentation and annotation)

In [1]:
import pandas as pd
import re
from pathlib import Path

from tqdm import tqdm
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/aleksei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

The dataset reading is taken from https://www.kaggle.com/code/therealsampat/fake-news-detection.

In [2]:
df_fake = pd.read_csv('../data/Fake.csv')
df_true = pd.read_csv('../data/True.csv')

df_fake['class'] = 0
df_true['class'] = 1

In [3]:
df_fake = df_fake.drop_duplicates('text')
df_true = df_true.drop_duplicates('text')

In [4]:
df_fake.shape, df_true.shape

((17455, 5), (21192, 5))

In [5]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.sample(10)

Unnamed: 0,title,text,subject,date,class
7303,Obama officials work against time to wrap bank...,WASHINGTON (Reuters) - U.S. officials are stri...,politicsNews,"November 13, 2016",1
3346,Bill Clinton HUMILIATES Trump During Surprise...,Bill Clinton finally spoke up about Donald Tru...,News,"December 19, 2016",0
17480,German minister upsets fellow conservatives ov...,BERLIN (Reuters) - German Interior Minister Th...,worldnews,"October 14, 2017",1
4241,Trump administration drops North Carolina 'bat...,(Reuters) - The Trump administration on Friday...,politicsNews,"April 14, 2017",1
17860,Philippines doctor linked to New York attack p...,MANILA (Reuters) - A Filipino accused by the U...,worldnews,"October 10, 2017",1
11523,AWESOME PHOTOS CAPTURE A “Badass” Trump And Hi...,While visiting the USS Gerald R. Ford today do...,politics,"Mar 2, 2017",0
22209,Flynn’s Out: Is ‘The New Détente’ Really Dead ...,Andrew Korybko 21st Century Wire The global m...,US_News,"February 15, 2017",0
12707,DRUDGE THREATENS HILLARY…He’s About To Drop Bo...,Poor old Crooked Hillary the hits are coming f...,politics,"Oct 17, 2016",0
11319,Australian woman escapes the noose for drug tr...,KUALA LUMPUR (Reuters) - An Australian mother ...,worldnews,"December 27, 2017",1
8564,White House candidate Clinton sees big boost i...,WASHINGTON (Reuters) - U.S. Democratic preside...,politicsNews,"August 2, 2016",1


In [6]:
df = df_merge.drop(['title', 'subject', 'date'], axis=1)
df.isnull().sum()

text     0
class    0
dtype: int64

In [7]:
df = df.sample(frac=1)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [8]:
df['part'] = 'train'
n = len(df)
train_n = int(n * 0.8)
val_n = int(n * 0.1)
test_n = n - train_n - val_n
df.loc[(train_n < df.index) & (df.index < train_n + val_n), 'part'] = 'val'
df.loc[train_n + val_n <= df.index, 'part'] = 'test'

In [9]:
df = df.sample(frac=1)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [10]:
df.head(10)

Unnamed: 0,text,class,part
0,WASHINGTON (Reuters) - President Donald Trump ...,1,test
1,BEIJING (Reuters) - Beijing police investigati...,1,train
2,"HARRISBURG, Pa. (Reuters) - Green Party candid...",1,val
3,"TOULON, France (Reuters) - France s defense mi...",1,train
4,WASHINGTON (Reuters) - New York State Attorney...,1,train
5,LJUBLJANA (Reuters) - Slovenian President Boru...,1,train
6,TORONTO (Reuters) - U.S. Republican presidenti...,1,train
7,SYDNEY (Reuters) - Australia s main Labor oppo...,1,val
8,There is most definitely no love lost between ...,0,train
9,MOSCOW (Reuters) - No firm date has been set y...,1,train


In [11]:
len(df[df['part'] == 'train']), len(df[df['part'] == 'val']), len(df[df['part'] == 'test'])

(30918, 3863, 3866)

In [12]:
df.head(10).to_csv('../data/sample.csv', index=None)

In [13]:
for index, row in df.iterrows():
    if index > 10:
        break
    print(index, row['text'], row['class'], '\n')

0 WASHINGTON (Reuters) - President Donald Trump on Wednesday ordered Education Secretary Betsy DeVos to review the U.S. government’s role in school policy, which supporters cheered as the first step in creating more local control in education and critics worried could lead to lower quality schools in poorer neighborhoods. DeVos has 300 days “to review and, if necessary, modify and repeal regulations and guidance issued by the Department of Education with a clear mandate to identify places where D.C. has overstepped its legal authority,” said Rob Goad, a Department of Education official, according to a transcript of a White House call with reporters. The second most powerful Republican in the House of Representatives, California’s Kevin McCarthy, said the federal government had in recent years exceeded its legal authority in creating regulations and guidance “Different people in different states and communities will have different goals and ways of achieving those goals. That is somethi

In [14]:
example_text = df.iloc[df[df['text'] == 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '].index, 0].values[0]
print(example_text)

Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit 


We need to take care about names (i.e. @jamiedupree should be treated as separate token), hash tags (#Inauguration is one token here). Also let's say that we want to keep web sites as one token (pic.twitter.com/APVtyyYote or https://t.co/1dvY5lxdKo).

In [15]:
def split_into_sentences(text):
    # so the website will not split into two separate sentences by comma:
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
    sentences = sentence_endings.split(text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

sentences = split_into_sentences(example_text)
for sentence in sentences:
    print(sentence)

Boos and chants of  Lock her up!
were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.
#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit


In [16]:
def split_into_words(sentences):
    # Regular expression to match URLs, hashtags, handles, words, and standalone punctuation
    word_pattern = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|[\w\'-]+|[.,!?;]')
    tokenized_sentences = []
    for sentence in sentences:
        words = word_pattern.findall(sentence)
        tokenized_sentences.append(words)
    return tokenized_sentences

tokenized = split_into_words(sentences)
for tokens in tokenized:
    print(tokens)

['Boos', 'and', 'chants', 'of', 'Lock', 'her', 'up', '!']
['were', 'heard', 'in', 'the', 'crowd', 'assembled', 'at', 'the', 'West', 'Front', 'of', 'the', 'U', '.', 'S', '.', 'Capitol', 'Friday', 'morning', 'when', 'defeated', 'Democratic', 'Party', 'presidential', 'nominee', 'Hillary', 'Clinton', 'was', 'introduced', 'at', 'the', 'inaugural', 'ceremony', 'for', 'President', '-elect', 'Donald', 'Trump', '.']
['#InaugurationDay', 'Lock', 'her', 'up', 'pic.twitter.com/APVtyyYote', 'Bill', 'Simms', '@Mittens1245', 'January', '20', ',', '2017The', 'crowd', 'on', 'the', 'mall', 'booed', 'when', 'the', 'jumbotron', 'showed', 'a', 'close', '-up', 'shot', 'of', 'Hillary', 'Clinton', 'at', '#Inauguration', 'https://t.co/1dvY5lxdKo', 'gpbnews', '@gpbnews', 'January', '20', ',', '2017Some', 'in', 'crowd', 'chanting', 'LOCK', 'HER', 'UP', 'as', 'Hillary', 'Clinton', 'arrives', 'Jamie', 'Dupree', '@jamiedupree', 'January', '20', ',', '2017Via', 'Gateway', 'Pundit']


In [19]:
def stem_words(tokenized_sentences, language="english"):
    stemmer = SnowballStemmer(language)
    stemmed_sentences = []
    for tokens in tokenized_sentences:
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        stemmed_sentences.append(stemmed_tokens)
    return stemmed_sentences

stemmed = stem_words(tokenized)
for s in stemmed:
    print(s)

['boo', 'and', 'chant', 'of', 'lock', 'her', 'up', '!']
['were', 'heard', 'in', 'the', 'crowd', 'assembl', 'at', 'the', 'west', 'front', 'of', 'the', 'u', '.', 's', '.', 'capitol', 'friday', 'morn', 'when', 'defeat', 'democrat', 'parti', 'presidenti', 'nomine', 'hillari', 'clinton', 'was', 'introduc', 'at', 'the', 'inaugur', 'ceremoni', 'for', 'presid', '-elect', 'donald', 'trump', '.']
['#inaugurationday', 'lock', 'her', 'up', 'pic.twitter.com/apvtyyyot', 'bill', 'simm', '@mittens1245', 'januari', '20', ',', '2017the', 'crowd', 'on', 'the', 'mall', 'boo', 'when', 'the', 'jumbotron', 'show', 'a', 'close', '-up', 'shot', 'of', 'hillari', 'clinton', 'at', '#inaugur', 'https://t.co/1dvy5lxdko', 'gpbnew', '@gpbnew', 'januari', '20', ',', '2017some', 'in', 'crowd', 'chant', 'lock', 'her', 'up', 'as', 'hillari', 'clinton', 'arriv', 'jami', 'dupre', '@jamiedupre', 'januari', '20', ',', '2017via', 'gateway', 'pundit']


In [21]:
def lemmatize_tokens(tokenized_sentences):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentences = []
    for tokens in tokenized_sentences:
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        lemmatized_sentences.append(lemmatized_tokens)
    return lemmatized_sentences

lemmatized = lemmatize_tokens(tokenized)
for l in lemmatized:
    print(l)

['Boos', 'and', 'chant', 'of', 'Lock', 'her', 'up', '!']
['were', 'heard', 'in', 'the', 'crowd', 'assembled', 'at', 'the', 'West', 'Front', 'of', 'the', 'U', '.', 'S', '.', 'Capitol', 'Friday', 'morning', 'when', 'defeated', 'Democratic', 'Party', 'presidential', 'nominee', 'Hillary', 'Clinton', 'wa', 'introduced', 'at', 'the', 'inaugural', 'ceremony', 'for', 'President', '-elect', 'Donald', 'Trump', '.']
['#InaugurationDay', 'Lock', 'her', 'up', 'pic.twitter.com/APVtyyYote', 'Bill', 'Simms', '@Mittens1245', 'January', '20', ',', '2017The', 'crowd', 'on', 'the', 'mall', 'booed', 'when', 'the', 'jumbotron', 'showed', 'a', 'close', '-up', 'shot', 'of', 'Hillary', 'Clinton', 'at', '#Inauguration', 'https://t.co/1dvY5lxdKo', 'gpbnews', '@gpbnews', 'January', '20', ',', '2017Some', 'in', 'crowd', 'chanting', 'LOCK', 'HER', 'UP', 'a', 'Hillary', 'Clinton', 'arrives', 'Jamie', 'Dupree', '@jamiedupree', 'January', '20', ',', '2017Via', 'Gateway', 'Pundit']


In [22]:
def process_text(text):
    sentences = split_into_sentences(text)
    tokenized = split_into_words(sentences)
    stemmed = stem_words(tokenized)
    lemmatized = lemmatize_tokens(tokenized)
    dfs = []
    for i in range(len(tokenized)):
        data = []
        for j in range(len(tokenized[i])):
            row = {
                'Token': tokenized[i][j],
                'Stem': stemmed[i][j],
                'Lemma': lemmatized[i][j]
            }
            data.append(row)
        df = pd.DataFrame(data)
        dfs.append(df)
    return dfs

In [23]:
df.head()

Unnamed: 0,text,class,part
0,WASHINGTON (Reuters) - President Donald Trump ...,1,test
1,BEIJING (Reuters) - Beijing police investigati...,1,train
2,"HARRISBURG, Pa. (Reuters) - Green Party candid...",1,val
3,"TOULON, France (Reuters) - France s defense mi...",1,train
4,WASHINGTON (Reuters) - New York State Attorney...,1,train


In [24]:
def write_dataset(df, part):
    for index, row in tqdm(df.iterrows(), total=len(df)):
        text = row['text']
        class_ = 'true' if row['class'] else 'fake'
        dir = f'../assets/annotated-corpus/{part}/{class_}'
        Path(dir).mkdir(parents=True, exist_ok=True)
        path = dir / Path(str(index) + '.tsv')

        sentence_dfs = process_text(text)
        with open(path, 'w') as f:
            for sentence_df in sentence_dfs:
                sentence_df.to_csv(f, index=None, sep='\t', header=None)
                f.write('\n')

In [27]:
%%time
for part in ['train', 'val', 'test']:
    write_dataset(df[df['part'] == part], part)

100%|██████████| 30918/30918 [13:09<00:00, 39.19it/s]
100%|██████████| 3863/3863 [01:23<00:00, 46.40it/s]
100%|██████████| 3866/3866 [01:30<00:00, 42.91it/s]

CPU times: user 14min 22s, sys: 40.4 s, total: 15min 2s
Wall time: 16min 2s



