In [1]:
import re
import string
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import swifter
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages used below. Presumably: 'punkt_tab' backs
# word_tokenize, 'stopwords' backs stopwords.words, 'wordnet'/'omw-1.4' back
# WordNetLemmatizer, and the tagger packages back nltk.pos_tag — TODO confirm.
# Both tagger package names are requested; NOTE(review): this looks like a
# compatibility measure across NLTK versions — confirm which one is needed.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\phku0\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phku0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phku0\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\phku0\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\phku0\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\phku0\AppData\

True

In [2]:
# Columns needed by this notebook; the parquet file may hold more.
NEWS_COLUMNS = ['title', 'summary', 'time_published', 'authors', 'source']

# Ask the parquet reader for just these columns instead of loading the whole
# file and discarding the rest afterwards; this keeps peak memory down on a
# file with ~1.9M rows. The stored index is still restored by the reader.
df = pd.read_parquet(
    '../data_collection/data_warehouse/news_articles.parquet',
    columns=NEWS_COLUMNS,
)

print(df.shape)
df.head()

(1904666, 5)


Unnamed: 0_level_0,title,summary,time_published,authors,source
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Oil prices could determine how markets react t...,The heavy new round of sanctions on Russia by ...,2022-03-01 00:02:02,[Patti Domm],CNBC
2,Zoom provides disappointing revenue forecast f...,Zoom's revenue growth is continuing to slow af...,2022-03-01 00:15:56,[Jordan Novet],CNBC
3,Wall Street rallies as West hits Russia with n...,"The SP 500 rose more than 1%, ending a four-da...",2022-03-01 00:46:51,[],Money Control
4,"Weak manufacturing drags down Q3 GDP growth, o...",India's economy grew 5.4% in the three months ...,2022-03-01 02:23:00,[www.ETCFO.com],Economic Times
5,Singapore banks halt lending for Russian goods...,Singapore's biggest banks are restricting trad...,2022-03-01 02:30:56,[Bloomberg],South China Morning Post


In [3]:
# Shared, module-level helpers reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
punctuation_set = set(string.punctuation)

# Extra tokens filtered out alongside NLTK's English stop words.
custom_stopwords = ['inc', 'ltd', 'co', 'com', 'plc', 'corp', 'llc']
stop_words = set(stopwords.words('english')).union(custom_stopwords)

def get_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Args:
        tag: POS tag string; only its leading character is inspected.

    Returns:
        wordnet.ADJ for tags starting with 'J', wordnet.VERB for 'V',
        wordnet.ADV for 'R', and wordnet.NOUN for everything else
        (tags starting with 'N' included).
    """
    # Nouns are not listed because they share the fallback value.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

def preprocess(text):
    """Turn raw text into a list of lowercase, lemmatized tokens.

    Steps, in order:
      1. Every character that is not an ASCII letter or whitespace is
         replaced by a space (digits and punctuation are dropped).
      2. Whitespace runs are collapsed and the text is lowercased.
      3. The text is tokenized with word_tokenize.
      4. Tokens that are stop words, punctuation, blank, or shorter than
         three characters are discarded.
      5. The remaining tokens are POS-tagged and lemmatized using the
         WordNet POS derived from each tag.

    Args:
        text: The text to process. Any non-str value yields an empty list.

    Returns:
        list: Lemmatized tokens in their original order.
    """
    if not isinstance(text, str):
        return []

    letters_only = re.sub(r'[^A-Za-z\s]', ' ', text).strip()
    normalized = re.sub(r'\s+', ' ', letters_only).strip().lower()

    kept = []
    for word in word_tokenize(normalized):
        # Too short or blank: skip.
        if len(word) <= 2 or not word.strip():
            continue
        # Stop word or bare punctuation: skip.
        if word in stop_words or word in punctuation_set:
            continue
        kept.append(word)

    # Tagging runs on the filtered tokens, so each lemma uses the POS the
    # tagger assigns within that reduced sequence.
    return [
        lemmatizer.lemmatize(word, get_pos(word_tag))
        for word, word_tag in nltk.pos_tag(kept)
    ]

# Raw text column -> column that receives its token list.
text_columns = {
    "title": "title_preprocessed",
    "summary": "summary_preprocessed",
}

# Run preprocess over each text column through swifter's apply accessor.
for raw_column, token_column in text_columns.items():
    df[token_column] = df[raw_column].swifter.apply(preprocess)

Pandas Apply: 100%|██████████| 1904666/1904666 [10:11<00:00, 3115.65it/s]
Pandas Apply: 100%|██████████| 1904666/1904666 [19:46<00:00, 1605.22it/s]


In [4]:
# Disabled experiment, kept for reference: collect every WordNet lemma name for a word.
# def extract_synonyms(word):
#     synonyms = set()
#     for syn in wordnet.synsets(word):
#         for lemma in syn.lemmas():
#             synonyms.add(lemma.name())
#     return synonyms

In [5]:
# Preview the first rows, now including the title_preprocessed and summary_preprocessed columns.
df.head()

Unnamed: 0_level_0,title,summary,time_published,authors,source,title_preprocessed,summary_preprocessed
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Oil prices could determine how markets react t...,The heavy new round of sanctions on Russia by ...,2022-03-01 00:02:02,[Patti Domm],CNBC,"[oil, price, could, determine, market, react, ...","[heavy, new, round, sanction, russia, ally, li..."
2,Zoom provides disappointing revenue forecast f...,Zoom's revenue growth is continuing to slow af...,2022-03-01 00:15:56,[Jordan Novet],CNBC,"[zoom, provide, disappointing, revenue, foreca...","[zoom, revenue, growth, continue, slow, busine..."
3,Wall Street rallies as West hits Russia with n...,"The SP 500 rose more than 1%, ending a four-da...",2022-03-01 00:46:51,[],Money Control,"[wall, street, rally, west, hit, russia, new, ...","[rise, end, four, day, slide, amid, worry, esc..."
4,"Weak manufacturing drags down Q3 GDP growth, o...",India's economy grew 5.4% in the three months ...,2022-03-01 02:23:00,[www.ETCFO.com],Economic Times,"[weak, manufacturing, drag, gdp, growth, oil, ...","[india, economy, grow, three, month, end, dece..."
5,Singapore banks halt lending for Russian goods...,Singapore's biggest banks are restricting trad...,2022-03-01 02:30:56,[Bloomberg],South China Morning Post,"[singapore, bank, halt, lend, russian, good, j...","[singapore, big, bank, restrict, trade, financ..."


In [6]:
# Keep only the two token-list columns; the raw text and metadata columns
# are dropped from `df` from this point on.
preprocessed_columns = ['title_preprocessed', 'summary_preprocessed']
df = df[preprocessed_columns]
df.head()

Unnamed: 0_level_0,title_preprocessed,summary_preprocessed
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[oil, price, could, determine, market, react, ...","[heavy, new, round, sanction, russia, ally, li..."
2,"[zoom, provide, disappointing, revenue, foreca...","[zoom, revenue, growth, continue, slow, busine..."
3,"[wall, street, rally, west, hit, russia, new, ...","[rise, end, four, day, slide, amid, worry, esc..."
4,"[weak, manufacturing, drag, gdp, growth, oil, ...","[india, economy, grow, three, month, end, dece..."
5,"[singapore, bank, halt, lend, russian, good, j...","[singapore, big, bank, restrict, trade, financ..."


In [7]:
# Persist the token-list columns. to_parquet does not create missing
# directories, so make sure the target folder exists first; otherwise the
# write fails on a fresh checkout after the long preprocessing run.
output_path = Path('data/text_preprocessed.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)