In [1]:
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pyspark.sql.functions import udf
import nltk
import string

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /common/home/vig4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /common/home/vig4/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
ROOT = '../dataset/combined'

In [3]:
raw_df = spark.read.json(f'{ROOT}/combined_raw')
raw_df.show()

+--------------------+
|             article|
+--------------------+
|U.S. investment b...|
|Mexican coffee pr...|
|Oil companies are...|
|Adam Dunn … you a...|
|The surprising re...|
|» County’s annual...|
|Dropcam will prob...|
|Wild Insects need...|
|British scientist...|
|The Big Bang Theo...|
|A nurse made the ...|
|Mother-of-three D...|
|The Scots were ki...|
|He may not be com...|
|The wreckage of t...|
|Inter Milan were ...|
|18-year-old Aditi...|
|Tiger Woods grima...|
|Danny Cipriani to...|
|US jobs numbers a...|
+--------------------+
only showing top 20 rows



In [4]:
raw_df.count()

40954102

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    remove_stopwords = [lemmatizer.lemmatize(t.lower()) for t in tokens if not t in STOPWORDS and t not in string.punctuation and len(t) > 2]

    return ','.join(remove_stopwords)

In [7]:
preprocess_udf = udf(preprocess_text)
processed_df = raw_df.withColumn('article', preprocess_udf(raw_df['article']))
processed_df.show()

+--------------------+
|             article|
+--------------------+
|u.s.,investment,b...|
|mexican,coffee,pr...|
|oil,company,large...|
|adam,dunn,sure,6-...|
|the,surprising,re...|
|county,annual,eve...|
|dropcam,probably,...|
|wild,insect,neede...|
|british,scientist...|
|the,big,bang,theo...|
|nurse,discovery,c...|
|mother-of-three,d...|
|the,scot,kicked,g...|
|coming,home,covet...|
|the,wreckage,car,...|
|inter,milan,held,...|
|18-year-old,aditi...|
|tiger,wood,grimac...|
|danny,cipriani,to...|
|job,number,add,is...|
+--------------------+
only showing top 20 rows



In [8]:
raw_df_2000 = raw_df.limit(2000).collect()
processed_df_2000 = processed_df.limit(2000).collect()

for i in range(1000, 1005):
    print(f'Raw text: {raw_df_2000[i]["article"]}')
    print(f'Cleaned text: {processed_df_2000[i]["article"]}')
    print()

Raw text: When a swinging door takes up too much space, just slip it inside the wall with a pocket door.
Cleaned text: when,swinging,door,take,space,slip,inside,wall,pocket,door

Raw text: BUENOS AIRES, Puntarenas – As government mediators return to San José and peace slowly returns to the Salitre indigenous reserve in Costa Rica’s southeastern region, the charred skeleton of a makeshift home remains as the only visible vestige of an intense conflict earlier this week.
Cleaned text: buenos,aire,puntarenas,government,mediator,return,san,josé,peace,slowly,return,salitre,indigenous,reserve,costa,rica,southeastern,region,charred,skeleton,makeshift,home,remains,visible,vestige,intense,conflict,earlier,week

Raw text: Director Luke Scott's Morgan is a very forgettable sci-fi thriller that fails to achieve the highs of last year's Ex-Machina.
Cleaned text: director,luke,scott,morgan,forgettable,sci-fi,thriller,fails,achieve,high,year,ex-machina

Raw text: HBO announced today that production h

In [9]:
processed_df.write.mode('overwrite').json(f'{ROOT}/combined_processed')