In [1]:
import gensim
import string
import nltk
import sparknlp
import numpy as np

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /common/home/ac1771/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /common/home/ac1771/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df_raw = spark.read.json('/common/users/shared/cs543_fall22_group3/combined/combined_raw')

In [3]:
df_raw = df_raw.na.drop(subset=["selected_text"]).drop("publish_date")

In [4]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

# Remove punctuation, stop words, and lower case the letters
def remove_stopwords(line):
    tokens = word_tokenize(line)
    remove_stopwords = [lemmatize_stemming(t.lower()) for t in tokens if not t in STOPWORDS and not t in string.punctuation and len(t) > 2]
    return ",".join(remove_stopwords)

remove_stopwords_udf = F.udf(lambda z: remove_stopwords(z))
processed_df = df_raw.withColumn("cleaned_text", remove_stopwords_udf(F.col("selected_text")))

In [5]:
processed_df.printSchema()

root
 |-- selected_text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)



In [6]:
processed_df.write.mode("Overwrite").json('/common/users/shared/cs543_fall22_group3/combined/combined_processed')

In [12]:
for i in range(1000, 1005):
    print('Raw text: {}'.format(processed_df.take(2000)[i].selected_text))
    print('Cleaned text: {}'.format(processed_df.take(2000)[i].cleaned_text))
    print()

Raw text: House approves bill to ensure 9/11 victims fund never runs out
Cleaned text: house,approve,ensure,9/11,victims,fund,run

Raw text: Ben & Jerry's created a deep-fried Phish and Chips ice cream
Cleaned text: ben,jerry,create,deep-fry,phish,chip,ice,cream

Raw text: "" Simmons added. ""It's definitely sparking a lot of curiosities!""It was first reported in 2009 that the ice cream giant was considering a fish and chips flavor
Cleaned text: simmons,add,definitely,spark,lot,curiosities,report,2009,ice,cream,giant,consider,fish,chip,flavor

Raw text:  sweet and salty â€“ reels in a treat that will have ice cream lovers hooked!""This isn't the first time Ben & Jerry's has tested out-of-the-box treats in London. Last year the company launched ice cream bagel sandwiches
Cleaned text: sweet,salty,reel,treat,ice,cream,lovers,hook,this,n't,time,ben,jerry,test,out-of-the-box,treat,london,last,year,company,launch,ice,cream,bagel,sandwich

Raw text: Nike keeps plans for Arizona factory desp