In [1]:
import gensim
import string
import nltk
import sparknlp
import numpy as np

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize

# Fetch the WordNet corpus (used by WordNetLemmatizer in the cleaning cell) and
# the Open Multilingual WordNet data that presumably accompanies it.
# NOTE(review): word_tokenize is also used below and presumably needs NLTK's
# punkt tokenizer data, which is not downloaded here - confirm it is installed
# on the driver and on every Spark worker.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /common/home/ac1771/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /common/home/ac1771/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Location of the combined raw JSON dataset on the shared drive.
RAW_JSON_PATH = '/common/users/shared/cs543_fall22_group3/combined/combined_raw'

# `spark` is not created in this notebook; it is expected to be supplied by the kernel.
df_raw = spark.read.json(RAW_JSON_PATH)

In [3]:
# Keep only rows that actually have text to clean, and discard the
# publish_date column.
df_raw = (
    df_raw
    .dropna(subset=["selected_text"])
    .drop("publish_date")
)

In [4]:
def lemmatize_stemming(text):
    """Return the WordNet lemma of a single word, treating it as a verb.

    Despite the name, no stemming is applied: the word is only passed
    through ``WordNetLemmatizer.lemmatize`` with ``pos='v'``.

    Args:
        text: The word to lemmatize.

    Returns:
        The lemma produced by the WordNet lemmatizer.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return verb_lemma

# Remove punctuation, stop words, and lower case the letters
def remove_stopwords(line):
    """Tokenize ``line`` and return its cleaned tokens as a comma-joined string.

    Each token produced by ``word_tokenize`` is lower-cased and then dropped if
    it is two characters or shorter, is a stop word, or consists solely of
    punctuation characters. Surviving tokens are lemmatized with
    ``lemmatize_stemming``.

    Args:
        line: Raw text to clean. ``None`` is passed through unchanged so a
            null column value yields a null result instead of an exception.

    Returns:
        The cleaned tokens joined with ``","`` (an empty string when nothing
        survives), or ``None`` when ``line`` is ``None``.
    """
    if line is None:
        return None

    cleaned_tokens = []
    for token in word_tokenize(line):
        if len(token) <= 2:
            continue
        lowered = token.lower()
        # Compare the lower-cased form: the stop-word list is lower-case, so
        # testing the raw token let capitalised stop words ("The", "And") through.
        if lowered in STOPWORDS:
            continue
        # Check character by character: `token in string.punctuation` is a
        # substring test and does not match multi-character tokens such as "...".
        if all(ch in string.punctuation for ch in lowered):
            continue
        cleaned_tokens.append(lemmatize_stemming(lowered))
    return ",".join(cleaned_tokens)

# Expose the cleaner as a Spark UDF (default string return type) and use it
# to derive cleaned_text from selected_text.
remove_stopwords_udf = F.udf(remove_stopwords)
selected_text_col = F.col("selected_text")
processed_df = df_raw.withColumn("cleaned_text", remove_stopwords_udf(selected_text_col))

In [5]:
# Sanity check: cleaned_text should appear as a string column next to selected_text.
processed_df.printSchema()

root
 |-- selected_text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)



In [6]:
# Destination for the cleaned dataset on the shared drive.
PROCESSED_JSON_PATH = '/common/users/shared/cs543_fall22_group3/combined/combined_processed'

# Persist the cleaned frame as JSON, replacing any earlier export at that path.
processed_df.write.json(PROCESSED_JSON_PATH, mode="Overwrite")

In [2]:
# Reload the exported dataset from disk and peek at its last 10 rows.
# NOTE(review): the execution count restarts at this cell, so it was run in a
# fresh kernel; it rebinds processed_df and relies on `spark` already existing.
# NOTE(review): the recorded output shows cleaned_text as a list of tokens,
# whereas remove_stopwords in the cleaning cell returns a comma-joined string -
# the files on disk were presumably written by a different version of the
# cleaning code; confirm before relying on the column type.
processed_df = spark.read.json('/common/users/shared/cs543_fall22_group3/combined/combined_processed')
processed_df.tail(10)

[Row(cleaned_text=['contract', 'archaeology', 'deep', 'slump'], selected_text='UK Contract Archaeology in Deep Slump'),
 Row(cleaned_text=['moscow', 'probably', 'interruptions', 'latvia', 'mobile', 'communications', 'network', 'russia', 'war', 'game', 'month', 'apparent', 'test', 'cyber', 'attack', 'tool', 'baltic', 'nato', 'officials', 'say', 'base', 'early', 'intelligence', 'drill'], selected_text="Moscow was probably behind interruptions in Latvia's mobile communications network before Russia's war games last month, in an apparent test of its cyber attack tools, Baltic and NATO officials said, based on early intelligence of the drills."),
 Row(cleaned_text=['energy', 'network', 'infrastructure', 'investment', 'prison', 'plan', 'sell', 'townsville', 'port', 'new', 'announcements', 'north', 'queensland', '2014', '2015', 'state', 'budget'], selected_text='Energy network infrastructure, investment in the prison and plans to sell the Townsville Port were new announcements for north Queen