In [None]:
!pip install nltk



In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Fake New Identifier").getOrCreate()

In [87]:
df=spark.read.csv('/content/sample_data/test.csv', header=True)
df=df.select("id","statement")
df=df.dropna()
df.show(1, truncate=False)


+-----+----------------------------------------------------------------------------------------------------------------------------------------------+
|id   |statement                                                                                                                                     |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------+
|21750|"Three doctors from the same hospital 'die suddenly' in the same week,"" after the hospital mandated a fourth COVID-19 vaccine for employees."|
+-----+----------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

**Tokenization**

In [88]:
from nltk.tokenize import sent_tokenize
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def tokenize_sentences(text):
  try:
    return sent_tokenize(text)
  except:
    return []

udf_tokenize_sentences = udf(tokenize_sentences, ArrayType(StringType()))
df = df.withColumn("sentence",udf_tokenize_sentences(F.col("statement")))
df.select('statement','sentence').show(1, truncate=False)


+----------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|statement                                                                                                                                     |sentence                                                                                                                                        |
+----------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|"Three doctors from the same hospital 'die suddenly' in the same week,"" after the hospital mandated a fourth COVID-19 vaccine fo

**Removing Stop words**

In [89]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  try:
    for i in range(len(text)):
      words=nltk.word_tokenize(text[i])
      words_without_stopword=[word for word in words if word.lower() not in stop_words]
      text[i]=''.join(words_without_stopword)
    return words_without_stopword
  except:
    return []

udf_remove_stopwords = udf(remove_stopwords, ArrayType(StringType()))

df = df.withColumn("filtered_sentence",udf_remove_stopwords(F.col("sentence")))

df.select("sentence","filtered_sentence").show(1, truncate=False)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|sentence                                                                                                                                        |filtered_sentence                                                                                                                  |
+------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|["Three doctors from the same hospital 'die suddenly' in the same week,"" after the hospital mandated a fourth COVID-19 vaccine for employees."]|[``, Three, docto

**Stemming**

In [90]:
from typing_extensions import Text
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

def stemming(text):
  try:
    for words in range(len(text)):
      word=stemmer.stem(text[words])
      text[words]=''.join(word)
    return text
  except:
    return []

udf_stemming = udf(stemming, ArrayType(StringType()))

df = df.withColumn("stemmed_sentence",udf_stemming(F.col("filtered_sentence")))
df.select("id","statement","stemmed_sentence").show(1, truncate=False)





+-----+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
|id   |statement                                                                                                                                     |stemmed_sentence                                                                                                      |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
|21750|"Three doctors from the same hospital 'die suddenly' in the same week,"" after the hospital mandated a fourth COVID-19 vaccine for employees."|[``, three, doctor, hospit, die, sudden,