In [None]:
!pip install nltk



In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NLP_Data_Cleanup").getOrCreate()

In [16]:
df=spark.read.csv('/content/sample_data/test.csv', header=True)
df=df.select("id","statement","speaker")
df=df.dropna()
df.show(10)


+--------------------+--------------------+------------------+
|                  id|           statement|           speaker|
+--------------------+--------------------+------------------+
|               21750|"Three doctors fr...|the gateway pundit|
|               18173|Say Joe Biden is ...|    facebook posts|
|                   "|                  24|               245|
|               22673|A photo shows Pre...|       viral image|
|                9897|It will cost $50,...|     stuart varney|
|                3855|The Federal Regis...|      randy forbes|
|Forbes previously...|                NULL|              NULL|
|Forbes received a...|             in 1974|         in 1977."|
|                2884|"Following the 20...|          abortion|
|               18102|"America was the ...|    facebook posts|
+--------------------+--------------------+------------------+
only showing top 10 rows



In [22]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

**Tokenization**

In [28]:
from nltk.tokenize import sent_tokenize
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def tokenize_sentences(text):
  try:
    return sent_tokenize(text)
  except:
    return []

udf_tokenize_sentences = udf(tokenize_sentences, ArrayType(StringType()))
df = df.withColumn("sentence",udf_tokenize_sentences(F.col("statement")))
df.select('statement','sentence').show(10, truncate=False)


+-----------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|statement                                                                                                                                      |sentence                                                                                                                                         |
+-----------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|"Three doctors from the same hospital 'die suddenly' in the same week,"" after the hospital mandated a fourth COVID-19 vacc

**Removing Stop words**

In [54]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  try:
    for i in range(len(text)):
      words=nltk.word_tokenize(text[i])
      words_without_stopword=[word for word in words if word.lower() not in stop_words]
      text[i]=''.join(words_without_stopword)
    return words_without_stopword
  except:
    return []

udf_remove_stopwords = udf(remove_stopwords, ArrayType(StringType()))

df = df.withColumn("filtered_sentence",udf_remove_stopwords(F.col("sentence")))


def remove_strings(text):
  try:
    for i in text:
      if len(i)<2:
        text.remove(i)
    return text
  except:
    return []

udf_remove_strings = udf(remove_strings, ArrayType(StringType()))

df = df.withColumn("filtered_sentence_1",udf_remove_strings(F.col("filtered_sentence")))
df.select("sentence","filtered_sentence","filtered_sentence_1").show(10, truncate=False)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+-------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|sentence                                                                                                                                         |filtered_sentence                                                                                                                                           |filtered_sentence_1                                                                                                                                |
+---------------------------------------------------------------------------------------------

**Stemming**

In [53]:
from typing_extensions import Text
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

def stemming(text):
  try:
    for words in range(len(text)):
      word=stemmer.stem(text[words])
      text[words]=''.join(word)
    return text
  except:
    return []

udf_stemming = udf(stemming, ArrayType(StringType()))

df = df.withColumn("stemmed_sentence",udf_stemming(F.col("filtered_sentence_1")))
df.select("id","statement","speaker","stemmed_sentence").show(10, truncate=False)





+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------+
|id                                                                                                                                                                                                |statement                                                                                                                                      |speaker           |stemmed_sentence                                                                                                          |
+---------------------------------