In [15]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
from emoji import replace_emoji
import re

In [16]:
# Build (or reuse) a local Spark session: master "local[*]", 4G of driver
# memory, and Kryo as the serializer.
spark = (
    SparkSession.builder
    .appName("Twitter Sentiment Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "4G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

In [17]:
# Load the train/test splits, treating the first line of each CSV as the header.
# No schema is supplied or inferred here.
train_df = spark.read.csv("./dataset/train.csv", header=True)
test_df = spark.read.csv("./dataset/test.csv", header=True)

In [19]:
# Expose the training frame to Spark SQL under the name "train_data",
# then print the default preview of its rows.
train_df.createOrReplaceTempView("train_data")
train_preview = spark.sql("SELECT * FROM train_data")
train_preview.show()

+------+----------+--------------------+--------+---------------+--------------------+
|target|       ids|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|     0|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|     0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|     0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|     0|1467811795|Mon Apr 06 22:20:...|NO_

In [None]:
def clean_text(text):
    """Normalize a raw tweet into lowercase text without emoji, URLs or punctuation.

    Args:
        text: The raw tweet as a string, or None for a missing value.

    Returns:
        The cleaned string: emoji removed, URLs removed, every character that
        is neither a word character nor whitespace removed, lowercased and
        stripped of leading/trailing whitespace. Returns "" when ``text`` is None.
    """
    if text is None:
        return ""

    text = replace_emoji(text, '')
    # The dot after "www" is escaped so only real "www." prefixes count as URLs;
    # unescaped, it matched any character and deleted words such as "wwwhat".
    text = re.sub(r"http\S+|www\.\S+", "", text)
    # Drop punctuation/symbols; \w keeps letters, digits and underscores.
    text = re.sub(r"[^\w\s]", "", text)

    return text.lower().strip()

# Wrap the Python cleaner as a Spark UDF that returns strings.
clean_udf = udf(clean_text, returnType=StringType())

# Add a "clean_text" column, derived from "text", to both splits.
train_df, test_df = [
    split.withColumn("clean_text", clean_udf(col("text")))
    for split in (train_df, test_df)
]

In [5]:
# Stage 1: split "clean_text" into tokens, using runs of non-word characters
# as the gaps between tokens.
tokenizer = RegexTokenizer(
    inputCol="clean_text",
    outputCol="tokens",
    pattern="\\W+",
    gaps=True,
)

# Stage 2: remove stop words from the token list (default stop-word list).
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_text")

# Stage 3: hash the remaining tokens into a 10000-dimensional term-frequency vector.
# NOTE(review): Spark's docs advise a power of two for numFeatures; 10000 is
# kept as-is so the produced features do not change.
hashingTF = HashingTF(
    inputCol="filtered_text",
    outputCol="tf_features",
    numFeatures=10000,
)

# Stage 4: rescale the term frequencies by inverse document frequency.
idf = IDF(inputCol="tf_features", outputCol="tfidf_features")

In [6]:
# Chain the four feature stages and fit them on the training split.
preprocessing_pipeline = Pipeline().setStages([tokenizer, remover, hashingTF, idf])
model = preprocessing_pipeline.fit(train_df)

# Persist the fitted pipeline, replacing any previously saved copy.
pipeline_writer = model.write().overwrite()
pipeline_writer.save("./preprocessing_pipeline")

                                                                                

In [7]:
# Apply the fitted preprocessing pipeline to both splits.
train_transformed = model.transform(train_df)
test_transformed = model.transform(test_df)

# Columns to persist. The original two come first so their order is unchanged;
# the "target" label is appended so the saved features can be used to train and
# evaluate a classifier. Previously the label was dropped on write.
persisted_columns = ("clean_text", "tfidf_features", "target")

for transformed, output_path in (
    (train_transformed, "./output/transformed_train"),
    (test_transformed, "./output/transformed_test"),
):
    # Keep only the columns the frame has, so a split without a "target"
    # column is still written instead of failing on select().
    kept = [name for name in persisted_columns if name in transformed.columns]
    transformed.select(*kept).write.mode("overwrite").parquet(output_path)

                                                                                

In [8]:
# Stop the Spark session held in `spark`.
spark.stop()