In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
from emoji import replace_emoji
import re

In [2]:
def clean_text(text):
    """Normalise a raw tweet string for tokenisation.

    Steps, in order:
      1. ``None`` becomes the empty string.
      2. Emoji are removed via ``replace_emoji``.
      3. URLs are removed: anything starting with ``http`` or ``www.`` up to
         the next whitespace.
      4. Every character that is neither a word character nor whitespace is
         removed (so ``"don't"`` becomes ``"dont"``).
      5. The result is lower-cased and stripped of surrounding whitespace.

    Args:
        text: The raw text, or ``None``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    if text is None:
        return ""

    text = replace_emoji(text, '')
    # The dot after "www" is escaped so that only real "www." prefixes are
    # treated as URLs; an unescaped dot would also swallow ordinary tokens
    # that merely begin with "www" (e.g. "wwwhatever").
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)

    return text.lower().strip()

# Wrap clean_text as a Spark UDF that returns a string column.
clean_udf = udf(f=clean_text, returnType=StringType())

In [3]:
# Create (or reuse) a local Spark session that uses every available core,
# a 4G driver memory setting and Kryo serialisation.
spark = (
    SparkSession.builder
    .appName("Twitter Sentiment Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "4G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

25/05/04 21:46:14 WARN Utils: Your hostname, Arashs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.191 instead (on interface en0)
25/05/04 21:46:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 21:46:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read each CSV split (first row is the header) and add a "clean_text" column
# produced by running clean_udf over the "text" column.
train_df = (
    spark.read
    .csv("./dataset/train.csv", header=True)
    .withColumn("clean_text", clean_udf(col("text")))
)
test_df = (
    spark.read
    .csv("./dataset/test.csv", header=True)
    .withColumn("clean_text", clean_udf(col("text")))
)

In [5]:
# Split the cleaned text on runs of non-word characters (the pattern marks the
# gaps between tokens rather than the tokens themselves).
tokenizer = RegexTokenizer(
    inputCol="clean_text",
    outputCol="tokens",
    pattern="\\W+",
    gaps=True,
)

# Drop stop words from the token list (default stop-word settings).
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_text")

# Hash the remaining tokens into a fixed-size term-frequency vector.
# NOTE(review): Spark's HashingTF docs advise a power of two for numFeatures;
# 10000 is kept here so the output vector size is unchanged.
hashingTF = HashingTF(
    inputCol="filtered_text",
    outputCol="tf_features",
    numFeatures=10000,
)

# Rescale term frequencies by inverse document frequency.
idf = IDF(inputCol="tf_features", outputCol="tfidf_features")

In [6]:
# Chain the four stages, fit them on the training split, and persist the
# fitted pipeline (replacing any previous save at the same path).
preprocessing_pipeline = Pipeline().setStages([tokenizer, remover, hashingTF, idf])
model = preprocessing_pipeline.fit(train_df)

model_writer = model.write().overwrite()
model_writer.save("./preprocessing_pipeline")

                                                                                

In [7]:
# Apply the fitted pipeline to both splits.
train_transformed = model.transform(train_df)
test_transformed = model.transform(test_df)

# Persist the cleaned text and its TF-IDF vector as Parquet, replacing any
# earlier output.
# NOTE(review): only these two columns are written; any other CSV column
# (e.g. a label) is not persisted. Confirm that is intended.
train_features = train_transformed.select("clean_text", "tfidf_features")
train_features.write.parquet("./output/transformed_train", mode="overwrite")

test_features = test_transformed.select("clean_text", "tfidf_features")
test_features.write.parquet("./output/transformed_test", mode="overwrite")

                                                                                

In [8]:
# Shut down the Spark session now that the pipeline and outputs are written.
spark.stop()