1. Lib Imports & UDF Defs

In [None]:
import re
import traceback
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, length, concat_ws
from pyspark.sql.types import StringType, IntegerType, DoubleType, ArrayType
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, 
    VectorAssembler, NGram
)
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml.feature import Word2Vec

def count_punctuation(text):
    """Return how many '?' or '!' characters occur in ``text``.

    A ``None`` input (null column value) counts as zero.
    """
    if text is None:
        return 0
    # Walk the matches lazily instead of materialising a list of them.
    return sum(1 for _ in re.finditer(r'[?!]', text))

def avg_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Yields 0.0 when ``text`` is ``None`` or contains no words at all.
    """
    tokens = [] if text is None else text.split()
    if len(tokens) == 0:
        return 0.0

    total_chars = 0
    for token in tokens:
        total_chars += len(token)
    return total_chars / len(tokens)

def remove_html_tags(text):
    """Strip ``<...>`` spans from ``text``; ``None`` is passed through as ``None``.

    The match is non-greedy and '.' does not cross newlines, so a '<' whose
    closing '>' sits on a later line is left untouched.
    """
    if text is not None:
        tag_pattern = re.compile('<.*?>')
        return tag_pattern.sub('', text)
    return None

def clean_tags(tags):
    """Split a tag string such as ``"<python><pandas>"`` into a list of tag names.

    Angle brackets are treated as separators; ``None`` yields an empty list.
    """
    if tags is None:
        return []
    # Turn both bracket characters into spaces in one pass, then split on
    # whitespace (which also discards leading/trailing blanks).
    spaced = tags.translate(str.maketrans({'<': ' ', '>': ' '}))
    return spaced.split()

# Wrap the plain-Python helpers as Spark UDFs, declaring each result type
# explicitly so the derived columns get the intended Spark SQL type.
count_punct_udf = udf(count_punctuation, returnType=IntegerType())
avg_word_len_udf = udf(avg_word_length, returnType=DoubleType())
remove_html_udf = udf(remove_html_tags, returnType=StringType())
clean_tags_udf = udf(clean_tags, returnType=ArrayType(StringType()))

2. Main

In [None]:
# Start (or reuse) the Spark session with an 8 GB driver heap.
spark = SparkSession.builder.appName("CPU_IMP_RE_W2V") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

data_path = "data/train.csv"

try:
    # --- Load -----------------------------------------------------------
    # Fields are double-quoted, embedded quotes are escaped by doubling, and
    # multiLine lets a quoted field span several physical lines.
    df = spark.read.format("csv") \
        .option("header", "true") \
        .option("quote", "\"") \
        .option("escape", "\"") \
        .option("multiLine", "true") \
        .option("inferSchema", "true") \
        .load(data_path)

    print("data schema:")
    df.printSchema()
    df.groupBy("Y").count().show()

    # --- Clean ----------------------------------------------------------
    # Drop rows missing any column used below, strip HTML from the body,
    # build one combined text column, and split the tag string into a list.
    df_clean = df.na.drop(subset=["Title", "Body", "Tags", "Y"]) \
        .withColumn("CleanBody", remove_html_udf(col("Body"))) \
        .withColumn("text", concat_ws(" ", col("Title"), col("CleanBody"))) \
        .withColumn("tags_list", clean_tags_udf(col("Tags")))

    # --- Hand-crafted numeric features ------------------------------------
    df_featured = df_clean.withColumn("title_len", length(col("Title"))) \
        .withColumn("body_len", length(col("CleanBody"))) \
        .withColumn("punct_count", count_punct_udf(col("text"))) \
        .withColumn("avg_word_len", avg_word_len_udf(col("text")))

    # --- Shared pipeline stages -------------------------------------------
    # handleInvalid="skip" drops rows whose label was not seen during fit.
    label_indexer = StringIndexer(inputCol="Y", outputCol="label", handleInvalid="skip")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    # NOTE: the "bigrams" column is produced but is not among the assembler
    # inputs below, so it currently does not influence either model.
    ngram = NGram(n=2, inputCol="filtered_words", outputCol="bigrams")

    # Disabled TF-IDF text/bigram features (Word2Vec is used instead).
    #hashing_tf_text = HashingTF(inputCol="filtered_words", outputCol="raw_text_features", numFeatures=20000)
    #idf_text = IDF(inputCol="raw_text_features", outputCol="text_features")

    #hashing_tf_bigrams = HashingTF(inputCol="bigrams", outputCol="raw_bigrams_features", numFeatures=20000)
    #idf_bigrams = IDF(inputCol="raw_bigrams_features", outputCol="bigrams_features")

    # Seeded like the split and the forest below so runs are repeatable.
    w2v = Word2Vec(vectorSize=100, minCount=5, seed=42,
                   inputCol="filtered_words", outputCol="w2v_features")

    hashing_tf_tags = HashingTF(inputCol="tags_list", outputCol="raw_tags_features", numFeatures=5000)
    idf_tags = IDF(inputCol="raw_tags_features", outputCol="tags_features")

    feature_assembler = VectorAssembler(
        inputCols=["w2v_features", "tags_features", "title_len", "body_len", "punct_count", "avg_word_len"],
        outputCol="features"
    )

    # --- Train/test split ---------------------------------------------------
    (train_data, test_data) = df_featured.randomSplit([0.8, 0.2], seed=42)
    # Both splits are reused by the two pipelines below, so cache them.
    train_data.cache()
    test_data.cache()

    # --- Logistic regression baseline ---------------------------------------
    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
    lr_pipeline = Pipeline(stages=[
        label_indexer, tokenizer, stopwords_remover, ngram,
        w2v, hashing_tf_tags, idf_tags,
        feature_assembler, lr
    ])

    lr_model = lr_pipeline.fit(train_data)
    lr_predictions = lr_model.transform(test_data)

    # One evaluator instance is reused; setMetricName switches the metric
    # immediately before each evaluate call.
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    accuracy = evaluator.setMetricName("accuracy").evaluate(lr_predictions)
    f1_score = evaluator.setMetricName("f1").evaluate(lr_predictions)

    print("\nlog reg results:")
    print(f"accuracy: {accuracy:.4f}")
    print(f"F1 score: {f1_score:.4f}")
    print("conf m:")
    lr_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

    # --- Random forest with cross-validated hyper-parameters ----------------
    rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)
    rf_pipeline = Pipeline(stages=[
        label_indexer, tokenizer, stopwords_remover, ngram,
        w2v, hashing_tf_tags, idf_tags,
        feature_assembler, rf
    ])

    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [50, 100]) \
        .addGrid(rf.maxDepth, [5, 10]) \
        .build()

    # The whole pipeline is the estimator, so every stage is refit for each
    # fold and each grid point. The seed fixes the fold assignment.
    crossval = CrossValidator(estimator=rf_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
                              numFolds=3,
                              seed=42)

    print("\nRF HP optimization:")
    cv_model = crossval.fit(train_data)
    best_rf_model = cv_model.bestModel
    rf_predictions = best_rf_model.transform(test_data)

    accuracy = evaluator.setMetricName("accuracy").evaluate(rf_predictions)
    f1_score = evaluator.setMetricName("f1").evaluate(rf_predictions)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

    # The last stage of the winning pipeline is the fitted forest. Its
    # numTrees / maxDepth attributes are Param descriptors, not values, so
    # read the actual settings through getOrDefault.
    best_params = best_rf_model.stages[-1]
    best_num_trees = best_params.getOrDefault("numTrees")
    best_max_depth = best_params.getOrDefault("maxDepth")
    print(f"best params: numTrees={best_num_trees}, maxDepth={best_max_depth}")

    print("conf m:")
    rf_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

except Exception:
    # Top-level boundary of the notebook cell: report the failure but still
    # fall through to the cleanup below.
    traceback.print_exc()
finally:
    # Always release the Spark session, whether or not training succeeded.
    spark.stop()


data schema:
root
 |-- Id: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- Y: string (nullable = true)

+--------+-----+
|       Y|count|
+--------+-----+
|LQ_CLOSE|15000|
|      HQ|15000|
| LQ_EDIT|15000|
+--------+-----+


log reg results:
accuracy: 0.6659
F1 score: 0.6669
conf m:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0| 1825|
|  0.0|       1.0|  785|
|  0.0|       2.0|  367|
|  1.0|       0.0|  887|
|  1.0|       1.0| 1832|
|  1.0|       2.0|  261|
|  2.0|       0.0|  447|
|  2.0|       1.0|  261|
|  2.0|       2.0| 2338|
+-----+----------+-----+


RF HP optimization:


# todo  word2vec = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="w2v_features")