# todo'lar icin arastirmalar

In [None]:

# Disabled experiment, kept inside a string literal so it never executes:
# builds a second TF-IDF feature set from the Tags column and feeds it to the
# logistic-regression pipeline together with the text features and the
# title/body length columns.
# The snippet uses spark_split and regexp_replace, which are not imported in
# the visible import cell -- bring them into scope before re-enabling.
"""
tags_hashing_tf = HashingTF(inputCol="tags_list", outputCol="tags_raw_features", numFeatures=5000)
tags_idf = IDF(inputCol="tags_raw_features", outputCol="tags_features")
feature_assembler = VectorAssembler(
    inputCols=["text_features", "tags_features", "title_len", "body_len"], outputCol="features")
lr_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, tags_hashing_tf, tags_idf,
feature_assembler, lr])
df_clean = df.na.drop(subset=["Title", "Body", "Tags", "Y"])
df_clean = df_clean.withColumn("tags_list", spark_split(regexp_replace(col("Tags"), "[<>]", " "), " "))
"""

# Disabled experiment, kept inside a string literal so it never executes:
# adds bigrams (NGram with n=2) computed from the filtered words and hashes
# them together with the unigrams.
# NOTE(review): HashingTF is given `inputCols` (plural) here; the PySpark
# HashingTF appears to accept only a single `inputCol` -- confirm before
# re-enabling.
"""
ngram = NGram(n=2, inputCol="filtered_words", outputCol="bigrams")
hashing_tf = HashingTF(inputCols=["filtered_words", "bigrams"], outputCol="raw_features", numFeatures=20000)
lr_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, ngram, hashing_tf, idf, feature_assembler, lr])
"""

# Disabled experiment, kept inside a string literal so it never executes:
# replaces the TF-IDF text features with Word2Vec embeddings of the filtered
# words. Per the note embedded in the snippet, vectorSize is the size of the
# per-word representation vector (a hyperparameter) and minCount is the
# minimum frequency a word needs to be included in the model.
# Word2Vec is not part of the visible import cell -- import it before
# re-enabling.
"""
word2vec = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="w2v_features")
# vector size kelime basina temsil vektoru(HP), mincount modele eklenecek kelimenin min frekansi
feature_assembler = VectorAssembler(inputCols=["w2v_features", "title_len", "body_len"],outputCol="features")
lr_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, word2vec, feature_assembler, lr])
"""

# Disabled experiment, kept inside a string literal so it never executes:
# hyperparameter search for the random-forest pipeline using PySpark's
# CrossValidator and ParamGridBuilder. The grid covers numTrees in
# [50, 100, 150] and maxDepth in [5, 10, 20], scored by F1 over 3 folds; the
# winning settings are read from the last stage of cv_model.bestModel.
# CrossValidator and ParamGridBuilder are not part of the visible import cell,
# and pipeline_for_tuning is defined but not used by the rest of the snippet.
""" 
# hp opt icin: pyspark'ta crossvalidator ve paramgridbuilder
pipeline_for_tuning = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler])
rf_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler, rf])
rf.setSeed(42); paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [50, 100, 150]).addGrid(rf.maxDepth, [5, 10, 20]).build()
crossval = CrossValidator(estimator=rf_pipeline, estimatorParamMaps=paramGrid,
evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
numFolds=3); cv_model = crossval.fit(train_data); rf_predictions = cv_model.transform(test_data);
best_model = cv_model.bestModel #en iyi parametreler:best_model.stages[-1].getNumTrees() ve getMaxDepth()
"""

1. import'lar ve udf'ler

In [None]:
from pyspark.sql import SparkSession; import re; import traceback;
from pyspark.sql.functions import udf, col, length, concat_ws 
from pyspark.sql.types import StringType, IntegerType, DoubleType, ArrayType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler, NGram
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def count_punctuation(text):
    """Count the question marks and exclamation marks in ``text``.

    A ``None`` input (null column value) counts as zero.
    """
    if text is None:
        return 0
    # Tally matches lazily instead of materialising the list of hits.
    return sum(1 for _ in re.finditer(r'[?!]', text))

def avg_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Yields ``0.0`` when ``text`` is ``None`` or contains no words.
    """
    tokens = text.split() if text is not None else []
    if len(tokens) == 0:
        return 0.0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def remove_html_tags(text):
    """Strip every ``<...>`` span from ``text``.

    ``None`` is passed through unchanged. The match is non-greedy and does
    not cross line breaks, so a ``<`` whose closing ``>`` is on a later line
    is left in place.
    """
    if text is None:
        return None
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

def clean_tags(tags):
    """Split a ``<tag1><tag2>`` style string into a list of tag names.

    Angle brackets are treated as separators; ``None`` yields an empty list.
    """
    if tags is None:
        return []
    spaced = tags
    for bracket in ('<', '>'):
        spaced = spaced.replace(bracket, ' ')
    # Argument-less split() already discards leading/trailing whitespace.
    return spaced.split()

# Wrap the plain-Python helpers as Spark UDFs, declaring each return type
# explicitly so the resulting DataFrame columns get the intended schema.
count_punct_udf = udf(count_punctuation, IntegerType())
avg_word_len_udf = udf(avg_word_length, DoubleType())
remove_html_udf = udf(remove_html_tags, StringType())
# ArrayType needs an element type (calling it with no argument raises
# TypeError); clean_tags returns a list of strings.
clean_tags_udf = udf(clean_tags, ArrayType(StringType()))

2. main

In [None]:
def _evaluate_and_report(model_name, predictions, evaluator):
    """Print accuracy, F1 and a label/prediction count table for one model.

    Args:
        model_name: Human-readable model name used in the report header.
        predictions: DataFrame with ``label`` and ``prediction`` columns.
        evaluator: MulticlassClassificationEvaluator; its metric name is
            switched between "accuracy" and "f1" by this call.

    Returns:
        An ``(accuracy, f1_score)`` tuple.
    """
    accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
    f1_score = evaluator.setMetricName("f1").evaluate(predictions)

    print(f"\n{model_name} Evaluation")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    print("Confusion Matrix:")
    predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()
    return accuracy, f1_score


spark = SparkSession.builder.appName("CPU_IMP").config("spark.driver.memory", "8g").getOrCreate()
data_path = "data/train.csv"

try:
    # The quote and escape characters are both '"', and multiLine is enabled
    # so that quoted fields may contain line breaks.
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("quote", "\"")
        .option("escape", "\"")
        .option("multiLine", "true")
        .option("inferSchema", "true")
        .load(data_path)
    )
    df.printSchema()
    df.groupBy("Y").count().show()
    # Not referenced again in this cell; kept so the raw row count stays
    # available for inspection.
    initial_count = df.count()

    # Drop rows missing any modelling column, strip HTML from the body and
    # build the single text column the tokenizer reads.
    df_clean = (
        df.na.drop(subset=["Title", "Body", "Y"])
        .withColumn("CleanBody", remove_html_udf(col("Body")))
        .withColumn("text", concat_ws(" ", col("Title"), col("CleanBody")))
    )

    # Text stages shared by both pipelines below.
    label_indexer = StringIndexer(inputCol="Y", outputCol="label", handleInvalid="skip")
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

    hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20000)
    idf = IDF(inputCol="raw_features", outputCol="text_features")

    # Hand-crafted numeric columns are added up front because the assembler
    # stage reads them by name.
    df_featured = (
        df_clean.withColumn("title_len", length(col("Title")))
        .withColumn("body_len", length(col("CleanBody")))
        .withColumn("punct_count", count_punct_udf(col("text")))
        .withColumn("avg_word_len", avg_word_len_udf(col("text")))
    )

    feature_assembler = VectorAssembler(
        inputCols=["text_features", "title_len", "body_len", "punct_count", "avg_word_len"],
        outputCol="features"
    )

    train_data, test_data = df_featured.randomSplit([0.8, 0.2], seed=42)
    # Both splits are reused by the two pipelines, so cache them once.
    train_data.cache()
    test_data.cache()

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
    lr_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler, lr])
    lr_model = lr_pipeline.fit(train_data)
    lr_predictions = lr_model.transform(test_data)
    accuracy, f1_score = _evaluate_and_report("Logistic Regression", lr_predictions, evaluator)

    # Fixed seed so the forest is reproducible, matching the seeded split.
    rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, seed=42)
    rf_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler, rf])
    rf_model = rf_pipeline.fit(train_data)
    rf_predictions = rf_model.transform(test_data)
    accuracy, f1_score = _evaluate_and_report("Random Forest", rf_predictions, evaluator)

except Exception as e:
    # Top-level boundary of the script: report the failure with its traceback.
    print(f"{e}")
    traceback.print_exc()
finally:
    # Always release the Spark session, on success, failure or interruption.
    spark.stop()