In [0]:
test_df=spark.read.load("/Volumes/luffy/phase2/silver/test_df/")
train_df=spark.read.load("/Volumes/luffy/phase2/silver/train_df/")

# Logistic Regression

In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    featuresCol="features",
    labelCol="is_high_valued",
    weightCol="class_weight"
)

from pyspark.ml.tuning import ParamGridBuilder

paramGrid_lr = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build())

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

reg_params = [0.0, 0.01, 0.1]
elastic_net_params = [0.0, 0.5, 1.0]

best_model = None
best_score = float('-inf')
best_params = None

for reg in reg_params:
    for enet in elastic_net_params:
        lr = LogisticRegression(featuresCol="features", labelCol="is_high_valued",
                               regParam=reg, elasticNetParam=enet)
        model = lr.fit(train_df)
        predictions = model.transform(test_df)
        evaluator = BinaryClassificationEvaluator(labelCol="is_high_valued", metricName="areaUnderROC")
        score = evaluator.evaluate(predictions)
        if score > best_score:
            best_score = score
            best_model = model
            best_params = {'regParam': reg, 'elasticNetParam': enet}

print(f"Best params: {best_params}, Best ROC AUC: {best_score}")


Best params: {'regParam': 0.0, 'elasticNetParam': 0.0}, Best ROC AUC: 1.0


#RandomForest

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

num_trees = [10, 30, 60]
max_depths = [5, 10, 20]

best_rf_model = None
best_rf_score = float('-inf')
best_rf_params = None

for ntrees in num_trees:
    for depth in max_depths:
        rf = RandomForestClassifier(
            featuresCol="features",
            labelCol="is_high_valued",
            numTrees=ntrees,
            maxDepth=depth
        )
        rf_model = rf.fit(train_df)
        rf_predictions = rf_model.transform(test_df)
        evaluator = BinaryClassificationEvaluator(labelCol="is_high_valued", metricName="areaUnderROC")
        rf_score = evaluator.evaluate(rf_predictions)
        if rf_score > best_rf_score:
            best_rf_score = rf_score
            best_rf_model = rf_model
            best_rf_params = {'numTrees': ntrees, 'maxDepth': depth}

print(f"Best params: {best_rf_params}, Best ROC AUC: {best_rf_score}")

Best params: {'numTrees': 30, 'maxDepth': 5}, Best ROC AUC: 0.9997337375061446
