In [0]:
# Load every row of the loan dataset from its catalog table through Spark SQL
# (the source is a table, not a CSV file).
loan_table = "dbacademy.labuser9258060_1739805221.loan_data"
df = spark.sql(f"SELECT * FROM {loan_table}")

In [0]:
import mlflow
import mlflow.spark
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Replace nulls with 0 before any feature engineering.
# NOTE(review): a numeric fill value presumably leaves string columns untouched -
# confirm the categorical columns below contain no nulls.
df = df.fillna(0)

# Categorical inputs: each is string-indexed and then one-hot encoded.
# NOTE(review): "loan_status" is fed in as a feature here while "person_gender"
# is the label (see label_indexer below) - confirm this is the intended target.
categorical_columns = [
    "person_education",
    "person_home_ownership",
    "loan_intent",
    "previous_loan_defaults_on_file",
    "loan_status",
]

# Numeric inputs: cast to float and handed to the assembler as-is.
numerical_columns = [
    "person_age",
    "person_income",
    "person_emp_exp",
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
    "credit_score",
]

for numeric_name in numerical_columns:
    df = df.withColumn(numeric_name, col(numeric_name).cast("float"))

# One StringIndexer / OneHotEncoder pair per categorical column.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed")
    for name in categorical_columns
]
encoders = [
    OneHotEncoder(inputCol=f"{name}_indexed", outputCol=f"{name}_encoded")
    for name in categorical_columns
]

# The feature vector is the one-hot outputs followed by the numeric columns.
encoded_feature_columns = [f"{name}_encoded" for name in categorical_columns]
feature_columns = encoded_feature_columns + numerical_columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# The target column is indexed into the numeric "label" column.
label_indexer = StringIndexer(inputCol="person_gender", outputCol="label")

# Stage order: indexers -> encoders -> assembler -> label indexer.
preprocessing_stages = [*indexers, *encoders, assembler, label_indexer]
pipeline = Pipeline(stages=preprocessing_stages)

# The pipeline is fitted on the whole DataFrame, i.e. before the train/test split.
df_preprocessed = pipeline.fit(df).transform(df)

# 80/20 split with a fixed seed.
train_df, test_df = df_preprocessed.randomSplit([0.8, 0.2], seed=42)

# Random forest with 100 trees on the assembled features.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)

# Train, log and evaluate inside a single MLflow run.
with mlflow.start_run():
    rf_model = rf.fit(train_df)

    # Only the fitted classifier is logged; the preprocessing pipeline is not.
    mlflow.spark.log_model(rf_model, "random_forest_model")

    predictions = rf_model.transform(test_df)

    # The evaluator defaults to accuracy; the other metrics are requested by
    # overriding metricName for each evaluate() call.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy"
    )
    accuracy = evaluator.evaluate(predictions)
    precision, recall, f1 = (
        evaluator.evaluate(predictions, {evaluator.metricName: metric})
        for metric in ("weightedPrecision", "weightedRecall", "f1")
    )

    test_metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    for metric_key, metric_value in test_metrics.items():
        mlflow.log_metric(metric_key, metric_value)

    display_labels = ("accuracy", "precision", "recall", "F1-score")
    for display_label, metric_value in zip(display_labels, test_metrics.values()):
        print(f"Test set {display_label}: {metric_value}")

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyperparameter grid: 3 tree counts x 3 maximum depths = 9 candidate settings.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [50, 100, 150])
             .addGrid(rf.maxDepth, [5, 10, 15])
             .build())

# 5-fold cross-validation over the grid. `rf` and `evaluator` are the objects
# created in an earlier cell, so that cell must have been run first.
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

# Run the search, log the winning model and its settings in one MLflow run.
with mlflow.start_run():
    # Perform cross-validation on the training split
    cv_model = crossval.fit(train_df)

    # Score the best model on the held-out test split. Despite its name,
    # cv_accuracy is the test-set value of the evaluator's metric.
    cv_predictions = cv_model.transform(test_df)
    cv_accuracy = evaluator.evaluate(cv_predictions)

    # Log the best model
    best_model = cv_model.bestModel
    mlflow.spark.log_model(best_model, "best_random_forest_model")

    # Log parameters and metrics.
    # getNumTrees is a property on the fitted forest model, whereas getMaxDepth
    # is a regular method and has to be called; logging the uncalled method
    # would store its bound-method repr instead of the depth.
    mlflow.log_param("numTrees", best_model.getNumTrees)
    mlflow.log_param("maxDepth", best_model.getMaxDepth())
    mlflow.log_metric("cv_accuracy", cv_accuracy)

    print(f"Cross-validated accuracy: {cv_accuracy}")

In [0]:


# Re-score `predictions` (produced by an earlier cell) on four metrics.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

# Accuracy uses the evaluator's configured metric; the remaining metrics are
# obtained by overriding metricName per evaluate() call.
accuracy = evaluator.evaluate(predictions)
precision, recall, f1 = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("weightedPrecision", "weightedRecall", "f1")
)

report = (
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("F1-score", f1),
)
for display_label, metric_value in report:
    print(f"Test set {display_label}: {metric_value}")

In [0]:
# Preview the first 20 rows, truncating long cell values (the show() defaults).
df.show(n=20, truncate=True)

In [0]:

# Replace nulls with 0 before any feature engineering.
# NOTE(review): a numeric fill value presumably leaves string columns untouched -
# confirm the categorical columns below contain no nulls.
df = df.fillna(0)

# Categorical inputs: each is string-indexed and then one-hot encoded.
categorical_columns = [
    "person_education",
    "person_home_ownership",
    "loan_intent",
    "previous_loan_defaults_on_file",
    "person_gender",
]

# Numeric inputs: cast to float and handed to the assembler as-is.
numerical_columns = [
    "person_age",
    "person_income",
    "person_emp_exp",
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
    "credit_score",
]

for numeric_name in numerical_columns:
    float_column = col(numeric_name).cast("float")
    df = df.withColumn(numeric_name, float_column)

# Build the indexer, the encoder and the encoded column name for every
# categorical column in a single pass.
indexers, encoders, encoded_feature_columns = [], [], []
for categorical_name in categorical_columns:
    indexed_name = categorical_name + "_indexed"
    encoded_name = categorical_name + "_encoded"
    indexers.append(StringIndexer(inputCol=categorical_name, outputCol=indexed_name))
    encoders.append(OneHotEncoder(inputCol=indexed_name, outputCol=encoded_name))
    encoded_feature_columns.append(encoded_name)

# The feature vector is the one-hot outputs followed by the numeric columns.
feature_columns = encoded_feature_columns + numerical_columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# "loan_status" is the target; it is indexed into the numeric "label" column.
label_indexer = StringIndexer(inputCol="loan_status", outputCol="label")

# Stage order: indexers -> encoders -> assembler -> label indexer.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler, label_indexer])

# The pipeline is fitted on the whole DataFrame, i.e. before the train/test split.
fitted_pipeline = pipeline.fit(df)
df_preprocessed = fitted_pipeline.transform(df)

# 80/20 split with a fixed seed.
train_df, test_df = df_preprocessed.randomSplit([0.8, 0.2], seed=42)

# Random forest with 100 trees on the assembled features.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)
rf_model = rf.fit(train_df)

# Score the held-out split.
predictions = rf_model.transform(test_df)

# The evaluator defaults to accuracy; the other metrics are requested by
# overriding metricName for each evaluate() call.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)
precision, recall, f1 = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("weightedPrecision", "weightedRecall", "f1")
)

for display_label, metric_value in (
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("F1-score", f1),
):
    print(f"Test set {display_label}: {metric_value}")



In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyperparameter grid: 3 tree counts x 3 maximum depths = 9 candidate settings.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.numTrees, [50, 100, 150])
grid_builder = grid_builder.addGrid(rf.maxDepth, [5, 10, 15])
paramGrid = grid_builder.build()

# 5-fold cross-validation over the grid, reusing the `rf` estimator and the
# `evaluator` created in an earlier cell.
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)

# Fit on the training split, then score the selected model on the test split.
cv_model = crossval.fit(train_df)
cv_predictions = cv_model.transform(test_df)

# The first call uses the evaluator's configured metric; the remaining metrics
# are obtained by overriding metricName per evaluate() call.
cv_accuracy = evaluator.evaluate(cv_predictions)
precision, recall, f1 = (
    evaluator.evaluate(cv_predictions, {evaluator.metricName: metric})
    for metric in ("weightedPrecision", "weightedRecall", "f1")
)

print(f"Cross-validated accuracy: {cv_accuracy}")
for display_label, metric_value in (
    ("precision", precision),
    ("recall", recall),
    ("F1-score", f1),
):
    print(f"CV set {display_label}: {metric_value}")

In [0]:


# Re-score `predictions` (produced by an earlier cell) on four metrics.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

# Accuracy comes from the evaluator's configured metric; each other metric is
# requested by overriding metricName for that evaluate() call.
accuracy = evaluator.evaluate(predictions)
overrides = {
    name: {evaluator.metricName: name}
    for name in ("weightedPrecision", "weightedRecall", "f1")
}
precision = evaluator.evaluate(predictions, overrides["weightedPrecision"])
recall = evaluator.evaluate(predictions, overrides["weightedRecall"])
f1 = evaluator.evaluate(predictions, overrides["f1"])

summary_lines = [
    f"Test set accuracy: {accuracy}",
    f"Test set precision: {precision}",
    f"Test set recall: {recall}",
    f"Test set F1-score: {f1}",
]
for summary_line in summary_lines:
    print(summary_line)