<a href="https://colab.research.google.com/github/amien1410/colab-notebooks/blob/main/Colab_Pyspark_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Download the dataset
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
!unzip -q "/content/bank+marketing.zip"
!unzip -q "/content/bank.zip"

--2025-06-27 13:51:47--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: â€˜bank+marketing.zipâ€™

bank+marketing.zip      [  <=>               ] 999.85K  2.52MB/s    in 0.4s    

2025-06-27 13:51:47 (2.52 MB/s) - â€˜bank+marketing.zipâ€™ saved [1023843]



In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Start Spark session
spark = SparkSession.builder.appName("BankRandomForestRegression").getOrCreate()

# 2. Load dataset
filename = "bank-full.csv"
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
data.printSchema()

# 3. Define columns based on your schema
numeric_cols = ['age', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
target_col = 'balance'

# 4. Index + OneHotEncode categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid='keep') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_index", outputCol=f"{col}_encoded") for col in categorical_cols]

# 5. Assemble features
assembler_inputs = numeric_cols + [f"{col}_encoded" for col in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# 6. Build full pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler])

# 7. Transform dataset
pipeline_model = pipeline.fit(data)
processed_data = pipeline_model.transform(data).select(target_col, "features")

# 8. Train/test split
train_data, test_data = processed_data.randomSplit([0.8, 0.2], seed=42)

# 9. Train Random Forest Regressor
rf = RandomForestRegressor(featuresCol="features", labelCol=target_col, numTrees=100, maxDepth=10)
rf_model = rf.fit(train_data)

# 10. Predict
predictions = rf_model.transform(test_data)

# 11. Evaluate performance
evaluator_rmse = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print("\nðŸŒ² Random Forest Performance:")
print(f"âœ… RMSE: {rmse:.2f}")
print(f"âœ… RÂ²: {r2:.4f}")

# 12. Show predictions
predictions.select(target_col, "prediction").show(5)

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)


ðŸŒ² Random Forest Performance:
âœ… RMSE: 2812.73
âœ… RÂ²: 0.0396
+-------+------------------+
|balance|        prediction|
+-------+------------------+
|  -4057| 2267.919843746184|
|  -2827|-218.8666096320837|
|  -2604|1232.9160974044155|
|  -2049|1660.6567709733022|
|  -1884|1273.2068800267814|
+-------+------------------+
only showing to

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# 1. Start Spark session
spark = SparkSession.builder.appName("BankYClassifier").getOrCreate()

# 2. Load dataset
filename = "bank-full.csv"
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')

# 3. Define columns
numeric_cols = ['age', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
label_col = 'y'

# 4. Index label column
label_indexer = StringIndexer(inputCol=label_col, outputCol="label", handleInvalid='keep')

# 5. Index and encode categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid='keep') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_index", outputCol=f"{col}_encoded") for col in categorical_cols]

# 6. Assemble all features
assembler_inputs = numeric_cols + [f"{col}_encoded" for col in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# 7. Build the pipeline
pipeline = Pipeline(stages=[label_indexer] + indexers + encoders + [assembler])

# 8. Transform the data
pipeline_model = pipeline.fit(data)
processed_data = pipeline_model.transform(data).select("label", "features")

# 9. Split into train/test
train_data, test_data = processed_data.randomSplit([0.8, 0.2], seed=42)

# 10. Train Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, maxDepth=10)
rf_model = rf.fit(train_data)

# 11. Make predictions
predictions = rf_model.transform(test_data)

# 12. Evaluate model
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

accuracy = evaluator_accuracy.evaluate(predictions)
f1 = evaluator_f1.evaluate(predictions)

print(f"\nðŸŒ² Random Forest Classification Performance:")
print(f"âœ… Accuracy: {accuracy:.4f}")
print(f"âœ… F1 Score: {f1:.4f}")

# 13. Show sample predictions
predictions.select("label", "prediction", "probability").show(5)


ðŸŒ² Random Forest Classification Performance:
âœ… Accuracy: 0.9000
âœ… F1 Score: 0.8760
+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.96666177920040...|
|  0.0|       0.0|[0.92786810145475...|
|  0.0|       0.0|[0.96686278228263...|
|  0.0|       0.0|[0.96709225131151...|
|  0.0|       0.0|[0.96794829091001...|
+-----+----------+--------------------+
only showing top 5 rows

