<a href="https://colab.research.google.com/github/amien1410/colab-notebooks/blob/main/Colab_Pyspark_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Install Kaggle modules and download the dataset

from google.colab import drive
drive.mount('/content/drive')

!pip install kaggle
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/kaggle'
!kaggle competitions download -c digit-recognizer
!unzip -q "/content/digit-recognizer.zip"

Mounted at /content/drive
Downloading digit-recognizer.zip to /content
  0% 0.00/15.3M [00:00<?, ?B/s]
100% 15.3M/15.3M [00:00<00:00, 433MB/s]


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# 1. Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("MNIST_Neural_Network")
    .getOrCreate()
)

# 2. Read the training CSV: the first row is the header, column types are inferred
train_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/train.csv")
)

# 3. Pack the 784 pixel columns (pixel0 .. pixel783) into one vector column
pixel_cols = ["pixel{}".format(idx) for idx in range(784)]
assembler = VectorAssembler(
    inputCols=pixel_cols,
    outputCol="features_raw",
)
train_df = assembler.transform(train_df)

# 4. Min-max scale the raw vectors. The fitted scaler_model is kept so the same
#    scaling can be applied to other data later.
scaler = MinMaxScaler(
    inputCol="features_raw",
    outputCol="features",
)
scaler_model = scaler.fit(train_df)
scaled_train_df = (
    scaler_model
    .transform(train_df)
    .select("label", "features")
)

# 5. Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability)
train_data, test_data = scaled_train_df.randomSplit(
    [0.8, 0.2],
    seed=42,
)

# 6. Network topology: 784 inputs, hidden layers of 128 and 64 units, 10 output classes
layers = [784, 128, 64, 10]

# 7. Configure the multilayer perceptron and fit it on the training split
mlp = MultilayerPerceptronClassifier(
    labelCol="label",
    featuresCol="features",
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=123,
)
mlp_model = mlp.fit(train_data)

# 8. Score the held-out split and report plain accuracy
predictions = mlp_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"✅ Test Accuracy: {accuracy:.4f}")

# 9. Confusion matrix
#    MulticlassMetrics is given an RDD of (prediction, label) pairs, both as floats.
pred_rdd = (
    predictions
    .select("prediction", "label")
    .rdd
    .map(lambda row: (float(row["prediction"]), float(row["label"])))
)
metrics = MulticlassMetrics(pred_rdd)
print("\n📊 Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

# 10. Persist the fitted model, replacing any copy saved by a previous run
model_writer = mlp_model.write().overwrite()
model_writer.save("mlp_mnist_model")
print("💾 Model saved as: mlp_mnist_model")

# 11. Load test data
test_df = spark.read.csv("/content/test.csv", header=True, inferSchema=True)

# 12. Assemble and scale test features, reusing the assembler and the scaler
#     model that were fitted on the training data
test_df = assembler.transform(test_df)
test_scaled = scaler_model.transform(test_df).select("features")

# 13. Load model and predict test labels
loaded_model = MultilayerPerceptronClassificationModel.load("mlp_mnist_model")
test_predictions = loaded_model.transform(test_scaled).select("prediction")

# 14. Add ImageId for submission
#     A Column object has no `.rdd`, and withColumn() only accepts a Column, so
#     the row number cannot be attached through col("prediction"). Instead,
#     number the rows of the DataFrame's own RDD with zipWithIndex() (0-based,
#     hence the +1) and rebuild a two-column DataFrame from the pairs.
#     NOTE(review): this assumes the rows are still in test.csv order here;
#     zipWithIndex numbers rows by partition order - confirm.
test_predictions = (
    test_predictions.rdd
    .zipWithIndex()
    .map(lambda pair: (float(pair[0]["prediction"]), pair[1] + 1))
    .toDF(["prediction", "ImageId"])
)
submission = test_predictions.selectExpr("ImageId", "int(prediction) as Label")

# 15. Export submission file
#     coalesce(1) makes Spark write a single part file inside ./submission/.
submission.coalesce(1).write.csv("submission", header=True, mode="overwrite")
print("📤 Submission saved to ./submission/")
