In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: Initialize Spark Session
spark = SparkSession.builder.appName("LoanPredictionMLP").getOrCreate()

# Step 2: Load Dataset
data = spark.read.csv("C:/Users/yamin/OneDrive/Desktop/BD/CreditRisk (1).csv", header=True, inferSchema=True)

# Step 3: Data Preprocessing
# Replace nulls in numerical columns with median or mean values
numerical_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]
for col_name in numerical_columns:
    median_value = data.approxQuantile(col_name, [0.5], 0.01)[0]
    data = data.fillna({col_name: median_value})

# Replace nulls in categorical columns with "Unknown"
categorical_columns = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]
for col_name in categorical_columns:
    data = data.withColumn(col_name, when(col(col_name).isNull(), "Unknown").otherwise(col(col_name)))

# Encode categorical columns using StringIndexer
for col_name in categorical_columns:
    indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed")
    data = indexer.fit(data).transform(data)

# Drop unnecessary columns
data = data.drop(*categorical_columns)

# Convert target variable to numerical
label_indexer = StringIndexer(inputCol="Loan_Status", outputCol="label")
data = label_indexer.fit(data).transform(data)
data = data.drop("Loan_Status")

# Step 4: Assemble Features
feature_cols = [col for col in data.columns if col != "label" and col != "Loan_ID"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Step 5: Train-Test Split
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Step 6: Define the Model
layers = [len(feature_cols), 10, 5, 2]  # Adjust the layers as per your requirements
mlp = MultilayerPerceptronClassifier(layers=layers, featuresCol="features", labelCol="label", maxIter=100)

# Step 7: Train the Model
mlp_model = mlp.fit(train_data)

# Step 8: Make Predictions
predictions = mlp_model.transform(test_data)

# Step 9: Evaluate the Model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")


Test Accuracy: 0.69


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Step 1: Initialize Spark Session
spark = SparkSession.builder.appName("LoanPredictionMLP").getOrCreate()

# Step 2: Load Dataset
data = spark.read.csv("C:/Users/yamin/OneDrive/Desktop/BD/CreditRisk (1).csv", header=True, inferSchema=True)

# Step 3: Data Preprocessing
# Replace nulls in numerical columns with median values
numerical_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]
for col_name in numerical_columns:
    median_value = data.approxQuantile(col_name, [0.5], 0.01)[0]
    data = data.fillna({col_name: median_value})

# Replace nulls in categorical columns with "Unknown"
categorical_columns = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]
for col_name in categorical_columns:
    data = data.withColumn(col_name, when(col(col_name).isNull(), "Unknown").otherwise(col(col_name)))

# Encode categorical columns using StringIndexer
for col_name in categorical_columns:
    indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed")
    data = indexer.fit(data).transform(data)

# Drop unnecessary columns
data = data.drop(*categorical_columns)

# Convert target variable to numerical
label_indexer = StringIndexer(inputCol="Loan_Status", outputCol="label")
data = label_indexer.fit(data).transform(data)
data = data.drop("Loan_Status")

# Step 4: Assemble Features
feature_cols = [col for col in data.columns if col != "label" and col != "Loan_ID"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Step 5: Train-Test Split
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Step 6: Define the Model
layers = [len(feature_cols), 10, 5, 2]  # Adjust the layers as per your requirements
mlp = MultilayerPerceptronClassifier(layers=layers, featuresCol="features", labelCol="label", maxIter=100)

# Step 7: Train the Model
mlp_model = mlp.fit(train_data)

# Step 8: Make Predictions
predictions = mlp_model.transform(test_data)

# Step 9: Evaluate the Model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Calculate metrics
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Test Accuracy: {accuracy:.2f}")
print(f"Test Precision: {precision:.2f}")
print(f"Test Recall: {recall:.2f}")
print(f"Test F1-Score: {f1_score:.2f}")


Test Accuracy: 0.69
Test Precision: 0.48
Test Recall: 0.69
Test F1-Score: 0.57


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: Initialize Spark Session
spark = SparkSession.builder.appName("LoanPredictionMLP").getOrCreate()

# Step 2: Load Dataset
data = spark.read.csv("C:/Users/yamin/OneDrive/Desktop/BD/CreditRisk (1).csv", header=True, inferSchema=True)

# Step 3: Data Preprocessing
# Replace nulls in numerical columns with median values
numerical_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]
for col_name in numerical_columns:
    median_value = data.approxQuantile(col_name, [0.5], 0.01)[0]
    data = data.fillna({col_name: median_value})

# Replace nulls in categorical columns with "Unknown"
categorical_columns = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]
for col_name in categorical_columns:
    data = data.withColumn(col_name, when(col(col_name).isNull(), "Unknown").otherwise(col(col_name)))

# Encode categorical columns using StringIndexer
for col_name in categorical_columns:
    indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed")
    data = indexer.fit(data).transform(data)

# Drop unnecessary columns
data = data.drop(*categorical_columns)

# Convert target variable to numerical
label_indexer = StringIndexer(inputCol="Loan_Status", outputCol="label")
data = label_indexer.fit(data).transform(data)
data = data.drop("Loan_Status")

# Step 4: Assemble Features
feature_cols = [col for col in data.columns if col != "label" and col != "Loan_ID"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Step 5: Train-Test Split
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Step 6: Define the Model
layers = [len(feature_cols), 10, 5, 2]  # Adjust the layers as per your requirements
mlp = MultilayerPerceptronClassifier(layers=layers, featuresCol="features", labelCol="label", maxIter=100)

# Step 7: Train the Model
mlp_model = mlp.fit(train_data)

# Step 8: Make Predictions
predictions = mlp_model.transform(test_data)

# Step 9: Display required columns from the predictions
predictions.select(
    'ApplicantIncome', 
    'CoapplicantIncome', 
    'Loan_Amount_Term', 
    'Credit_History', 
    'rawPrediction', 
    'prediction'
).show(10)


+---------------+-----------------+----------------+--------------+--------------------+----------+
|ApplicantIncome|CoapplicantIncome|Loan_Amount_Term|Credit_History|       rawPrediction|prediction|
+---------------+-----------------+----------------+--------------+--------------------+----------+
|           3000|              0.0|             360|             1|[0.25383782538227...|       0.0|
|           2333|           1516.0|             360|             1|[0.25383782538227...|       0.0|
|           5720|              0.0|             360|             1|[0.25383782538227...|       0.0|
|           2500|           1840.0|             360|             1|[0.25383782538227...|       0.0|
|           3596|              0.0|             240|             1|[0.25383782538227...|       0.0|
|           2600|           3500.0|             360|             1|[0.25383782538227...|       0.0|
|           3717|           2925.0|             360|             1|[0.25383782538227...|       0.0|


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Step 1: Initialize Spark Session
spark = SparkSession.builder.appName("LoanPredictionMLPImproved").getOrCreate()

# Step 2: Load Dataset
data = spark.read.csv("C:/Users/yamin/OneDrive/Desktop/BD/CreditRisk (1).csv", header=True, inferSchema=True)

# Step 3: Data Preprocessing (same as before)
# Replace nulls in numerical columns with median values
numerical_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]
for col_name in numerical_columns:
    median_value = data.approxQuantile(col_name, [0.5], 0.01)[0]
    data = data.fillna({col_name: median_value})

# Replace nulls in categorical columns with "Unknown"
categorical_columns = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]
for col_name in categorical_columns:
    data = data.withColumn(col_name, when(col(col_name).isNull(), "Unknown").otherwise(col(col_name)))

# Encode categorical columns using StringIndexer
for col_name in categorical_columns:
    indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_indexed")
    data = indexer.fit(data).transform(data)

# Drop unnecessary columns
data = data.drop(*categorical_columns)

# Convert target variable to numerical
label_indexer = StringIndexer(inputCol="Loan_Status", outputCol="label")
data = label_indexer.fit(data).transform(data)
data = data.drop("Loan_Status")

# Step 4: Assemble Features
feature_cols = [col for col in data.columns if col != "label" and col != "Loan_ID"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Step 5: Train-Test Split
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Step 6: Define the Model (with larger architecture)
layers = [len(feature_cols), 128, 64, 32, 2]  # Adding more neurons and layers for complexity
mlp = MultilayerPerceptronClassifier(layers=layers, featuresCol="features", labelCol="label", maxIter=100)

# Step 7: Hyperparameter Tuning with Cross-Validation
paramGrid = ParamGridBuilder() \
    .addGrid(mlp.maxIter, [100, 200]) \
    .addGrid(mlp.layers, [[len(feature_cols), 128, 64, 32, 2], [len(feature_cols), 256, 128, 64, 2]]) \
    .addGrid(mlp.blockSize, [128, 256]) \
    .build()

# Evaluator to check accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# CrossValidator for hyperparameter tuning
cv = CrossValidator(estimator=mlp, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

# Step 8: Train the Model
cv_model = cv.fit(train_data)

# Step 9: Make Predictions
predictions = cv_model.transform(test_data)

# Step 10: Evaluate the Model
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.2f}")

# Additional evaluation metrics (Precision, Recall)
precision_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")

precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Step 11: Display required columns from the predictions
predictions.select(
    'ApplicantIncome', 
    'CoapplicantIncome', 
    'Loan_Amount_Term', 
    'Credit_History', 
    'rawPrediction', 
    'prediction'
).show(10)


Test Accuracy: 0.72
Precision: 0.80
Recall: 0.72
+---------------+-----------------+----------------+--------------+--------------------+----------+
|ApplicantIncome|CoapplicantIncome|Loan_Amount_Term|Credit_History|       rawPrediction|prediction|
+---------------+-----------------+----------------+--------------+--------------------+----------+
|           3000|              0.0|             360|             1|[-0.0996178605546...|       0.0|
|           2333|           1516.0|             360|             1|[0.08510741834182...|       0.0|
|           5720|              0.0|             360|             1|[-0.0709797313010...|       0.0|
|           2500|           1840.0|             360|             1|[0.15254882478354...|       0.0|
|           3596|              0.0|             240|             1|[-0.0857244793366...|       0.0|
|           2600|           3500.0|             360|             1|[-0.0537997316019...|       0.0|
|           3717|           2925.0|             360