<a href="https://colab.research.google.com/github/ankesh86/PySparkNotebooks/blob/main/ClassificationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Initialise Spark**

In [1]:
!pip install pyspark==3.4.0

Collecting pyspark==3.4.0
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317122 sha256=9213b78e8126adbd8c2895d59fa817ab34d42b5eeab9654bc3237613d6d61161
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
from pyspark.sql import SparkSession

# Reuse an already-active session if there is one; otherwise start a new
# session registered under the application name below.
spark = (
    SparkSession.builder
    .appName('binary_class')
    .getOrCreate()
)

# **Load the Dataset**

In [3]:
# Read the CSV with its first row as column names and let Spark infer
# the column types from the data.
df = spark.read.csv(
    'sample_data/classification_data.csv',
    header=True,
    inferSchema=True,
)

# **Exploring Dataframe**

In [4]:
# Shape of the dataset as a (rows, columns) tuple.
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(46751, 12)


In [5]:
# Print column names, inferred types and nullability.
df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- is_first_loan: integer (nullable = true)
 |-- total_credit_card_limit: integer (nullable = true)
 |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)
 |-- saving_amount: integer (nullable = true)
 |-- checking_amount: integer (nullable = true)
 |-- is_employed: integer (nullable = true)
 |-- yearly_salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dependent_number: integer (nullable = true)
 |-- label: integer (nullable = true)



In [6]:
# Preview the first five rows.
df.show(5)

+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+-----+
|loan_id|loan_purpose|is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|saving_amount|checking_amount|is_employed|yearly_salary|age|dependent_number|label|
+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+-----+
|    A_1|    personal|            1|                   7900|                                            0.8|         1103|           6393|          1|        16400| 42|               4|    0|
|    A_2|    personal|            0|                   3300|                                           0.29|         2588|            832|          1|        75500| 56|               1|    0|
|    A_3|    personal|            0|    

In [7]:
# Summary statistics for every column.
df.summary().show()

+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|summary|loan_id|loan_purpose|     is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|     saving_amount|   checking_amount|       is_employed|     yearly_salary|               age|  dependent_number|              label|
+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  count|  46751|       46751|             46751|                  46751|                                          46751|             46751|             46751|             46751|             46751|             46751|             467

In [8]:
# Class balance: number of rows for each target label.
label_counts = df.groupBy('label').count()
label_counts.show()

+-----+-----+
|label|count|
+-----+-----+
|    1|16201|
|    0|30550|
+-----+-----+



In [9]:
# Number of rows for each loan purpose category.
loan_purpose_counts = df.groupBy('loan_purpose').count()
loan_purpose_counts.show()

+------------+-----+
|loan_purpose|count|
+------------+-----+
|      others| 6763|
|   emergency| 7562|
|    property|11388|
|  operations|10580|
|    personal|10458|
+------------+-----+



# **Data Transformation**

In [10]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# This cell reassigns `df`, so on a second run the encoded columns would already
# be present and the transformers would fail because their output columns exist.
# Drop them first; DataFrame.drop ignores names that are not in the frame, so the
# first run is unaffected.
df = df.drop('loan_index', 'loan_purpose_vec')

# Map each loan_purpose string to a numeric index (stored as a double).
loan_purpose_indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_index")
df = loan_purpose_indexer.fit(df).transform(df)

# One-hot encode the index into a sparse vector. With the encoder's defaults the
# last category is dropped, so the five loan purposes yield a length-4 vector.
loan_encoder = OneHotEncoder(inputCol="loan_index", outputCol='loan_purpose_vec')
loan_encoder_model = loan_encoder.fit(df)
df = loan_encoder_model.transform(df)

# Show the original value next to its index and encoded vector (untruncated).
df.select(['loan_purpose', 'loan_index', 'loan_purpose_vec']).show(3, False)


+------------+----------+----------------+
|loan_purpose|loan_index|loan_purpose_vec|
+------------+----------+----------------+
|personal    |2.0       |(4,[2],[1.0])   |
|personal    |2.0       |(4,[2],[1.0])   |
|personal    |2.0       |(4,[2],[1.0])   |
+------------+----------+----------------+
only showing top 3 rows



In [11]:
from pyspark.ml.feature import VectorAssembler

# Numeric predictors plus the one-hot encoded loan purpose; these are
# concatenated, in this order, into a single vector column for the models.
feature_columns = [
    'is_first_loan',
    'total_credit_card_limit',
    'avg_percentage_credit_card_limit_used_last_year',
    'saving_amount',
    'checking_amount',
    'is_employed',
    'yearly_salary',
    'age',
    'dependent_number',
    'loan_purpose_vec',
]
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# This cell reassigns `df`; drop a "features" column left over from a previous
# run so the assembler does not fail because its output column already exists.
# DataFrame.drop is a no-op when the column is absent.
df = df_assembler.transform(df.drop('features'))
df.select(['features','label']).show(10,False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,0.0,0.0,1.0,0.0] |0    |
|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,0.0,0.0,1.0,0.0] |0    |
|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,0.0,0.0,1.0,0.0] |0    |
|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,0.0,0.0,1.0,0.0]|0    |
|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,0.0,0.0,0.0,1.0] |1    |
|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,0.0,1.0,0.0,0.0]|0    |
|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,0.0,1.0,0.0,0.0]|0    |
|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,0.0,0.0,1.0,0.0]|0    |
|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,0.0,0.0,1.0,0.0]|0    |
|[0.0,2900.0,0.91,88.0,2725.0,1.0,21100.0,52.0,1.0,0.0,0.0,1.0,0.0]  |1    |

In [12]:
# Keep only the assembled feature vector and the target for modelling.
model_df = df.select('features', 'label')

# **Splitting into Train and Test Data**

In [13]:
# 75/25 train/test split. The seed is fixed so that the split, and therefore
# every metric reported below, is the same on each Restart & Run All.
training_df, test_df = model_df.randomSplit([0.75,0.25], seed=42)

# **Model Training**

In [14]:
from pyspark.ml.classification import LogisticRegression

# Fit a logistic regression with default settings on the training split.
lr_estimator = LogisticRegression()
log_reg = lr_estimator.fit(training_df)

# Training summary of the fitted model; the metric cells below read from it.
lr_summary = log_reg.summary

# Accuracy on the training data.
lr_summary.accuracy

0.8963587820949784

In [15]:
# Area under the ROC curve on the training data.
lr_summary.areaUnderROC

0.9600074413214449

In [16]:
# Training precision, one value per label.
print(lr_summary.precisionByLabel)

[0.9259194242078469, 0.8421349219127354]


In [17]:
# Training recall, one value per label.
print(lr_summary.recallByLabel)

[0.9149572834901774, 0.8610585233352539]


In [18]:
# Score the held-out test split; transform() appends the rawPrediction,
# probability and prediction columns to the input rows.
predictions = log_reg.transform(test_df)
predictions.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(13,[0,1,2,3,4,7]...|    1|[-6.9352418574935...|[9.71942149466226...|       1.0|
|(13,[0,1,2,3,4,7]...|    1|[-4.7714837807804...|[0.00839670479432...|       1.0|
|(13,[0,1,2,3,4,7]...|    1|[-1.5057048148822...|[0.18157621191176...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-4.5246513737078...|[0.01072227902198...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-6.0530753961897...|[0.00234510932058...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-7.0340939226962...|[8.80540309436932...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-4.1066928289795...|[0.01619551503455...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-4.3377103858746...|[0.01289788199217...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-5.1365342657394...|[0.00584367684403...|       1.0|
|(13,[0,1,2,3,4,

# **Summary**
* A logistic regression model is trained on the training split (`training_df`), which holds roughly 75% of the data.
* The summary of the trained model (`lr_summary`) is then used to output the accuracy of the model on the training data, which in this case is approximately 89.64%.
* The area under the ROC (Receiver Operating Characteristic) curve (`areaUnderROC`) is accessed from the summary, giving a value of approximately 0.960, indicating very good discriminative ability.
* Precision by label and recall by label are printed, showing precision and recall for each class label (which in binary classification are typically '0' and '1').

In [19]:
# evaluate() scores the model on the test split and returns a summary object
# exposing accuracy, areaUnderROC, recallByLabel and precisionByLabel.
# (The earlier transform() result assigned to this name was overwritten
# immediately and never used, so that assignment has been removed.)
model_prediction = log_reg.evaluate(test_df)
model_prediction.accuracy

0.8879840596032227

In [20]:
# Area under the ROC curve on the test data.
model_prediction.areaUnderROC

0.9554712117192821

In [21]:
# Test recall, one value per label.
print(model_prediction.recallByLabel)

[0.9100253637698572, 0.8472359328726555]


In [22]:
# Test precision, one value per label.
print(model_prediction.precisionByLabel)

[0.9167563206024745, 0.835889943998052]


# **Summary**
* The logistic regression model is then applied to a test dataset (test_df) to make predictions, which are stored in predictions.
* The model's `evaluate` method is called on the test data and returns a summary object (`model_prediction`); the accuracy on the test data is around 88.80%.
* The test-set evaluation also provides the area under the ROC curve, which is around 0.955, close to the training performance.
* Finally, recall and precision by label are printed for the test data, showing how well the model performs for each class label.

# **Hyper-parameter Tuning**

## **Random Forest Classifier**

In [23]:
from pyspark.ml.classification import RandomForestClassifier

In [24]:
# Baseline random forest with default hyper-parameters, fitted on the training
# split and applied to the test split.
# NOTE: `rf` and `model_predictions` are reassigned by later cells (the
# cross-validation cell and the best-model prediction cell).
rf = RandomForestClassifier()
rf_model = rf.fit(training_df)
model_predictions = rf_model.transform(test_df)


In [25]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Candidate models are ranked with the evaluator's default metric.
evaluator = BinaryClassificationEvaluator()

# Seed the forest so that tree bootstrapping and feature sub-sampling are
# reproducible from run to run.
rf = RandomForestClassifier(seed=42)

# 5 x 3 x 3 = 45 hyper-parameter combinations.
paramGrid = (ParamGridBuilder()
            .addGrid(rf.maxDepth, [5,10,20,25,30])
            .addGrid(rf.maxBins, [20,30,40])
            .addGrid(rf.numTrees, [5,20,50])
            .build())

# 2-fold cross-validation over the grid (45 combinations x 2 folds = 90 fits,
# plus a final refit of the best combination). The seed fixes the fold
# assignment so the selected model does not change between runs.
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator,
                    numFolds=2, seed=42)
cv_model = cv.fit(training_df)

In [26]:
# Model fitted with the best-scoring hyper-parameter combination from the grid search.
best_rf_model = cv_model.bestModel

In [27]:
# Generate predictions on the held-out test split (not the entire dataset)
# using the best cross-validated random forest.
model_predictions = best_rf_model.transform(test_df)

In [28]:
# Row filters for the positive class (label 1) on the test predictions.
is_actual_pos = model_predictions['label'] == 1
is_pred_pos = model_predictions['prediction'] == 1

# Counts needed for recall and precision of the positive class.
true_pos = model_predictions.filter(is_actual_pos & is_pred_pos).count()
actual_pos = model_predictions.filter(is_actual_pos).count()
pred_pos = model_predictions.filter(is_pred_pos).count()

In [29]:
# Recall on test data: true positives / all actual positives.
true_pos / actual_pos

0.9072063178677197

In [30]:
# Precision on test data: true positives / all predicted positives.
true_pos / pred_pos

0.8452517820188549