<a href="https://colab.research.google.com/github/ankesh86/PySparkNotebooks/blob/main/ClassificationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Initialise Spark**

In [1]:
!pip install pyspark==3.4.0

Collecting pyspark==3.4.0
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317122 sha256=4875fd0591714755c696830468bb2ca56653af0356383e6dce78f78d26dd3535
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session
# (getOrCreate returns an already-running session when one exists).
session_builder = SparkSession.builder.appName('binary_class')
spark = session_builder.getOrCreate()

# **Load the Dataset**

In [18]:
# Read the CSV with a header row and let Spark infer each column's type.
csv_path = 'sample_data/classification_data.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# **Exploring the DataFrame**

In [5]:
# Report the dataset shape as a (rows, columns) tuple.
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(46751, 12)


In [6]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- is_first_loan: integer (nullable = true)
 |-- total_credit_card_limit: integer (nullable = true)
 |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)
 |-- saving_amount: integer (nullable = true)
 |-- checking_amount: integer (nullable = true)
 |-- is_employed: integer (nullable = true)
 |-- yearly_salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dependent_number: integer (nullable = true)
 |-- label: integer (nullable = true)



In [12]:
# Preview the first five rows.
df.show(n=5)

+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+-----+
|loan_id|loan_purpose|is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|saving_amount|checking_amount|is_employed|yearly_salary|age|dependent_number|label|
+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+-----+
|    A_1|    personal|            1|                   7900|                                            0.8|         1103|           6393|          1|        16400| 42|               4|    0|
|    A_2|    personal|            0|                   3300|                                           0.29|         2588|            832|          1|        75500| 56|               1|    0|
|    A_3|    personal|            0|    

In [11]:
# Compute summary statistics for every column, then display them.
summary_stats = df.summary()
summary_stats.show()

+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|summary|loan_id|loan_purpose|     is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|     saving_amount|   checking_amount|       is_employed|     yearly_salary|               age|  dependent_number|              label|
+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  count|  46751|       46751|             46751|                  46751|                                          46751|             46751|             46751|             46751|             46751|             46751|             467

In [14]:
# Count rows per target class to check the class balance.
label_counts = df.groupBy('label').count()
label_counts.show()

+-----+-----+
|label|count|
+-----+-----+
|    1|16201|
|    0|30550|
+-----+-----+



In [15]:
# Count rows per loan purpose, the only categorical feature.
loan_purpose_counts = df.groupBy('loan_purpose').count()
loan_purpose_counts.show()

+------------+-----+
|loan_purpose|count|
+------------+-----+
|      others| 6763|
|   emergency| 7562|
|    property|11388|
|  operations|10580|
|    personal|10458|
+------------+-----+



# **Data Transformation**

In [19]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Make the cell safe to re-run: the transformers below raise an error when
# their output column already exists, so remove any copies left by an earlier
# execution. drop() ignores names that are absent, so a first run is unaffected.
df = df.drop("loan_index", "loan_purpose_vec")

# Map each loan_purpose string to a numeric category index.
loan_purpose_indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_index")
df = loan_purpose_indexer.fit(df).transform(df)

# One-hot encode the category index into a sparse vector column.
loan_encoder = OneHotEncoder(inputCol="loan_index", outputCol='loan_purpose_vec')
loan_encoder_model = loan_encoder.fit(df)
df = loan_encoder_model.transform(df)

# Show the original value next to its index and encoded vector, untruncated.
df.select(['loan_purpose', 'loan_index', 'loan_purpose_vec']).show(3, truncate=False)


+------------+----------+----------------+
|loan_purpose|loan_index|loan_purpose_vec|
+------------+----------+----------------+
|personal    |2.0       |(4,[2],[1.0])   |
|personal    |2.0       |(4,[2],[1.0])   |
|personal    |2.0       |(4,[2],[1.0])   |
+------------+----------+----------------+
only showing top 3 rows



In [20]:
from pyspark.ml.feature import VectorAssembler

# Input columns merged, in this order, into the single feature vector.
feature_cols = [
    'is_first_loan',
    'total_credit_card_limit',
    'avg_percentage_credit_card_limit_used_last_year',
    'saving_amount',
    'checking_amount',
    'is_employed',
    'yearly_salary',
    'age',
    'dependent_number',
    'loan_purpose_vec',
]

# Make the cell safe to re-run: the assembler raises an error when its output
# column already exists. drop() ignores a missing column, so a first run is
# unaffected.
df = df.drop("features")

df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = df_assembler.transform(df)

# Display the assembled vectors next to the target, untruncated.
df.select(['features','label']).show(10, truncate=False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,0.0,0.0,1.0,0.0] |0    |
|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,0.0,0.0,1.0,0.0] |0    |
|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,0.0,0.0,1.0,0.0] |0    |
|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,0.0,0.0,1.0,0.0]|0    |
|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,0.0,0.0,0.0,1.0] |1    |
|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,0.0,1.0,0.0,0.0]|0    |
|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,0.0,1.0,0.0,0.0]|0    |
|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,0.0,0.0,1.0,0.0]|0    |
|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,0.0,0.0,1.0,0.0]|0    |
|[0.0,2900.0,0.91,88.0,2725.0,1.0,21100.0,52.0,1.0,0.0,0.0,1.0,0.0]  |1    |

In [21]:
# Keep only the two columns used for training and evaluation.
model_df = df.select('features', 'label')

# **Splitting into Train and Test Data**

In [22]:
# 75% / 25% train/test split. A fixed seed keeps the split, and therefore the
# metrics reported below, the same across runs (assuming the input data and
# its partitioning are unchanged).
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

# **Model Training**

In [23]:
from pyspark.ml.classification import LogisticRegression

# Build the estimator with its default settings, then fit it on the training split.
lr_estimator = LogisticRegression()
log_reg = lr_estimator.fit(training_df)

# Training-set metrics collected during fitting.
lr_summary = log_reg.summary

# Accuracy on the training data.
lr_summary.accuracy

0.895307582260372

In [24]:
# Area under the ROC curve on the training data.
lr_summary.areaUnderROC

0.9591584712788663

In [25]:
# Per-label precision on the training data.
train_precision = lr_summary.precisionByLabel
print(train_precision)

[0.924433249370277, 0.8418147877607337]


In [26]:
# Per-label recall on the training data.
train_recall = lr_summary.recallByLabel
print(train_recall)

[0.9147717334266223, 0.8584671412017878]


In [27]:
# Score the held-out rows; the output adds rawPrediction, probability and
# prediction columns next to the inputs.
predictions = log_reg.transform(test_df)
predictions.show(n=10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(13,[0,1,2,3,4,7]...|    1|[-6.9114082572908...|[9.95361953148917...|       1.0|
|(13,[0,1,2,3,4,7]...|    1|[-1.8519132930933...|[0.13564841095108...|       1.0|
|(13,[0,1,2,3,4,7]...|    1|[-1.5474300022587...|[0.17545776568584...|       1.0|
|(13,[0,1,2,3,4,7]...|    0|[6.55941888405937...|[0.99858529548104...|       0.0|
|(13,[0,1,2,3,4,7,...|    1|[-4.5229157037258...|[0.01074070545814...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-6.0246200201574...|[0.00241263547538...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-6.2625121325886...|[0.00190282288181...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-6.5829963322610...|[0.00138178508909...|       1.0|
|(13,[0,1,2,3,4,7,...|    1|[-4.3341657372251...|[0.01294308878783...|       1.0|
|(13,[0,1,2,3,4,

In [28]:
# Evaluate the fitted model on the held-out test split. The previous
# `log_reg.transform(test_df)` assignment was overwritten immediately and never
# used, so it has been removed.
model_prediction = log_reg.evaluate(test_df)

# Accuracy on the test data.
model_prediction.accuracy

0.8900093212439624

In [29]:
# Area under the ROC curve on the test data.
model_prediction.areaUnderROC

0.9582228873264587

In [30]:
# Per-label recall on the test data.
test_recall = model_prediction.recallByLabel
print(test_recall)

[0.9065347565738089, 0.8591891235736829]


In [31]:
# Per-label precision on the test data.
test_precision = model_prediction.precisionByLabel
print(test_precision)

[0.9231177094379639, 0.8313366220342965]
