In [1]:
import os
import sys

os.environ["JAVA_HOME"] = "../.JDK 8" 
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, OneHotEncoder, VectorAssembler, StringIndexer, IDF,HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator,TrainValidationSplit, ParamGridBuilder




In [4]:
# Create a new Sparksession
spark = SparkSession\
    .builder\
    .appName('credit_card_analysis')\
    .getOrCreate()
    
spark

25/02/22 15:43:19 WARN Utils: Your hostname, Mint-T470 resolves to a loopback address: 127.0.1.1; using 10.46.79.254 instead (on interface wlp4s0)
25/02/22 15:43:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/22 15:43:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [59]:
path = "Credit_card.csv"

df = spark.read.csv(path, header=True, inferSchema=True)
df.show(5)


+-------+------+---------+-------------+--------+-------------+--------------------+----------------+--------------+-----------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
| Ind_ID|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|         Type_Income|       EDUCATION|Marital_status|     Housing_type|Birthday_count|Employed_days|Mobile_phone|Work_Phone|Phone|EMAIL_ID|Type_Occupation|Family_Members|
+-------+------+---------+-------------+--------+-------------+--------------------+----------------+--------------+-----------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|5008827|     M|        Y|            Y|       0|     180000.0|           Pensioner|Higher education|       Married|House / apartment|        -18772|       365243|           1|         0|    0|       0|           NULL|             2|
|5009744|     F|        Y|            N|       0|     315000.0|C

In [60]:
null_count = df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_count.show()


+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|Ind_ID|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|Type_Income|EDUCATION|Marital_status|Housing_type|Birthday_count|Employed_days|Mobile_phone|Work_Phone|Phone|EMAIL_ID|Type_Occupation|Family_Members|
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+--------------+-------------+------------+----------+-----+--------+---------------+--------------+
|     0|     7|        0|            0|       0|           23|          0|        0|             0|           0|            22|            0|           0|         0|    0|       0|            488|             0|
+------+------+---------+-------------+--------+-------------+-----------+---------+--------------+------------+--------------+-------------+-----------

NULL Occupation have been converted to N/A so that I can perserve data 

In [61]:
df = df.na.fill(value="N/A",subset=["Type_Occupation"])


In [62]:
df = df.na.drop()

In [63]:
categorical_features = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income', 'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation', 'Family_Members']

numeric_features = ['CHILDREN', 'Annual_income','Birthday_count', 'Employed_days']

drop_columns = ['Mobile_phone', 'Work_Phone', 'Phone', 'EMAIL_ID', 'Ind_ID']

In [64]:
df = df.drop(*drop_columns)

In [65]:
stages = []

for cat_feature in categorical_features:
    string_indexer = StringIndexer(inputCol = cat_feature,
                                    outputCol = f"{cat_feature}Index")
    encoder = OneHotEncoder(inputCol=f"{cat_feature}Index",
                            outputCol=f"{cat_feature}Vector")
    stages+= [string_indexer, encoder]

In [66]:
all_features = numeric_features + [f"{cat}Vector" for cat in categorical_features]

assembler = VectorAssembler(inputCols=all_features,
                            outputCol="features")

stages += [assembler]

In [67]:
pipeline = Pipeline(stages=stages)


In [68]:
transformed_data = pipeline.fit(df).transform(df)

transformed_data.show(5)

+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------+-----------------+--------------+-------------+---------------+--------------+-----------+-------------+--------------+---------------+------------------+-------------------+----------------+-----------------+--------------+---------------+-------------------+--------------------+-----------------+------------------+--------------------+---------------------+-------------------+--------------------+--------------------+
|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|         Type_Income|           EDUCATION|Marital_status|     Housing_type|Birthday_count|Employed_days|Type_Occupation|Family_Members|GENDERIndex| GENDERVector|Car_OwnerIndex|Car_OwnerVector|Propert_OwnerIndex|Propert_OwnerVector|Type_IncomeIndex|Type_IncomeVector|EDUCATIONIndex|EDUCATIONVector|Marital_statusIndex|Marital_statusVector|Housing_typeIndex|Housing_typeVector|Type_OccupationIndex|Type_Occupati

In [69]:
final_data = transformed_data.select("features", "Annual_income")
final_data.show()


+--------------------+-------------+
|            features|Annual_income|
+--------------------+-------------+
|(47,[1,2,3,6,9,11...|     180000.0|
|(47,[1,2,3,4,8,11...|     315000.0|
|(47,[1,2,3,4,8,11...|     315000.0|
|(47,[1,2,3,4,8,11...|     315000.0|
|(47,[1,2,3,4,5,9,...|     180000.0|
|(47,[0,1,2,3,6,8,...|     450000.0|
|(47,[0,1,2,3,6,9,...|     450000.0|
|(47,[0,1,2,3,6,8,...|     450000.0|
|(47,[1,2,3,4,6,7,...|      90000.0|
|(47,[1,2,3,4,6,7,...|      90000.0|
|(47,[0,1,2,3,5,9,...|     472500.0|
|(47,[1,2,3,5,6,7,...|     270000.0|
|(47,[1,2,3,5,6,7,...|     270000.0|
|(47,[1,2,3,4,5,6,...|     126000.0|
|(47,[1,2,3,4,5,6,...|     126000.0|
|(47,[1,2,3,5,6,9,...|      90000.0|
|(47,[1,2,3,5,7,11...|     202500.0|
|(47,[1,2,3,6,8,10...|     202500.0|
|(47,[1,2,3,5,6,8,...|     157500.0|
|(47,[0,1,2,3,4,5,...|     112500.0|
+--------------------+-------------+
only showing top 20 rows



In [70]:
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [71]:
linear_reg = LinearRegression(featuresCol="features",
                              labelCol="Annual_income")

lr_model = linear_reg.fit(train_data)




25/02/22 16:24:10 WARN Instrumentation: [eaab03a5] regParam is zero, which might cause numerical instability and overfitting.
25/02/22 16:24:10 WARN Instrumentation: [eaab03a5] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


In [72]:
linear_pred = lr_model.transform(test_data)
# linear_pred.show(10)
linear_pred.select("Annual_income", "prediction").show(10)

+-------------+------------------+
|Annual_income|        prediction|
+-------------+------------------+
|     166500.0|166482.44809553307|
|     108000.0|107997.37562342436|
|     157500.0| 157499.5651360177|
|     270000.0| 269986.7398159321|
|     108000.0| 108005.4538816721|
|     126000.0|  126009.319571566|
|     144000.0|143994.86988744364|
|     225000.0|225008.38456975084|
|     135000.0|135001.56093459693|
|      54000.0|54018.335736256246|
+-------------+------------------+
only showing top 10 rows



In [73]:
linear_evaluator_r2 = RegressionEvaluator(labelCol="Annual_income",
                                       predictionCol="prediction",
                                       metricName="r2")

linear_r2 = linear_evaluator_r2.evaluate(linear_pred)

print(f"The coefficient of determination: {linear_r2}")

The coefficient of determination: 0.9999997427469859


In [74]:
linear_evaluator_rmse = RegressionEvaluator(labelCol="Annual_income",
                                       predictionCol="prediction",
                                       metricName="rmse")

linear_rmse = linear_evaluator_rmse.evaluate(linear_pred)

print(f"The root mean square error is: {linear_rmse}")

The root mean square error is: 47.19082566540446


The root mean square error is: 64.76089197606947
The coefficient of determination: 0.9999996562457443

with ind_ID


In [77]:
max_income = df.agg(F.max("Annual_income")).collect()[0][0]
min_income = df.agg(F.min("Annual_income")).collect()[0][0]

range_income = max_income - min_income

print("max income:", max_income, 
      "min income", min_income)
print("range income", range_income)

max income: 1575000.0 min income 33750.0
range income 1541250.0


In [76]:
df.withColumn("Annual_income/Family_Members", (F.col("Annual_income")/F.col("Family_Members"))).show()

+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------------+-----------------+--------------+-------------+---------------+--------------+----------------------------+
|GENDER|Car_Owner|Propert_Owner|CHILDREN|Annual_income|         Type_Income|           EDUCATION|      Marital_status|     Housing_type|Birthday_count|Employed_days|Type_Occupation|Family_Members|Annual_income/Family_Members|
+------+---------+-------------+--------+-------------+--------------------+--------------------+--------------------+-----------------+--------------+-------------+---------------+--------------+----------------------------+
|     M|        Y|            Y|       0|     180000.0|           Pensioner|    Higher education|             Married|House / apartment|        -18772|       365243|            N/A|             2|                     90000.0|
|     F|        Y|            N|       0|     315000.0|Commercial associate|    Higher education