## Imports

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DateType
from pprint import pprint

## Variables

In [2]:
# Relative path to the raw loans CSV (plain literal: nothing is interpolated).
path = '../data/raw/loans_raw.csv'

## Spark Session

In [3]:
# Get the existing Spark session, or create one with the app name 'Loans'.
spark = (
    SparkSession.builder
    .appName('Loans')
    .getOrCreate()
)

## Spark functions

In [4]:
def drop_na_cols(data, pct):
    """Drop every column whose share of null values is at least ``pct``.

    Args:
        data: Spark DataFrame to filter.
        pct: Null-fraction threshold; a column is dropped when
            ``null_count / row_count >= pct``.

    Returns:
        A DataFrame holding only the columns below the threshold, in their
        original order. A DataFrame without rows keeps all of its columns.
    """
    columns = data.columns

    # A single aggregation returns the row total and every per-column null
    # count, so the data is scanned once and nothing goes through pandas.
    # ``when`` without ``otherwise`` yields null for non-matching rows and
    # ``count`` skips nulls, so each aggregate counts only the null cells.
    stats = data.select(
        F.count(F.lit(1)),
        *[
            F.count(F.when(F.isnull(name), name)).alias(name)
            for name in columns
        ]
    ).first()

    # Read the result by position so column names cannot clash with the total.
    rows, null_counts = stats[0], stats[1:]
    if rows == 0:
        # No rows means no measurable null fraction: keep everything.
        return data

    keep = [
        name
        for name, nulls in zip(columns, null_counts)
        if nulls / rows < pct
    ]
    return data.select(keep)

def lower_case_cols(data):
    """Return ``data`` with every string-typed column converted to lower case.

    Columns of any other type are left untouched.
    """
    string_columns = [
        name for name, dtype in dict(data.dtypes).items() if dtype == 'string'
    ]

    result = data
    for name in string_columns:
        result = result.withColumn(name, F.lower(F.col(name)))
    return result
    
def remove_whitespace(data):
    """Strip leading and trailing spaces from every string-typed column.

    Interior whitespace and letter case are left as they are; columns of any
    other type are untouched.

    Args:
        data: Spark DataFrame to clean.

    Returns:
        A DataFrame with the same columns, the string ones trimmed.
    """
    for name, dtype in data.dtypes:
        if dtype == 'string':
            # Previously this applied F.lower (copied from lower_case_cols),
            # so no whitespace was ever removed.
            data = data.withColumn(name, F.trim(F.col(name)))

    return data

def make_col_numeric(data, column):
    """Return ``data`` with ``column`` replaced by its cast to ``IntegerType``."""
    as_integer = data[column].cast(IntegerType())
    return data.withColumn(column, as_integer)

def truncate_credit_line(data, column):
    """Replace ``column`` with the second '-'-separated piece of its value.

    NOTE(review): presumably the values look like ``<month>-<year>`` so that
    only the year part is kept; confirm against the raw data.
    """
    pieces = F.split(F.col(column), '-')
    # Index 1 is the element right after the first '-'.
    return data.withColumn(column, pieces.getItem(1))

## Read data

In [5]:
# Load the CSV at `path`: the first row supplies the column names and Spark
# infers each column's type from the data.
data = spark.read.csv(path, header=True, inferSchema=True)

In [6]:
# Print the schema Spark inferred while reading the CSV.
data.printSchema()

root
 |-- account_id: integer (nullable = true)
 |-- installment: double (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- interest_rate: double (nullable = true)
 |-- term: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- description: string (nullable = true)
 |-- title: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: string (nullable = true)
 |-- employment_length: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- earliest_credit_line: string (nullable = true)
 |-- public_records: string (nullable = true)
 |-- last_record_months: string (nullable = true)
 |-- last_delinquency_months: string (nullable = true)
 |-- last_derog_months: string (nullable = true)
 |-- delinquency_2y: string (nullable = true)
 |-- inquiries_6m: string (nullable = true)
 |-- open_accounts: string (nullable = true)
 |-- debt_to_income: string (nullable = true)
 |-- credi

## Data Cleansing

In [7]:
# Drop the columns in which at least half of the values are null.
data = drop_na_cols(data, 0.5)

In [8]:
# List the remaining columns and their types after drop_na_cols.
data.dtypes

[('account_id', 'int'),
 ('installment', 'double'),
 ('loan_amount', 'double'),
 ('interest_rate', 'double'),
 ('term', 'string'),
 ('purpose', 'string'),
 ('issue_date', 'string'),
 ('title', 'string'),
 ('home_ownership', 'string'),
 ('annual_income', 'string'),
 ('employment_length', 'string'),
 ('job_title', 'string'),
 ('earliest_credit_line', 'string'),
 ('public_records', 'string'),
 ('delinquency_2y', 'string'),
 ('inquiries_6m', 'string'),
 ('open_accounts', 'string'),
 ('debt_to_income', 'string'),
 ('credit_card_usage', 'string'),
 ('credit_card_balance', 'string'),
 ('total_current_balance', 'string'),
 ('nr_accounts', 'string'),
 ('loan_status', 'string'),
 ('amount_payed', 'string'),
 ('year', 'string'),
 ('district', 'string'),
 ('postcode_district', 'string'),
 ('credit_score', 'string')]

In [9]:
# Lower-case the values of every string column.
data = lower_case_cols(data=data)

In [10]:
# List the column types again after lower_case_cols.
data.dtypes

[('account_id', 'int'),
 ('installment', 'double'),
 ('loan_amount', 'double'),
 ('interest_rate', 'double'),
 ('term', 'string'),
 ('purpose', 'string'),
 ('issue_date', 'string'),
 ('title', 'string'),
 ('home_ownership', 'string'),
 ('annual_income', 'string'),
 ('employment_length', 'string'),
 ('job_title', 'string'),
 ('earliest_credit_line', 'string'),
 ('public_records', 'string'),
 ('delinquency_2y', 'string'),
 ('inquiries_6m', 'string'),
 ('open_accounts', 'string'),
 ('debt_to_income', 'string'),
 ('credit_card_usage', 'string'),
 ('credit_card_balance', 'string'),
 ('total_current_balance', 'string'),
 ('nr_accounts', 'string'),
 ('loan_status', 'string'),
 ('amount_payed', 'string'),
 ('year', 'string'),
 ('district', 'string'),
 ('postcode_district', 'string'),
 ('credit_score', 'string')]

In [11]:
# Run the remove_whitespace cleaning step over the string columns.
data = remove_whitespace(data=data)

In [12]:
# List the column types again after remove_whitespace.
data.dtypes

[('account_id', 'int'),
 ('installment', 'double'),
 ('loan_amount', 'double'),
 ('interest_rate', 'double'),
 ('term', 'string'),
 ('purpose', 'string'),
 ('issue_date', 'string'),
 ('title', 'string'),
 ('home_ownership', 'string'),
 ('annual_income', 'string'),
 ('employment_length', 'string'),
 ('job_title', 'string'),
 ('earliest_credit_line', 'string'),
 ('public_records', 'string'),
 ('delinquency_2y', 'string'),
 ('inquiries_6m', 'string'),
 ('open_accounts', 'string'),
 ('debt_to_income', 'string'),
 ('credit_card_usage', 'string'),
 ('credit_card_balance', 'string'),
 ('total_current_balance', 'string'),
 ('nr_accounts', 'string'),
 ('loan_status', 'string'),
 ('amount_payed', 'string'),
 ('year', 'string'),
 ('district', 'string'),
 ('postcode_district', 'string'),
 ('credit_score', 'string')]

In [13]:
# credit_score was read as a string; cast it to an integer column.
data = make_col_numeric(data=data, column='credit_score')

In [14]:
# annual_income was read as a string; cast it to an integer column.
data = make_col_numeric(data=data, column='annual_income')

In [15]:
# Keep only the part of earliest_credit_line that follows the first '-'.
data = truncate_credit_line(data=data, column='earliest_credit_line')

In [18]:
# Preview the annual_income column after the integer cast.
data.select('annual_income').show()

+-------------+
|annual_income|
+-------------+
|        85000|
|        54000|
|        32000|
|        58000|
|        80800|
|       148000|
|        45000|
|        54000|
|        60000|
|        27000|
|        24000|
|        56000|
|       100000|
|       110000|
|        46000|
|       125000|
|        91000|
|        84996|
|        42000|
|        40000|
+-------------+
only showing top 20 rows



In [17]:
# Register the cleaned DataFrame as the temp view 'df' for SQL queries.
# createOrReplaceTempView keeps the cell re-runnable: createTempView raises
# when a view named 'df' already exists in the session.
data.createOrReplaceTempView('df')