## Imports

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DateType
from pyspark.ml.feature import StringIndexer, MinMaxScaler, VectorAssembler
from pprint import pprint

## Variables

In [2]:
path = f'../data/interim/loans_clean_spark/part-*.csv'

In [3]:
distinct_count = 'distinct_count'

In [4]:
Id = ['account_id']
target = ['class']

## Spark Session

In [5]:
spark = SparkSession.builder.appName('Features').getOrCreate()

## Spark Functions

In [6]:
def get_nuniques(data):
    nuniques = data.agg(*(F.countDistinct(F.col(column)).alias(column) for column in data.columns))
    nuniques = nuniques.toPandas().transpose()
    nuniques.columns = [distinct_count]
    
    return nuniques

def binarize(data, column):
    output_name = f'binary_{column}'
    
    encoder = StringIndexer(
    inputCol=column,
    outputCol=output_name
    )
    
    model = encoder.fit(data)
    binarized_data = model.transform(data)
    
    binarized_data = binarized_data.drop(column)
    
    return binarized_data

def create_dummies(data, identifier, column):
    categories = data.select(column).distinct().rdd.flatMap(lambda x: x).collect()
    
    exprs = [F.when(F.col(column) == category, 1).otherwise(0).alias(f'{column}_{category}') for category in categories]
    
    return data.select(identifier, *exprs)

def min_max_scale(data, column):
    feature_name = f'feature_{column}'
    feature_assembler = VectorAssembler(
        inputCols=[column],
        outputCol=feature_name
    )
    scaler = MinMaxScaler(
        inputCol=feature_name,
        outputCol=f'scaled_{column}'
    )
    assembler = feature_assembler.transform(data)
    model = scaler.fit(assembler)
    encoded_data = model.transform(assembler)
    encoded_data = encoded_data.drop(*[column,feature_name])
    
    return encoded_data

## Read data

In [7]:
data = spark.read.csv(
    path,
    inferSchema=True,
    header=True
)

In [8]:
data.printSchema()

root
 |-- account_id: integer (nullable = true)
 |-- installment: double (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- interest_rate: double (nullable = true)
 |-- term: integer (nullable = true)
 |-- purpose: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: integer (nullable = true)
 |-- employment_length: string (nullable = true)
 |-- public_records: double (nullable = true)
 |-- delinquency_2y: double (nullable = true)
 |-- inquiries_6m: string (nullable = true)
 |-- open_accounts: double (nullable = true)
 |-- debt_to_income: double (nullable = true)
 |-- credit_card_usage: double (nullable = true)
 |-- credit_card_balance: double (nullable = true)
 |-- total_current_balance: double (nullable = true)
 |-- nr_accounts: double (nullable = true)
 |-- credit_score: integer (nullable = true)
 |-- credit_age_years: integer (nullable = true)
 |-- class: integer (nullable = true)



## Create Features

In [9]:
nuniques = data.agg(*(F.countDistinct(F.col(column)).alias(column) for column in data.columns)).toPandas().transpose()
nuniques.columns = [distinct_count]

binary_vars = [var for var in nuniques[nuniques.values == 2].index if var not in target]
categorical_vars = [var for var in nuniques[nuniques.values <=5].index if var not in target+binary_vars]
numerical_vars = [var for var in data.columns if var not in Id + target + categorical_vars + binary_vars]

In [10]:
for column in binary_vars:
    data = binarize(data, column)

In [11]:
for column in categorical_vars:
    dummy_var = create_dummies(data, Id[0], column)
    
    data = data.join(dummy_var, data[Id[0]] == dummy_var[Id[0]])
    data = data.drop(dummy_var[Id[0]])
    data = data.drop(column)

In [12]:
for var in numerical_vars:
    data = min_max_scale(data, var)

+-----+-----------+-------------+--------------------------+-------------------+------------------+--------------------+-----------------------+-------------------+----------------------+------------------------+----------------------+-----------------------+----------------------+----------+----------------------+--------------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------------+---------------------+------------------------+--------------------------+----------------------------+--------------------+--------------------+-----------------------+
|class|binary_term|purpose_other|purpose_debt_consolidation|purpose_credit_card|home_ownership_own|home_ownership_other|home_ownership_mortgage|home_ownership_rent|employment_length_7to9|employment_length_10plus|employment_length_1to3|employment_length_less1|employment_length_4to6|account_id|inquiries_6m_1_inquir

In [14]:
data = data.drop(*Id)

In [15]:
data.printSchema()

root
 |-- class: integer (nullable = true)
 |-- binary_term: double (nullable = false)
 |-- purpose_other: integer (nullable = false)
 |-- purpose_debt_consolidation: integer (nullable = false)
 |-- purpose_credit_card: integer (nullable = false)
 |-- home_ownership_own: integer (nullable = false)
 |-- home_ownership_other: integer (nullable = false)
 |-- home_ownership_mortgage: integer (nullable = false)
 |-- home_ownership_rent: integer (nullable = false)
 |-- employment_length_7to9: integer (nullable = false)
 |-- employment_length_10plus: integer (nullable = false)
 |-- employment_length_1to3: integer (nullable = false)
 |-- employment_length_less1: integer (nullable = false)
 |-- employment_length_4to6: integer (nullable = false)
 |-- inquiries_6m_1_inquiry: integer (nullable = false)
 |-- inquiries_6m_2plus_inquiry: integer (nullable = false)
 |-- inquiries_6m_no_inquiry: integer (nullable = false)
 |-- scaled_installment: vector (nullable = true)
 |-- scaled_loan_amount: vector

In [18]:
[var for var in data.columns if var not in target]

['binary_term',
 'purpose_other',
 'purpose_debt_consolidation',
 'purpose_credit_card',
 'home_ownership_own',
 'home_ownership_other',
 'home_ownership_mortgage',
 'home_ownership_rent',
 'employment_length_7to9',
 'employment_length_10plus',
 'employment_length_1to3',
 'employment_length_less1',
 'employment_length_4to6',
 'inquiries_6m_1_inquiry',
 'inquiries_6m_2plus_inquiry',
 'inquiries_6m_no_inquiry',
 'scaled_installment',
 'scaled_loan_amount',
 'scaled_interest_rate',
 'scaled_annual_income',
 'scaled_public_records',
 'scaled_delinquency_2y',
 'scaled_open_accounts',
 'scaled_debt_to_income',
 'scaled_credit_card_usage',
 'scaled_credit_card_balance',
 'scaled_total_current_balance',
 'scaled_nr_accounts',
 'scaled_credit_score',
 'scaled_credit_age_years']

In [17]:
features = VectorAssembler(
    inputCols = [var for var in data.columns if var not in target],
    outputCol = f'features'
)
output = features.transform(data)
output.show()

+-----+-----------+-------------+--------------------------+-------------------+------------------+--------------------+-----------------------+-------------------+----------------------+------------------------+----------------------+-----------------------+----------------------+----------------------+--------------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------------+---------------------+------------------------+--------------------------+----------------------------+--------------------+--------------------+-----------------------+--------------------+
|class|binary_term|purpose_other|purpose_debt_consolidation|purpose_credit_card|home_ownership_own|home_ownership_other|home_ownership_mortgage|home_ownership_rent|employment_length_7to9|employment_length_10plus|employment_length_1to3|employment_length_less1|employment_length_4to6|inquiries_6m_1_inquiry