In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from pyspark.mllib.stat import Statistics

In [2]:
spark = (SparkSession.builder.appName("Day 9").master("local[1]").enableHiveSupport().getOrCreate())

## Data Preparation

### Bank Analysis

<b>Context</b>

Banks play a crucial role in market economies. They decide who can obtain financing and on what terms and can make or break investment decisions. For markets and society to function, individuals and companies need access to credit. 
 
Banks use credit scoring algorithms to determine whether or not a loan should be granted by calculating the probability of default (unable to repay the loan). This assignment requires you to improve the credit scoring algorithm by predicting the probability that somebody will experience financial distress in the next two years.
  
Historical data are provided on 250,000 borrowers. See below dataset.

<b>Content</b>
Training, Test, Sample Entry and Submission Files are provided. Please check the format of the submission file.

<b>Dataset</b>
cs-training.csv
The dataset can be downloaded here: https://github.com/jxchen/Kaggle/blob/master/Give%20Me%20Some%20Credit/cs-training.csv

<br>)

In [3]:
file = 'C:/Users/Lenovo/Documents/cs-training.csv'

In [4]:
df = (spark.read.format('csv')
     .option("header", "true")
     .option("mode", "FAILFAST")
     .option("NA", '')
     .load(file))

In [5]:
df.columns

['no',
 'SeriousDlqin2yrs',
 'RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']

In [6]:
df = df.drop('no')

In [7]:
#change dataframe to rdd
rdd = df.rdd

In [8]:
#convert string values to int, float
#clean data
rdds = rdd.map(lambda x: (int(x[0]), float(x[1]), int(x[2]), 
                          int(x[3]), float(x[4]), 
                          int(x[5].replace('NA', '0')), 
                          int(x[6]), int(x[7]), int(x[8]), 
                          int(x[9].replace('NA', '0')),
                          int(x[10].replace('NA', '0'))))

In [9]:
rdds = rdds.distinct()

In [10]:
rdds.take(10)

[(1, 0.766126609, 45, 2, 0.802982129, 9120, 13, 0, 6, 0, 2),
 (0, 0.957151019, 40, 0, 0.121876201, 2600, 4, 0, 0, 0, 1),
 (0, 0.65818014, 38, 1, 0.085113375, 3042, 2, 1, 0, 0, 0),
 (0, 0.233809776, 30, 0, 0.036049682, 3300, 5, 0, 0, 0, 0),
 (0, 0.9072394, 49, 1, 0.024925695, 63588, 7, 0, 1, 0, 0),
 (0, 0.213178682, 74, 0, 0.375606969, 3500, 3, 0, 1, 0, 1),
 (0, 0.305682465, 57, 0, 5710.0, 0, 8, 0, 3, 0, 0),
 (0, 0.754463648, 39, 0, 0.209940017, 3500, 8, 0, 0, 0, 0),
 (0, 0.116950644, 27, 0, 46.0, 0, 2, 0, 0, 0, 0),
 (0, 0.189169052, 57, 0, 0.606290901, 23684, 9, 0, 4, 0, 2)]

### `1.	The overall statistics of overdue credit cards.`

#### <span style="color:red"> finding overdue credit card: (the total balance of credit card * monthly income) * no. of loans </span>

In [30]:
overdue_credit_cards = rdds.filter(lambda x: x[0] != 0) \
.map(lambda x: (x[0], int(((x[1]*x[5])*x[6]))))

In [31]:
import numpy as np
from pyspark.mllib.stat import Statistics

summary = Statistics.colStats(overdue_credit_cards)
print(summary.mean()[-1])
print(summary.variance()[-1])
print(summary.numNonzeros()[-1])

110862.91761647673
10346165813815.443
7710.0


### ` 2.	Combined statistics of age and overdue credit card.`

#### <span style="color:red"> finding overdue credit card: (the total balance of credit card * monthly income) * no. of loans </span>

In [13]:
age_credit_card = rdds.filter(lambda x: x[0] != 0) \
.map(lambda x: (x[2], int(((x[1]*x[5])*x[6])))) \
.filter(lambda x: x[1] != 0)

summary = Statistics.colStats(age_credit_card)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

[4.61298314e+01 1.43819832e+05]
[1.57992611e+02 1.34174938e+13]
[7710. 7710.]


### `3.	Combined statistics of the number of real estate mortgage and the overdue credit card.`

#### <span style="color:red"> finding overdue credit card: (the total balance of credit card * monthly income) * no. of loans </span>

In [14]:
mortgage_credit_card = rdds.filter(lambda x: x[0] != 0) \
.map(lambda x: (x[8], int(((x[1]*x[5])*x[6])*x[7]))) \
.filter(lambda x: x[1] != 0)

summary = Statistics.colStats(mortgage_credit_card)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

[7.46063787e-01 1.76150864e+05]
[1.02555498e+00 1.55439797e+13]
[1200. 2477.]


### `4.	Combined statistics of the number of family members and the overdue credit card.`

#### <span style="color:red"> finding overdue credit card: (the total balance of credit card * monthly income) * no. of loans </span>

In [15]:
family_credit_card = rdds.filter(lambda x: x[0] != 0) \
.map(lambda x: (x[10], int(((x[1]*x[5])*x[6])))) \
.filter(lambda x: x[1] != 0)

summary = Statistics.colStats(family_credit_card)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

[1.06044099e+00 1.43819832e+05]
[1.54103444e+00 1.34174938e+13]
[4163. 7710.]


### `5.	Combined statistics of monthly income and overdue credit card.`

#### <span style="color:red"> finding overdue credit card: (the total balance of credit card * monthly income) * no. of loans </span>

In [16]:
income_credit_card = rdds.filter(lambda x: x[0] != 0) \
.map(lambda x: (x[5], int(((x[1]*x[5])*x[6])))) \
.filter(lambda x: x[1] != 0)

summary = Statistics.colStats(income_credit_card)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

[  5751.10077821 143819.83164721]
[3.29908276e+07 1.34174938e+13]
[7710. 7710.]


### `6.	Statistics of overdue users of different ages.`

In [17]:
overdue_users_age = rdds.filter(lambda x: x[0] != 0) \
.map(lambda x: (int(((x[1]*x[5])*x[6])), x[2])) \
.filter(lambda x: x[1] != 0)

In [18]:
mapvalues = overdue_users_age.mapValues(lambda x: (x, 1))
reducebykey = mapvalues.reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))
maps = reducebykey.map(lambda x: (x[0], x[1][0]))

In [19]:
summary = Statistics.colStats(maps)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

[1.58344842e+05 6.62169404e+01]
[1.48999592e+13 1.55771384e+06]
[6941. 6942.]


### `7.	Among the overdue users, how many people have monthly income of less than 10,000 and have real estate loans, and what is the total proportion?`

In [26]:
monthlyincome_1000 = rdds.filter(lambda x: x[5] < 10000) \
.filter(lambda x: x[8] > 0) \
.filter(lambda x: x[0] != 0) \
.count()

In [27]:
monthlyincome_1000/rdd.count()

0.03084

### `8.	Among the overdue users, how many people with monthly income of less than 10,000 and have more than 3 family members, and what is the total proportion?`

In [22]:
monthlyincome_fam = rdds.filter(lambda x: x[5] < 10000) \
.filter(lambda x: x[10] > 3) \
.count()

In [23]:
monthlyincome_fam/rdd.count()

0.018593333333333333

### `9.	What is the total amount of credit card outstanding among all overdue users?`

In [24]:
total_credit_card = rdds.map(lambda x: int(((x[1]*x[5])*x[6]))) \
.filter(lambda x: x != 0) \
.reduce(lambda x,y: x+y)

In [25]:
total_credit_card

51669782474