In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the current OS user; the name is used below to build a per-user
# warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .appName("SparkDemo")
    .config("spark.ui.port", "0")
    # Key spelling fixed: the previous "useOldFethProtocol" is not a Spark
    # setting, so the old-fetch-protocol option was never actually applied.
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Load the raw accepted-loans CSV. The first line is treated as the header
# and column types are inferred from the data.
raw_csv_path = "/public/trendytech/datasets/accepted_2007_to_2018Q4.csv"
raw_df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load(raw_csv_path)
)

In [3]:
# Register the raw frame as a temporary view so it can be queried with Spark SQL.
raw_df.createOrReplaceTempView(name="lending_club_data")

In [4]:
from pyspark.sql.functions import sha2,concat_ws

In [5]:
# Columns whose values are combined into the hashed key (later selected as
# member_id when the per-entity CSVs are written).
member_key_columns = [
    "emp_title",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "zip_code",
    "addr_state",
    "grade",
    "sub_grade",
    "verification_status",
]

# Join the column values with "||" and take the SHA-256 digest of the result.
# NOTE(review): concat_ws skips NULL inputs, so rows that differ only in which
# field is NULL can end up with the same hash — confirm this is acceptable.
member_key = sha2(concat_ws("||", *member_key_columns), 256)
new_df = raw_df.withColumn("name_sha2", member_key)

In [6]:
# Register the frame carrying the name_sha2 column as the view "newtable",
# which the SQL cells below query.
new_df.createOrReplaceTempView(name="newtable")

In [7]:
# Total number of rows in the hashed view.
row_count_query = "select count(1) from newtable"
spark.sql(row_count_query)

count(1)
2260701


In [8]:
# Number of distinct hashed keys; compare with the total row count to see how
# many rows share a key.
distinct_key_query = "select count(distinct(name_sha2)) from newtable"
spark.sql(distinct_key_query)

count(DISTINCT name_sha2)
2257384


In [9]:
# Read the customers CSV from the project's raw area, using the header row for
# column names and inferring column types.
customers_csv_path = "/user/itv009959/lendingclubproject/raw/customers_data_csv"
customers_df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load(customers_csv_path)
)

In [10]:
# Show the customers frame as the cell output to eyeball the data just loaded.
customers_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
707271898dcabc8b2...,Physician Service...,3 years,RENT,40400.0,CO,801xx,USA,A,A2,Not Verified,68759.0,Individual,,
8e1ea10aca3c4ad8f...,Operations,10+ years,MORTGAGE,53000.0,AR,720xx,USA,B,B2,Source Verified,63143.0,Individual,,
1d6546a2cbc1fd240...,Underwriter,2 years,RENT,65000.0,ME,040xx,USA,B,B4,Not Verified,66695.0,Individual,,
d6208beced388988f...,Crome restorer sp...,10+ years,MORTGAGE,60000.0,IL,606xx,USA,C,C1,Not Verified,68900.0,Individual,,
b4af936688c28c165...,Program Coordinator,1 year,RENT,38000.0,FL,322xx,USA,A,A5,Not Verified,76877.0,Individual,,
2c04e047879ada04e...,Executive Director,10+ years,MORTGAGE,166000.0,IL,601xx,USA,C,C2,Not Verified,217868.0,Individual,,
39dfcd293cb7b2c17...,Emergency Managme...,4 years,MORTGAGE,81000.0,TX,761xx,USA,C,C4,Not Verified,293276.0,Individual,,
5e6e1f8ad59c71a0b...,Clinical Applicat...,3 years,MORTGAGE,82000.0,CO,801xx,USA,A,A1,Not Verified,393500.0,Individual,,
afd3b57e55eb95ed8...,Systems Analyst 3,4 years,OWN,118030.0,MI,482xx,USA,A,A3,Not Verified,82137.0,Individual,,
8b5eed45ac53a0238...,Director of Front...,4 years,RENT,62000.0,NY,110xx,USA,A,A5,Not Verified,17400.0,Individual,,


In [11]:
# Select the loan-level columns from the hashed view, exposing the raw id as
# loan_id and the SHA-2 key as member_id.
loans_query = """ select id as loan_id,name_sha2 as member_id,loan_amnt,funded_amnt,
term,int_rate,installment,issue_d, loan_status,purpose,title
from newtable
"""
loans_output_path = "/user/itv009959/lendingclubproject/raw/loans_data_csv"

# Collapse to one partition, then write CSV with a header row, overwriting
# whatever already exists at the target path.
(
    spark.sql(loans_query)
    .repartition(1)
    .write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", loans_output_path)
    .save()
)

In [12]:
# Read the loans CSV from the project's raw area, using the header row for
# column names and inferring column types.
loans_csv_path = "/user/itv009959/lendingclubproject/raw/loans_data_csv"
loans_df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load(loans_csv_path)
)

In [13]:
# Show the loans frame as the cell output to check the columns read back.
loans_df

loan_id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,issue_d,loan_status,purpose,title
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation


In [14]:
# Select the repayment columns from the hashed view, keyed by loan_id.
repayments_query = """ select id as loan_id,total_rec_prncp,total_rec_int,total_rec_late_fee,total_pymnt,last_pymnt_amnt,last_pymnt_d,next_pymnt_d
from newtable
"""
repayments_output_path = "/user/itv009959/lendingclubproject/raw/loans_repayments_csv"

# Collapse to one partition, then write CSV with a header row, overwriting
# whatever already exists at the target path.
(
    spark.sql(repayments_query)
    .repartition(1)
    .write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", repayments_output_path)
    .save()
)

In [15]:
# Read the repayments CSV from the project's raw area, using the header row
# for column names and inferring column types.
repayments_csv_path = "/user/itv009959/lendingclubproject/raw/loans_repayments_csv"
loans_repayments_df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load(repayments_csv_path)
)

In [16]:
# Show the repayments frame as the cell output to check the columns read back.
loans_repayments_df

loan_id,total_rec_prncp,total_rec_int,total_rec_late_fee,total_pymnt,last_pymnt_amnt,last_pymnt_d,next_pymnt_d
6300587,12000.0,1864.87,20.8600000091,13885.7261041684,108.83,Nov-2014,
6311418,7125.0,95.92,0.0,7220.924234675899,7220.52,Sep-2013,
6321535,24000.0,7418.84,0.0,31418.84,13979.48,Apr-2016,
6300526,24000.0,2296.28,0.0,26296.2757411036,730.17,Aug-2016,
6301469,18000.0,9239.79,0.0,27239.794818997,452.24,Aug-2018,
6306881,20000.0,2469.32,0.0,22469.3219387879,16744.64,Oct-2014,
6301579,12000.0,2089.07,0.0,14089.07,4189.07,Oct-2015,
6291661,28000.0,12441.42,0.0,40441.4227009823,10660.91,Apr-2017,
6311612,3629.9,847.39,0.0,4478.25,179.13,Sep-2015,
6311704,23600.0,2077.3,0.0,25677.297703591,21445.23,Apr-2014,


In [18]:
# Select the delinquency / public-record columns from the hashed view, keyed
# by the SHA-2 member_id.
defaulters_query = """ select name_sha2 as member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,
total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
from newtable
"""
defaulters_output_path = "/user/itv009959/lendingclubproject/raw/loans_defaulters_csv"

# Collapse to one partition, then write CSV with a header row, overwriting
# whatever already exists at the target path.
(
    spark.sql(defaulters_query)
    .repartition(1)
    .write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", defaulters_output_path)
    .save()
)

In [19]:
# Read the defaulters CSV from the project's raw area, using the header row
# for column names and inferring column types.
defaulters_csv_path = "/user/itv009959/lendingclubproject/raw/loans_defaulters_csv"
loans_defaulters_df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load(defaulters_csv_path)
)

In [20]:
# Show the defaulters frame as the cell output to check the columns read back.
loans_defaulters_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
b59d80da191f5b573...,0.0,0.0,0.0,0.0,1.0,0.0,31.0,
202d9f56ecb7c3bc9...,1.0,0.0,0.0,0.0,0.0,0.0,6.0,
e5a140c0922b554b9...,0.0,0.0,0.0,0.0,0.0,0.0,47.0,
e12aefc548f750777...,0.0,0.0,0.0,0.0,0.0,0.0,33.0,
1b3a50d854fbbf97e...,1.0,0.0,0.0,0.0,0.0,0.0,21.0,
1c4329e5f17697127...,0.0,0.0,0.0,0.0,0.0,0.0,,
5026c86ad983175eb...,0.0,0.0,1.0,0.0,2.0,0.0,,71.0
9847d8c1e9d0b2084...,1.0,0.0,2.0,0.0,0.0,0.0,6.0,63.0
8340dbe1adea41fb4...,0.0,0.0,0.0,0.0,0.0,0.0,36.0,
d4de0de3ab7d79ad4...,0.0,0.0,0.0,0.0,0.0,0.0,35.0,
