# Lending Club Case Study

## Initializing all imports of libraries needed

In [23]:
#importing all required libraries 
import numpy as np
import pandas as pd



## Data retrieval and initial observations

In [24]:
# Load the loan records from the CSV file into the variable `data`.
# The first attempt, without low_memory=False, produced a pandas warning about
# mixed data types in column 47 of the CSV file being read.
# Passing low_memory=False disables the memory optimization pandas applies while
# reading the file, which resolved that warning.

data = pd.read_csv(filepath_or_buffer="loan.csv", low_memory=False)

In [25]:
# Show the first five rows of the freshly loaded CSV data.
data.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


In [26]:
# Check the dimensions of the data; the displayed tuple is (number of rows, number of columns).
data.shape

(39717, 111)

On observing the data from the CSV, it is clear that there are 39717 rows of data with 111 columns in the file. For the purpose of this assignment, we might not need all of the data present, and hence some columns can be discarded. We will attempt to identify the data that is irrelevant to this assignment in the following steps.

## Data Clean Up Process

In [27]:
# The CSV file has 111 columns, and an initial look at the data showed several
# columns that hold NaN in every row. Such columns contribute nothing to the
# analysis, so removing them is the first clean-up step.

# Before removing anything, confirm that hypothesis by counting the null rows in every column.
data.isna().sum()

id                                0
member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
                              ...  
tax_liens                        39
tot_hi_cred_lim               39717
total_bal_ex_mort             39717
total_bc_limit                39717
total_il_high_credit_limit    39717
Length: 111, dtype: int64

In [28]:
# Many columns contain only null values, so remove them first.
# axis=1 targets columns and how='all' drops a column only when every value in it is null.
# The result is rebound to `data` instead of using inplace=True, so the frame is not mutated in place.
data = data.dropna(axis=1, how='all')

In [29]:
# Preview the first five rows again now that the all-null columns are gone.
data.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,,Sep-13,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,,Apr-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0
4,1075358,1311748,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,...,Jun-16,May-16,0.0,1,INDIVIDUAL,0,0.0,0,0.0,0.0


In [30]:
# We have now reduced the number of columns to analyse from 111 to 57, but some of the remaining columns still
# do not contribute to our analysis. We will now try to identify them and remove them as well.

In [31]:
# At first glance through the CSV file, the following observations were made:
# pymnt_plan looks like it only has the value 'n'
# initial_list_status looks like it only has the value 'f'
# application_type looks like it only has the value 'INDIVIDUAL'
# acc_now_delinq looks like it only has the value 0
# chargeoff_within_12_mths looks like it only has the value 0
# delinq_amnt looks like it only has the value 0
# tax_liens looks like it only has the value 0

In [32]:
# Check whether the assumptions hold: list the distinct values found in each
# column that looked single-valued in the CSV file.
suspect_columns = [
    'pymnt_plan',
    'initial_list_status',
    'application_type',
    'acc_now_delinq',
    'chargeoff_within_12_mths',
    'delinq_amnt',
    'tax_liens',
]

# Collect every column's distinct values before printing anything.
distinct_values = [data[column].unique() for column in suspect_columns]

# Bind each result to a variable named after its column (same order as suspect_columns).
(
    pymnt_plan,
    initial_list_status,
    application_type,
    acc_now_delinq,
    chargeoff_within_12_mths,
    delinq_amnt,
    tax_liens,
) = distinct_values

# One report line per column, in the order listed above.
for column, values in zip(suspect_columns, distinct_values):
    print(f'{column} values = ', values)

pymnt_plan values =  ['n']
initial_list_status values =  ['f']
application_type values =  ['INDIVIDUAL']
acc_now_delinq values =  [0]
chargeoff_within_12_mths values =  [ 0. nan]
delinq_amnt values =  [0]
tax_liens values =  [ 0. nan]


In [33]:
# Remove every column in which all values are equal to 0.
# NaN does not compare equal to 0, so a column holding only 0 and NaN
# (chargeoff_within_12_mths, tax_liens) is not matched here and stays in the frame.

is_all_zero = (data == 0).all(axis=0)
columns_to_remove = data.columns[is_all_zero]
data = data.drop(columns=columns_to_remove)
data.head(n=5)

# Successfully removed 2 columns (acc_now_delinq and delinq_amnt).

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,chargeoff_within_12_mths,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,Jan-15,171.62,,May-16,0.0,1,INDIVIDUAL,0.0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,Apr-13,119.66,,Sep-13,0.0,1,INDIVIDUAL,0.0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,Jun-14,649.91,,May-16,0.0,1,INDIVIDUAL,0.0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,Jan-15,357.48,,Apr-16,0.0,1,INDIVIDUAL,0.0,0.0,0.0
4,1075358,1311748,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,...,May-16,67.79,Jun-16,May-16,0.0,1,INDIVIDUAL,0.0,0.0,0.0
