## Lending Club Case Study -- solution


**Step 1: Importing Necessary Libraries**

In [1]:
# Analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Extra
import warnings
warnings.filterwarnings(action="ignore")

**Step 2: Data Loading**

In [3]:
# Load the raw loan records from the CSV file in the working directory.
df_loan = pd.read_csv(filepath_or_buffer="loan.csv")

**Step 3: View the top 5 records of the dataFrame**

In [None]:
# Preview the first five rows (five is also head()'s default).
df_loan.head(n=5)

**Step 4: Check Shape, Info & dtypes details**

In [None]:
# Report the frame's dimensions followed by its per-column summary.
print("|------df_loan.shape----------|")
print(df_loan.shape)
print("|------df_loan.info()---------|")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df_loan.info()
print("|-----------------------------|")

**Step 5: List out int64 type features from dataFrame**

In [None]:
# Keep only the int64 columns and remember their names for the later steps.
df_loan_int64 = df_loan.select_dtypes(include=["int64"])
int64_type_features = df_loan_int64.columns

# One print call: sep="\n" puts banner, column index and footer on their own lines.
print(
    "|------int64 type features---------|",
    int64_type_features,
    "|----------------------------------|",
    sep="\n",
)

**Step 6: List out float64 type features from dataFrame**

In [None]:
# Keep only the float64 columns and remember their names for the later steps.
df_loan_float64 = df_loan.select_dtypes(include=["float64"])
float64_type_features = df_loan_float64.columns

# One print call: sep="\n" puts banner, column index and footer on their own lines.
print(
    "|------float64 type features---------|",
    float64_type_features,
    "|------------------------------------|",
    sep="\n",
)

**Step 7: List out object type features from dataFrame**

In [None]:
# Keep only the object (string-like) columns and remember their names.
df_loan_object = df_loan.select_dtypes(include=["object"])
object_type_features = df_loan_object.columns

# One print call: sep="\n" puts banner, column index and footer on their own lines.
print(
    "|------object type features---------|",
    object_type_features,
    "|-----------------------------------|",
    sep="\n",
)

**Step 8: Check info() of all int64-type features**

In [None]:
# Print the summary (non-null counts, dtypes, memory usage) of the int64-only frame.
df_loan_int64.info()

**Step 9: Check info() of all float64-type features**

In [None]:
# Print the summary (non-null counts, dtypes, memory usage) of the float64-only frame.
df_loan_float64.info()

**Step 10: Check info() of all object-type features**

In [None]:
# Print the summary (non-null counts, dtypes, memory usage) of the object-only frame.
df_loan_object.info()

**Step 10A: Find the duplicate-rows in the *df_loan* dataFrame**

In [None]:
# Boolean mask: True for every row that repeats an earlier row in full.
duplicate_rows_in_df_loan = df_loan.duplicated(keep="first")

# Show the flagged rows themselves (an empty frame means no duplicates).
df_loan.loc[duplicate_rows_in_df_loan]

# OBSERVATION: We see that there are zero-duplicated-rows in the df_loan dataFrame

**Step 11: Find the number-of-rows in the *df_loan* dataFrame that are empty**

In [None]:
# A row is "empty" when every one of its columns is null.
row_is_all_null = df_loan.isna().all(axis="columns")
number_of_empty_rows = row_is_all_null.sum()
number_of_empty_rows

# OBSERVATION: There are zero-empty-rows in the df_loan dataFrame

**Step 12: Find the number-of-features in the *df_loan* dataFrame that are empty**

In [None]:
# A feature is "empty" when every one of its rows is null.
feature_is_all_null = df_loan.isna().all(axis="index")
number_of_empty_features = feature_is_all_null.sum()
number_of_empty_features

# OBSERVATION: There are 54-empty-features in the df_loan dataFrame

**Step 12A: List the features from *df_loan* dataFrame which are empty**

In [None]:
# Per-column flag: True where the whole column is null.
all_null_flags = df_loan.isnull().all(axis=0)

# Keep the names of the flagged columns, in their original column order.
empty_features = all_null_flags[all_null_flags].index.tolist()
print(empty_features)

**Step 12B: Drop the features from *df_loan* dataFrame which are empty**

In [16]:
# Remove the all-null columns from df_loan in place.
df_loan.drop(columns=empty_features, inplace=True)

**Step 12C: Check the current Shape of *df_loan* dataFrame after dropping the empty features**

In [None]:
# Show the dimensions of df_loan as it stands now, framed by banner lines.
print(
    "|------current df_loan.shape----------|",
    df_loan.shape,
    "|-----------------------------|",
    sep="\n",
)

# OBSERVATION: 48% of the features have been dropped at this point.

**Step 13: Find number-of-null-values in the features which have int64-dtype**

In [None]:
# Re-derive the int64-only frame from the current df_loan, which has lost
# columns since the dtype split was first made.
df_loan_int64 = df_loan.select_dtypes(include='int64')

# Print the per-column null counts explicitly: a bare expression in the middle
# of a notebook cell is evaluated but, by default, only the cell's last
# expression is echoed, so the counts would otherwise never be shown.
print(df_loan_int64.isna().sum())

# No int64 feature is selected for dropping.
drop_features_df_loan_int64 = []
drop_features_df_loan_int64

**Step 14: List the features which have float64-dtype and have many nulls**

In [None]:
# Re-derive the float64-only frame from the current df_loan, which has lost
# columns since the dtype split was first made.
df_loan_float64 = df_loan.select_dtypes(include='float64')

# Print the per-column null counts explicitly: a bare expression in the middle
# of a notebook cell is evaluated but, by default, only the cell's last
# expression is echoed, so the counts would otherwise never be shown.
print(df_loan_float64.isna().sum())

# float64 features selected for dropping (hand-picked from the counts above).
drop_features_df_loan_float64 = ['mths_since_last_delinq','mths_since_last_record']
drop_features_df_loan_float64

**Step 15: List the features which have object-dtype and have many nulls**

In [None]:
# Re-derive the object-only frame from the current df_loan, which has lost
# columns since the dtype split was first made.
df_loan_object = df_loan.select_dtypes(include='object')

# Print the per-column null counts explicitly: a bare expression in the middle
# of a notebook cell is evaluated but, by default, only the cell's last
# expression is echoed, so the counts would otherwise never be shown.
print(df_loan_object.isna().sum())

# object features selected for dropping (hand-picked from the counts above).
drop_features_df_loan_object = ['desc','next_pymnt_d']
drop_features_df_loan_object

**Step 16: Drop the above selected features**

In [21]:
# Drop each dtype group's selected features from df_loan in place,
# in the same int64 -> float64 -> object order.
for feature_group in (
    drop_features_df_loan_int64,
    drop_features_df_loan_float64,
    drop_features_df_loan_object,
):
    df_loan.drop(columns=feature_group, inplace=True)

In [None]:
# Dimensions (rows, columns) of df_loan after the selected features were dropped.
df_loan.shape

**Step 17: Create a dataFrame containing only DEFAULTED-LOAN records from *df_loan* dataFrame**

In [None]:
# List the distinct values that occur in the loan_status column.
df_loan["loan_status"].unique()

# ASSUMPTION: 'Charged Off' means DEFAULTED.

In [24]:
# Keep only the 'Charged Off' (treated as defaulted) loans.
# The explicit .copy() makes this an independent frame: a column is assigned
# on it later, and assigning into a boolean-filtered slice of df_loan is a
# chained assignment (SettingWithCopyWarning) with no guaranteed outcome.
df_loan_defaulted = df_loan[df_loan["loan_status"]=='Charged Off'].copy()

In [None]:
# Dimensions (rows, columns) of the defaulted-loans frame.
df_loan_defaulted.shape

**Step 18: Segregate *df_loan_defaulted* dataFrame into Numerical and Categorical**

In [None]:
# Numerical part of the defaulted loans: the float64 and int64 columns only.
df_loan_defaulted_numeric = df_loan_defaulted.select_dtypes(
    include=["float64", "int64"],
)

# Column labels of the numerical part, reused by the plotting steps.
df_loan_defaulted_numeric_features = df_loan_defaulted_numeric.columns
print(df_loan_defaulted_numeric_features)

In [None]:
# Categorical part of the defaulted loans: the object-dtype columns only.
df_loan_defaulted_categorical = df_loan_defaulted.select_dtypes(
    include=["object"],
)

# Column labels of the categorical part, reused by the plotting steps.
df_loan_defaulted_categorical_features = df_loan_defaulted_categorical.columns
print(df_loan_defaulted_categorical_features)

**Step 19: Further cleanup of the values in the DEFAULTED numeric and categorical dataFrames**

In [28]:
# Replace NULL with the MEDIAN value of feature "pub_rec_bankruptcies"
median_pub_rec_bankruptcies = df_loan_defaulted["pub_rec_bankruptcies"].median()

# assign() returns a new frame with the column replaced, so nothing is written
# into a filtered slice of df_loan (which would be a chained assignment).
df_loan_defaulted = df_loan_defaulted.assign(
    pub_rec_bankruptcies=df_loan_defaulted["pub_rec_bankruptcies"].fillna(
        median_pub_rec_bankruptcies
    )
)

# The numeric frame was split off before this imputation; rebuild it so the
# null checks that follow see the filled values rather than a stale snapshot.
df_loan_defaulted_numeric = df_loan_defaulted.select_dtypes(include=['float64','int64'])


In [None]:
# Count nulls per numerical feature on df_loan_defaulted itself.
# df_loan_defaulted_numeric was split off before the median imputation, so
# checking that snapshot would still report the already-filled nulls.
df_loan_defaulted[df_loan_defaulted_numeric_features].isna().sum()

In [None]:
# Count nulls per categorical (object-dtype) feature of the defaulted loans.
df_loan_defaulted_categorical.isna().sum()

In [None]:
# Peek at the first few emp_length values to see their raw format.
df_loan_defaulted_categorical["emp_length"].head()

**Step 20: Histograms for Numerical-Features (Univariate)**

In [None]:
# One histogram per numerical feature, each rendered as its own figure.
for feature in df_loan_defaulted_numeric_features:
    feature_values = df_loan_defaulted[feature]
    sns.histplot(x=feature_values)
    plt.show()

**Step 21: Boxplots for Numerical-Features (Univariate)**

In [None]:
# One boxplot per numerical feature, each rendered as its own figure.
for feature in df_loan_defaulted_numeric_features:
    feature_values = df_loan_defaulted[feature]
    sns.boxplot(x=feature_values)
    plt.show()

**Step 22: Countplots for Categorical-Features (Univariate)**

In [None]:
# One countplot per categorical feature, each rendered as its own figure.
for feature in df_loan_defaulted_categorical_features:
    feature_values = df_loan_defaulted[feature]
    sns.countplot(x=feature_values)
    plt.show()

**Step 23: Scatterplots for Numerical-vs-Numerical Features (Bivariate)**

In [None]:
# Scatter + fitted regression line for every ordered pair of distinct
# numerical features (both x-vs-y and y-vs-x are drawn, as the fitted line
# differs between the two orientations).
for col1 in df_loan_defaulted_numeric_features:
    for col2 in df_loan_defaulted_numeric_features:
        if col1 == col2:
            # A feature plotted against itself carries no information.
            continue
        print("ScatterPlot of ",col1,"and",col2)
        # regplot draws the scatter points itself (styled via scatter_kws), so a
        # separate sns.scatterplot call would only paint every point a second time.
        sns.regplot(
            x=df_loan_defaulted[col1],
            y=df_loan_defaulted[col2],
            scatter_kws={"color":"blue"},
            line_kws={"color":"red"},
        )
        plt.show()