In [1]:
import pandas as pd
from IPython.display import display 

# Columns that are stored with pandas' 'category' dtype after loading.
categorical_columns = [
    'term',
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'loan_status',
    'payment_plan',
    'purpose',
    'employment_length',
]

# Read the transformed loan data, then cast every listed column to 'category'
# in a single astype call (one dtype entry per column name).
loan_payments = pd.read_csv('C:/Users/admin/EDA - Customer Loans/transformed_data.csv')
loan_payments = loan_payments.astype(dict.fromkeys(categorical_columns, 'category'))

class DataFrameInfo:
    """Report basic structural information about a pandas DataFrame.

    The DataFrame passed to the constructor is stored by reference (no copy
    is made), so every method reflects the frame's current contents.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        """Store the DataFrame that the other methods inspect."""
        self.df = df

    def describe_columns(self) -> pd.Series:
        """Return the dtype of every column, indexed by column name."""
        return self.df.dtypes

    def extract_statistical_values(self) -> None:
        """Display summary statistics for every column.

        Renders the table produced by ``describe(include='all')`` via
        ``display``. Nothing is returned; the method exists for its
        display side effect only.
        """
        display(self.df.describe(include='all'))

    def count_distinct_values(self) -> dict:
        """Return ``{column name: number of distinct values}``.

        Only columns whose dtype is ``category`` are included; null entries
        are not counted as a distinct value (``nunique`` default).
        """
        category_frame = self.df.select_dtypes(include='category')
        return {name: self.df[name].nunique() for name in category_frame.columns}

    def print_dataframe_shape(self) -> None:
        """Print the number of rows and columns of the DataFrame."""
        row_count, column_count = self.df.shape
        print(f"Number of rows: {row_count}, Number of columns: {column_count}")

    def generate_null_count(self) -> pd.DataFrame:
        """Return the null count and null percentage of each column.

        The result is indexed by column name and has two columns:
        'Null Count' and 'Percentage Nulls' (0-100).  For a DataFrame with
        no rows the percentage is reported as 0.0 rather than NaN.
        """
        null_counts = self.df.isnull().sum()
        row_count = len(self.df)
        if row_count:
            percentage_nulls = (null_counts / row_count) * 100
        else:
            # Dividing by zero rows would yield NaN; with no rows there are
            # no nulls, so report the (all-zero) counts as 0.0 percent.
            percentage_nulls = null_counts.astype(float)
        return pd.DataFrame({'Null Count': null_counts, 'Percentage Nulls': percentage_nulls})
    
# Wrap the loaded DataFrame so that every summary below goes through DataFrameInfo.
loan_payments_info = DataFrameInfo(loan_payments)

print("1. Describe Columns:")
display(loan_payments_info.describe_columns())

print("\n2. Extract Statistical Values:")
# extract_statistical_values() renders the describe(include='all') table
# itself and returns None, so it is called directly rather than wrapped in
# display().
loan_payments_info.extract_statistical_values()

print("\n3. Count of Distinct Values in Categorical Columns:")
distinct_values_result = loan_payments_info.count_distinct_values()
display(distinct_values_result)

print("\n4. Print DataFrame Shape:")
loan_payments_info.print_dataframe_shape()

print("\n5. Generate Null Count:")
null_info = loan_payments_info.generate_null_count()
display(null_info)


1. Describe Columns:


id                                int64
member_id                         int64
loan_amount                     float64
funded_amount                   float64
funded_amount_inv               float64
term                           category
int_rate                        float64
instalment                      float64
grade                          category
sub_grade                      category
employment_length              category
home_ownership                 category
annual_inc                      float64
verification_status            category
issue_date                       object
loan_status                    category
payment_plan                   category
purpose                        category
dti                             float64
delinq_2yrs                     float64
earliest_credit_line             object
inq_last_6mths                  float64
mths_since_last_delinq          float64
mths_since_last_record          float64
open_accounts                   float64



2. Extract Statistical Values:


Unnamed: 0,id,member_id,loan_amount,funded_amount,funded_amount_inv,term,int_rate,instalment,grade,sub_grade,...,recoveries,collection_recovery_fee,last_payment_date,last_payment_amount,next_payment_date,last_credit_pull_date,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type
count,54231.0,54231.0,54231.0,51224.0,54231.0,49459,49062.0,54231.0,54231,54231,...,54231.0,54231.0,54158,54231.0,21623,54224,54180.0,7499.0,54231.0,54231
unique,,,,,,2,,,7,35,...,,,98,,96,101,,,,1
top,,,,,,36 months,,,B,B3,...,,,2022-01-01,,2022-02-01,2022-01-01,,,,INDIVIDUAL
freq,,,,,,35845,,,16369,3641,...,,,15569,,18136,32136,,,,54231
mean,7621797.0,8655350.0,13333.0761,13229.509117,12952.622979,,13.507328,400.013953,,,...,93.501288,10.859057,,3130.706393,,,0.004208,42.253634,1.0,
std,9571362.0,10312810.0,8082.196709,8019.017599,8099.473527,,4.392893,238.920012,,,...,630.843636,120.19395,,5323.801675,,,0.07099,21.05236,0.0,
min,55521.0,70694.0,500.0,500.0,0.0,,5.42,15.67,,,...,0.0,0.0,,0.0,,,0.0,0.0,1.0,
25%,759433.0,958772.0,7000.0,7000.0,6700.0,,10.37,224.205,,,...,0.0,0.0,,289.79,,,0.0,26.0,1.0,
50%,7084590.0,8709873.0,12000.0,12000.0,11300.0,,13.16,347.15,,,...,0.0,0.0,,562.67,,,0.0,42.0,1.0,
75%,8860616.0,10527140.0,18000.0,18000.0,18000.0,,16.2,527.55,,,...,0.0,0.0,,3738.12,,,0.0,59.0,1.0,



3. Count of Distinct Values in Categorical Columns:


{'term': 2,
 'grade': 7,
 'sub_grade': 35,
 'employment_length': 0,
 'home_ownership': 5,
 'verification_status': 3,
 'loan_status': 9,
 'payment_plan': 2,
 'purpose': 14}


4. Print DataFrame Shape:
Number of rows: 54231, Number of columns: 43

5. Generate Null Count:


Unnamed: 0,Null Count,Percentage Nulls
id,0,0.0
member_id,0,0.0
loan_amount,0,0.0
funded_amount,3007,5.544799
funded_amount_inv,0,0.0
term,4772,8.799395
int_rate,5169,9.531449
instalment,0,0.0
grade,0,0.0
sub_grade,0,0.0
