In [1]:
import pandas as pd
import transformed_data as data

In [2]:
loans = data.loans
print(loans.shape)
print(loans.info())

(54231, 43)
<class 'pandas.core.frame.DataFrame'>
Index: 54231 entries, 0 to 54230
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   id                           54231 non-null  int64         
 1   member_id                    54231 non-null  int64         
 2   loan_amount                  54231 non-null  int64         
 3   funded_amount                51224 non-null  float64       
 4   funded_amount_inv            54231 non-null  float64       
 5   term_months                  49459 non-null  float64       
 6   int_rate                     49062 non-null  float64       
 7   instalment                   54231 non-null  float64       
 8   grade                        54231 non-null  category      
 9   sub_grade                    54231 non-null  category      
 10  employment_years             52113 non-null  category      
 11  home_ownership               54231

## Possible class methods:
- Describe all columns in the DataFrame to check their data types - dtypes method
- Extract statistical values: median, standard deviation and mean from the columns and the DataFrame
- Count distinct values in categorical columns - distinct method
- Print out the shape of the DataFrame - shape method
- Generate a count/percentage count of NULL values in each column - percent_na method
- Any other methods you may find useful

In [4]:
class DataFrameInfo():
    def __init__(self, data):
        self.data = data

    def percent_na(self):
        pd.set_option("display.precision", 2)
        missing_values = ((self.data.isnull().mean()).round(4)) * 100
        missing_values_only = missing_values[missing_values > 0]
        print("Column               Missing values (%) \n", missing_values_only)

    def shape(self):
        shape = self.data.shape
        print(f"The shape of the dataframe or array is: {shape}")

    def distinct(self, column):
        distinct_cats = list(self.data[column].dropna().unique())
        print("Number of categories (excluding nulls):", len(list(distinct_cats)))
        print("Distinct categories:", *distinct_cats, sep='\n')

    def dtypes(self):
        dtypes = self.data.dtypes
        print("Column                        Data type \n", dtypes)

    def df_summary_stats(self):
        pd.set_option("display.precision", 2)
        df_stats = self.data.dropna().describe()
        print(df_stats)

    def column_summary_stats(self, column):
        pd.set_option("display.precision", 2)
        col_stats = self.data[column].dropna().describe()
        print(col_stats)

    def column_stats(self, column):
        pd.set_option("display.precision", 4)
        mean = round(self.data[column].mean(), 3)
        median = round(self.data[column].median(), 3)
        mode = round(self.data[column].mode(), 3)
        stdev = round(self.data[column].std(), 3)
        print(f"{column}: \n mean: {mean} \n median: {median} \n mode: {mode.iloc[0] if len(list(mode)) == 1 else list(mode)} \n standard deviation: {stdev}")

In [5]:
# Create an instance of the DataFrameInfo class.
loans_info = DataFrameInfo(loans)

In [6]:
# Calculate the percentage of missing values in each column, only display results with missing values. 
loans_info.percent_na()

Column               Missing values (%) 
 funded_amount                   5.54
term_months                     8.80
int_rate                        9.53
employment_years                3.91
mths_since_last_delinq         57.17
mths_since_last_record         88.60
last_payment_date               0.13
next_payment_date              60.13
last_credit_pull_date           0.01
collections_12_mths_ex_med      0.09
mths_since_last_major_derog    86.17
dtype: float64


In [162]:
# Print the shape of the dataframe.
loans_info.shape()

The shape of the dataframe or array is: (54231, 43)


In [98]:
# Print the number of distinct categories in a series, and list the categories. 
loans_info.distinct("employment_years")

Number of categories (excluding nulls): 11
Distinct categories:
5 years
9 years
8 years
1 year
10+ years
< 1 year
7 years
3 years
4 years
6 years
2 years


In [143]:
# Find mean, median, mode and stdev of column.
loans_info.column_stats("funded_amount")

funded_amount: 
 mean: 13229.509 
 median: 12000.0 
 mode: 10000.0 
 standard deviation: 8019.018


In [171]:
# Print dataframe summary stats.
loans_info.df_summary_stats()

             id  member_id  loan_amount  funded_amount  funded_amount_inv  \
count  4.23e+02   4.23e+02        423.0          423.0             423.00   
mean   1.89e+07   2.08e+07      12107.8        12107.8           12104.08   
std    1.43e+07   1.50e+07       6919.9         6919.9            6918.39   
min    5.64e+06   1.07e+06       1000.0         1000.0            1000.00   
25%    7.73e+06   9.36e+06       7200.0         7200.0            7200.00   
50%    9.23e+06   1.10e+07      10075.0        10075.0           10075.00   
75%    3.80e+07   4.08e+07      15000.0        15000.0           15000.00   
max    3.86e+07   4.14e+07      35000.0        35000.0           35000.00   

       term_months  int_rate  instalment  annual_inc     dti  ...  \
count       423.00    423.00      423.00      423.00  423.00  ...   
mean         41.96     14.79      376.24    74919.60   16.03  ...   
std          10.38      3.89      209.09    42489.46    8.05  ...   
min          36.00      6.49  

In [172]:
# Print column summary stats.
loans_info.column_summary_stats("loan_amount")

count    54231.00
mean     13333.08
std       8082.20
min        500.00
25%       7000.00
50%      12000.00
75%      18000.00
max      35000.00
Name: loan_amount, dtype: float64


In [108]:
# Print dataframe datatypes.
loans_info.dtypes()

Column                        Data type 
 id                                      int64
member_id                               int64
loan_amount                             int64
funded_amount                         float64
funded_amount_inv                     float64
term_months                           float64
int_rate                              float64
instalment                            float64
grade                                category
sub_grade                            category
employment_years                     category
home_ownership                       category
annual_inc                            float64
verification_status                    object
issue_date                             object
loan_status                            object
payment_plan                         category
purpose                                object
dti                                   float64
delinq_2yrs                             int64
earliest_credit_line                  