### Prepare datasets for analysis

## Lending Club Analysis

Import basic libraries

In [1]:
import numpy as np
import pandas as pd

# Lift pandas' display limits so every requested row and column is printed in full
# (a value of None disables the corresponding truncation limit).

for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

Read in the data: the first 1,000 rows of the gzip-compressed CSV

In [2]:
# Load a 1000-row sample of the gzip-compressed CSV into `df`.
df = pd.read_csv(
    '../input/all_small.csv.gz',
    nrows=1000,          # read only the first 1000 data rows
    low_memory=True,     # pandas' default, kept explicit here
    compression='gzip',  # the file is gzip-compressed
)

### Have a first look

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1000, 5)

In [4]:
# Per-column dtype and non-null count, plus index type and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 137079 to 14803731
Data columns (total 5 columns):
amountRequested      1000 non-null float64
reason               998 non-null object
salaryImpactRatio    1000 non-null float64
termOfService        955 non-null object
accepted             1000 non-null int64
dtypes: float64(2), int64(1), object(2)
memory usage: 46.9+ KB


In [5]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,amountRequested,reason,salaryImpactRatio,termOfService,accepted
137079,2700.0,Debt consolidation,2.0,4 years,1
3470514,10000.0,home_improvement,31.35,< 1 year,0
445678,25000.0,Debt consolidation,4.716981,5 years,1


In [6]:
# Summary statistics; by default describe() covers only the numeric columns.
df.describe()

Unnamed: 0,amountRequested,salaryImpactRatio,accepted
count,1000.0,1000.0,1000.0
mean,13263.9,108.402971,0.095
std,14990.318674,1391.81793,0.293362
min,1000.0,-1.0,0.0
25%,4500.0,4.9675,0.0
50%,10000.0,15.795,0.0
75%,20000.0,31.67,0.0
max,300000.0,38810.75,1.0


In [7]:
# Collect the names of all object-dtype columns (here: the free-text/string columns).
# The builtin `object` is compared against instead of the `np.object` alias, which
# was deprecated in NumPy 1.20 and removed in NumPy 1.24 (it now raises AttributeError).
categorical_features = [
    column for column, dtype in df.dtypes.items() if dtype == object
]

# Summarise those columns: count, number of unique values, most frequent value and its frequency.
df[categorical_features].describe()

Unnamed: 0,reason,termOfService
count,998,955
unique,46,11
top,debt_consolidation,< 1 year
freq,327,702


### Amount Requested

In [8]:
# Preview the first five requested amounts.
df['amountRequested'].head(5)

137079       2700.0
3470514     10000.0
445678      25000.0
13791876     8000.0
15571610     6000.0
Name: amountRequested, dtype: float64

### Reason

In [9]:
# For an object column describe() reports count, unique, top (most frequent value) and freq.
df['reason'].describe()

count                    998
unique                    46
top       debt_consolidation
freq                     327
Name: reason, dtype: object

In [10]:
# List every distinct raw value of `reason`; missing values appear as a single nan entry.
df['reason'].unique()

array(['Debt consolidation', 'home_improvement', 'debt_consolidation',
       'other', 'small_business', 'moving', 'credit_card', 'car',
       'medical', 'house', 'Car financing', 'Credit card refinancing',
       'major_purchase', 'Debt Free', 'Other',
       'Employed, good credit, financing a gift for my fiance!',
       'Business', 'vacation', 'Home improvement',
       'Moving and relocation', 'Business Loan', 'Debt Loan',
       'creditcard/vacation', 'Major purchase', 'Vacation', 'Home buying',
       'Business Line Of Credit', 'Medical expenses', 'Credit Cards',
       'consolidate', 'Wedding Ring Loan', 'Credit card debt',
       'renewable_energy', 'Minor consolidation loan',
       'Debt Consolidation', ' Consolidation',
       'Paying off Americanexpress card ', 'Credit Card Refinance',
       'Purchasing Foreclosure', 'Consolidate',
       'Credit card consolidation', nan, ' ', 'my loan',
       'paying all debts', 'Green loan', 'Retail Business Expansion Loan'],
      dtype=object)

### Salary Impact Ratio

In [11]:
# Numeric summary (count, mean, std, min, quartiles, max) of the salary impact ratio.
df['salaryImpactRatio'].describe()

count     1000.000000
mean       108.402971
std       1391.817930
min         -1.000000
25%          4.967500
50%         15.795000
75%         31.670000
max      38810.750000
Name: salaryImpactRatio, dtype: float64

### Term of Service

In [12]:
# List every distinct value of `termOfService`; missing values appear as a single nan entry.
df['termOfService'].unique()

array(['4 years', '< 1 year', '5 years', nan, '1 year', '10+ years',
       '6 years', '2 years', '9 years', '8 years', '3 years', '7 years'],
      dtype=object)