### Prepare datasets for analysis

## Lending Club Analysis

Import basic libraries

In [1]:
import numpy as np
import pandas as pd

# Lift pandas' row and column display caps so printed frames are never truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

Read in the data

In [2]:
# Load the gzip-compressed classification dataset into a single DataFrame.
# low_memory=True makes pandas parse the file in chunks to limit peak memory.
csv_path = '../input/all_classification.csv.gz'
df = pd.read_csv(csv_path, compression='gzip', low_memory=True)

### Have a first look

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(17778273, 5)

In [4]:
# Print the index range, per-column dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17778273 entries, 5939787 to 7178136
Data columns (total 5 columns):
amountRequested      float64
reason               object
salaryImpactRatio    float64
termOfService        object
accepted             int64
dtypes: float64(2), int64(1), object(2)
memory usage: 813.8+ MB


In [5]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Unnamed: 0,amountRequested,reason,salaryImpactRatio,termOfService,accepted
5939787,5000.0,1,10.8,< 1 year,0
5167229,2500.0,1,-1.0,< 1 year,0
3424228,11500.0,1,51.1,< 1 year,0
67318,15000.0,2,32.04,< 1 year,0
6158569,15000.0,1,31.23,< 1 year,0
9873121,5000.0,0,4.32,5 years,0
92773,20000.0,3,310.16,< 1 year,0
16119973,15000.0,1,0.0,< 1 year,0
8308276,10000.0,1,4.43,< 1 year,0
327155,5500.0,2,57.24,< 1 year,0


In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output reports inf for the mean and max of
# salaryImpactRatio and a minimum of -1.0 -- presumably invalid or sentinel
# ratios; confirm and clean them before using this column.
df.describe()

Unnamed: 0,amountRequested,salaryImpactRatio,accepted
count,17778250.0,17778250.0,17778270.0
mean,13410.43,inf,0.09262998
std,15330.65,,0.2899132
min,0.0,-1.0,0.0
25%,5000.0,5.333333,0.0
50%,10000.0,16.14,0.0
75%,20000.0,32.23,0.0
max,1400000.0,inf,1.0


In [7]:
# Collect the names of all object-dtype (string-valued) columns.
# select_dtypes replaces the manual comparison against `np.object`, an alias
# that was deprecated in NumPy 1.20 and removed in NumPy 1.24, where it raises
# AttributeError.
categorical_features = df.select_dtypes(include='object').columns.tolist()

# Count, cardinality and most frequent value for each of those columns.
df[categorical_features].describe()

Unnamed: 0,reason,termOfService
count,17778250,17056098
unique,25,11
top,1,< 1 year
freq,7529872,12613190


In [8]:
# Distinct raw values of the 'reason' column.
# The recorded output shows a mix of numeric-looking strings ('0'..'10'),
# text labels such as 'debt_consolidation', and NaN.
df['reason'].unique()

array(['1', '2', '0', '3', '4', '5', 'debt_consolidation', '9',
       'credit_card', 'other', '6', '7', '8', 'home_improvement', '10',
       'medical', 'major_purchase', 'moving', 'small_business', 'car',
       'vacation', 'wedding', 'house', 'educational', 'renewable_energy',
       nan], dtype=object)

In [10]:
# Integer code for each textual loan purpose. Rare purposes (wedding,
# educational, renewable_energy) are folded into code 0 together with "other".
loan_mapping = {
    "other": 0,
    "debt_consolidation": 1,
    "credit_card": 2,
    "home_improvement": 3,
    "car": 4,
    "major_purchase": 5,
    "moving": 6,
    "medical": 7,
    "small_business": 8,
    "Business Loan": 8,
    "house": 9,
    "vacation": 10,
    "wedding": 0,
    "educational": 0,
    "renewable_energy": 0
}

# The raw column mixes values that are already encoded as numeric strings
# ('0'..'10') with text labels and NaN. Mapping everything through
# loan_mapping would turn the pre-encoded values into NaN and then into 0,
# so values that already parse as numbers are kept and only the rest is mapped.
# NOTE(review): presumably the pre-encoded values follow the same 0-10 scheme
# as loan_mapping -- verify against the data source.
raw_reason = df['reason']
numeric_codes = pd.to_numeric(raw_reason, errors='coerce')

# Plain array so the merge below is positional and needs no index alignment.
label_codes = raw_reason.map(loan_mapping).to_numpy()

# Anything that is neither numeric nor a known label (including NaN) becomes 0.
# Re-running this cell is safe: an already-integer column passes through
# to_numeric unchanged.
df['reason'] = (
    numeric_codes
    .where(numeric_codes.notna(), label_codes)
    .fillna(0)
    .astype(int)
)