In [1]:
import sys
# Install (or upgrade, via -U) deepchecks using the pip module of the interpreter
# that is running this kernel, so the package is installed for this notebook's
# Python rather than for whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install deepchecks -U --quiet

[K     |████████████████████████████████| 3.6 MB 5.0 MB/s 
[K     |████████████████████████████████| 40 kB 4.7 MB/s 
[K     |████████████████████████████████| 128 kB 71.6 MB/s 
[K     |████████████████████████████████| 69 kB 6.1 MB/s 
[K     |████████████████████████████████| 793 kB 71.8 MB/s 
[K     |████████████████████████████████| 9.4 MB 51.9 MB/s 
[K     |████████████████████████████████| 1.6 MB 68.8 MB/s 
[K     |████████████████████████████████| 965 kB 60.4 MB/s 
[K     |████████████████████████████████| 295 kB 76.9 MB/s 
[?25h  Building wheel for PyNomaly (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires ipython~=7.9.0, but you have ipython 7.34.0 which is incompatible.[0m


In [2]:
import pandas as pd
from deepchecks.tabular.datasets.classification import lending_club

# Load the Lending Club sample as one undivided DataFrame; the train/test
# split is performed by hand in the cells below.
data = lending_club.load_data(as_train_test=False, data_format='Dataframe')

# Show the first two rows as a quick sanity check.
data.head(2)

Unnamed: 0,issue_d,sub_grade,term,home_ownership,fico_range_low,total_acc,pub_rec,revol_util,annual_inc,int_rate,...,pub_rec_bankruptcies,addr_state,initial_list_status,fico_range_high,revol_bal,id,open_acc,emp_length,loan_status,time_to_earliest_cr_line
0,2017-06-01,D1,36 months,MORTGAGE,665.0,29.0,0.0,85.0,112600.0,17.09,...,0.0,CO,w,669.0,25779.0,110680237,13.0,2.0,0,794188.8
1,2017-06-01,C2,36 months,RENT,670.0,14.0,0.0,34.8,35000.0,13.59,...,,FL,f,674.0,3798.0,109936186,7.0,2.0,1,470793.6


In [3]:
# `issue_d` is the date column: parse it into pandas datetimes.
data['issue_d'] = pd.to_datetime(data['issue_d'])

# Split on calendar month only (the year is not part of the filter):
# June and July rows form the train set, August rows form the test set.
train_df = data.loc[data['issue_d'].dt.month.isin([6, 7])]
test_df = data.loc[data['issue_d'].dt.month == 8]

Define Lending Club Metadata

In [4]:
# Column roles for the Lending Club data, declared explicitly.
categorical_features = [
    'addr_state',
    'application_type',
    'home_ownership',
    'initial_list_status',
    'purpose',
    'term',
    'verification_status',
    'sub_grade',
]
index_name = 'id'
datetime_name = 'issue_d'
label = 'loan_status'  # 0 is DEFAULT, 1 is OK

Create Dataset

In [5]:
from deepchecks.tabular import Dataset

# Wrap both frames in deepchecks Datasets with the column roles stated explicitly.
# Categorical features can be inferred heuristically, but declaring them avoids
# misclassification. The metadata attributes are optional, although some checks
# run only when specific attributes are declared.
train_ds = Dataset(
    train_df,
    label=label,
    cat_features=categorical_features,
    index_name=index_name,
    datetime_name=datetime_name,
)
test_ds = Dataset(
    test_df,
    label=label,
    cat_features=categorical_features,
    index_name=index_name,
    datetime_name=datetime_name,
)

In [6]:
# Bundle the column metadata into a single dict so that later Dataset
# initializations can reuse it via keyword unpacking.
columns_metadata = dict(
    cat_features=categorical_features,
    index_name=index_name,
    label=label,
    datetime_name=datetime_name,
)

Run the Deepchecks Suite

In [7]:
from deepchecks.tabular.suites import train_test_validation

# Build the train-test validation suite once; it is reused for the re-run below.
validation_suite = train_test_validation()
suite_result = validation_suite.run(train_dataset=train_ds, test_dataset=test_ds)

# The result can also be persisted: suite_result.save_as_html() writes an HTML
# report and suite_result.to_json() exports it as JSON.
suite_result



As you can see in the suite’s results, the Date Train-Test Leakage check failed, indicating a problem in the way we’ve split our data: we filtered by month only, so data from two different years was mixed together and future data leaked into the training dataset. Let’s fix this.

Fix Data

In [8]:
# Keep each split inside a single year: June-August 2017 for training and
# June-August 2018 for testing.
dt_col = data[datetime_name]
train_df = data.loc[(dt_col.dt.year == 2017) & dt_col.dt.month.isin([6, 7, 8])]
test_df = data.loc[(dt_col.dt.year == 2018) & dt_col.dt.month.isin([6, 7, 8])]

In [9]:
from deepchecks.tabular import Dataset

# Rebuild both Datasets from the corrected frames, reusing the shared metadata.
train_ds, test_ds = (
    Dataset(frame, **columns_metadata) for frame in (train_df, test_df)
)

Re-run Validation Suite

In [10]:
# Run the same suite again, this time on the corrected train/test Datasets,
# and render the report.
suite_result = validation_suite.run(train_dataset=train_ds, test_dataset=test_ds)
suite_result.show()

