In [13]:
!pip install -U featuretools

Collecting featuretools
  Downloading featuretools-0.20.0-py3-none-any.whl (287 kB)
Installing collected packages: featuretools
Successfully installed featuretools-0.20.0


In [14]:
import pandas as pd
import numpy as np
import featuretools as ft

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime
import random

# Seed both RNGs used throughout this notebook (`random` and `np.random`) so that
# Restart Kernel -> Run All regenerates the same synthetic data every time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


def make_random_dates(n_dates, first_year=2010, last_year=2020):
    """Return `n_dates` random calendar dates as `datetime` objects.

    Each date gets a year in [first_year, last_year], a month in [1, 12] and a
    day in [1, 28]; capping the day at 28 keeps every (year, month, day)
    combination valid, February included. Values come from the global `random`
    generator, so seed it beforehand for reproducible results.
    """
    return [
        datetime(
            random.choice(range(first_year, last_year + 1)),
            random.choice(range(1, 13)),
            random.choice(range(1, 29)),
        )
        for _ in range(n_dates)
    ]


# Pool of candidate dates that later cells sample from.
rand_dates = make_random_dates(200)

In [4]:
# Build all 40 synthetic client records first and create the frame once.
# Calling `DataFrame.append` inside the loop copies the whole frame on every
# iteration, and the method was removed in pandas 2.0.
client_records = [
    {
        'client_id': np.random.randint(25000, 50000),
        'joined': random.choice(rand_dates),
        'income': np.random.randint(30500, 240000),
        'credit_score': np.random.randint(500, 850),
    }
    for _ in range(40)
]
clients = pd.DataFrame(client_records,
                       columns=['client_id', 'joined', 'income', 'credit_score'])

# NOTE: client_id is drawn at random, so two clients can end up with the same id;
# a later cell drops duplicate client_id rows before the tables are saved.
clients.head()

Unnamed: 0,client_id,joined,income,credit_score
0,41813,2016-07-17,86522,504
1,48481,2011-08-18,41505,709
2,28309,2014-11-26,145550,648
3,46279,2018-03-20,151167,532
4,26617,2015-03-10,239990,538


In [6]:
# Generate 40 synthetic loans per client. Records are collected in a list and
# the frame is built once: `DataFrame.append` in a loop is quadratic and was
# removed in pandas 2.0, as was the `pd.datetime` alias used previously.
loan_records = []

for client in clients['client_id'].unique():
    for _ in range(40):
        # Day is capped at 28 so every year/month/day combination is a valid date.
        loan_start = pd.Timestamp(year=np.random.randint(2010, 2021),
                                  month=np.random.randint(1, 13),
                                  day=np.random.randint(1, 29))

        # Each loan runs for between 500 and 999 days.
        loan_end = loan_start + pd.Timedelta(days=np.random.randint(500, 1000))

        loan_records.append({
            'client_id': client,
            'loan_type': random.choice(['cash', 'credit', 'home', 'other']),
            'loan_amount': np.random.randint(500, 15000),
            'repaid': random.choice([0, 1]),
            # NOTE: drawn with replacement from only 2,000 possible ids, so
            # collisions are expected; a later cell keeps the first row per loan_id.
            'loan_id': np.random.randint(10000, 12000),
            'loan_start': loan_start,
            'loan_end': loan_end,
            # Absolute value of a scaled standard-normal draw, rounded to 2 decimals.
            'rate': round(abs(4 * np.random.randn()), 2),
        })

loans = pd.DataFrame(loan_records,
                     columns=['client_id', 'loan_type', 'loan_amount', 'repaid',
                              'loan_id', 'loan_start', 'loan_end', 'rate'])


  time_created = pd.datetime(np.random.randint(2010, 2021, size = 1)[0],


In [8]:
# Preview the first rows of the generated loans table.
loans.head()

Unnamed: 0,client_id,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate
0,41813,credit,14985,0,10881,2014-02-23,2015-07-27,2.52
1,41813,cash,10259,1,11752,2010-06-20,2012-06-16,2.97
2,41813,cash,14620,0,11191,2016-03-12,2017-11-18,3.68
3,41813,other,10807,0,10279,2013-11-18,2015-04-08,0.64
4,41813,other,13314,1,10769,2011-01-05,2013-08-02,6.44


In [9]:
# Generate between 5 and 9 synthetic payments for every loan. Records are
# collected in a list and the frame is built once: `DataFrame.append` in a loop
# is quadratic (thousands of full-frame copies here) and was removed in pandas 2.0.
# No payment id is generated here because the payments table has no such column.
payment_records = []

for loan_id, loan_start, loan_amount in zip(loans['loan_id'],
                                            loans['loan_start'],
                                            loans['loan_amount']):
    # Payments start no earlier than 30 days after the loan was opened.
    payment_date = loan_start + pd.Timedelta(days=30)

    # Each payment is between a tenth and a fifth of the loan amount.
    lowest_payment = int(loan_amount / 10)
    highest_payment = int(loan_amount / 5)

    for _ in range(np.random.randint(5, 10)):
        # Consecutive payments are spaced 10 to 49 days apart.
        payment_date += pd.Timedelta(days=np.random.randint(10, 50))
        payment_records.append({
            'loan_id': loan_id,
            'payment_amount': np.random.randint(lowest_payment, highest_payment),
            'payment_date': payment_date,
            'missed': random.choice([0, 1]),
        })

payments = pd.DataFrame(payment_records,
                        columns=['loan_id', 'payment_amount',
                                 'payment_date', 'missed'])
    

In [10]:
# Preview the first rows of the generated payments table.
payments.head()

Unnamed: 0,loan_id,payment_amount,payment_date,missed
0,10881,2987,2014-04-22,0
1,10881,2696,2014-05-28,1
2,10881,2924,2014-07-14,0
3,10881,2734,2014-08-05,0
4,10881,2534,2014-08-18,0


In [11]:
# The ids were drawn at random, so keep only the first row seen for each key;
# client_id and loan_id are used as unique entity indexes further down.
clients = clients.drop_duplicates(subset=['client_id'])
loans = loans.drop_duplicates(subset=['loan_id'])

# Write the three tables to CSV files in the working directory, without the
# positional row index.
for csv_name, table in [('clients.csv', clients),
                        ('loans.csv', loans),
                        ('payments.csv', payments)]:
    table.to_csv(csv_name, index=False)

In [15]:
# Reload the generated tables from the CSV files written by the previous cell.
# These are the same working-directory-relative file names passed to `to_csv`,
# not a machine-specific absolute Desktop path, so the notebook runs on any
# machine. Date columns are parsed back into datetimes.
clients = pd.read_csv('clients.csv', parse_dates=['joined'])
loans = pd.read_csv('loans.csv', parse_dates=['loan_start', 'loan_end'])
payments = pd.read_csv('payments.csv', parse_dates=['payment_date'])

In [16]:
# Start an empty entity set named 'clients'; the tables are registered below.
es = ft.EntitySet(id='clients')

In [17]:
# Register the clients table: one row per client, keyed by client_id, with the
# join date as the entity's time index.
es = es.entity_from_dataframe(
    dataframe=clients,
    entity_id='clients',
    index='client_id',
    time_index='joined',
)

In [18]:
# Register the loans table, keyed by loan_id with loan_start as its time index.
# `repaid` holds 0/1 flags, so it is declared categorical explicitly.
es = es.entity_from_dataframe(
    dataframe=loans,
    entity_id='loans',
    index='loan_id',
    time_index='loan_start',
    variable_types={'repaid': ft.variable_types.Categorical},
)

In [19]:
# Register the payments table. It has no identifier column of its own, so
# `make_index=True` asks featuretools to create one named payment_id.
# `missed` holds 0/1 flags, so it is declared categorical explicitly.
es = es.entity_from_dataframe(
    dataframe=payments,
    entity_id='payments',
    make_index=True,
    index='payment_id',
    time_index='payment_date',
    variable_types={'missed': ft.variable_types.Categorical},
)

In [20]:
# Summarise the entity set: its entities with their sizes, and its relationships.
print(es)

Entityset: clients
  Entities:
    clients [Rows: 40, Columns: 4]
    loans [Rows: 1119, Columns: 8]
    payments [Rows: 11141, Columns: 5]
  Relationships:
    No relationships


In [21]:
# Inspect the payments entity: its variables, their inferred types, and its shape.
es['payments']

Entity: payments
  Variables:
    payment_id (dtype: index)
    loan_id (dtype: numeric)
    payment_amount (dtype: numeric)
    payment_date (dtype: datetime_time_index)
    missed (dtype: categorical)
  Shape:
    (Rows: 11141, Columns: 5)

In [22]:
# Link every loan to the client who took it out (one client -> many loans).
r_client_previous = ft.Relationship(
    es['clients']['client_id'],  # parent variable
    es['loans']['client_id'],    # child variable
)
es = es.add_relationship(r_client_previous)

In [23]:
# Link every payment to the loan it was made against (one loan -> many payments).
r_payments = ft.Relationship(
    es['loans']['loan_id'],     # parent variable
    es['payments']['loan_id'],  # child variable
)
es = es.add_relationship(r_payments)

# Display the entity set again to confirm both relationships are registered.
es

Entityset: clients
  Entities:
    clients [Rows: 40, Columns: 4]
    loans [Rows: 1119, Columns: 8]
    payments [Rows: 11141, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id
    payments.loan_id -> loans.loan_id

In [24]:
# Catalogue every primitive featuretools ships with, widen the column display so
# the descriptions are not cut off, and show the first ten aggregation primitives.
primitives = ft.list_primitives()
pd.set_option('display.max_colwidth', 100)
primitives.loc[primitives['type'].eq('aggregation')].head(10)

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description
0,skew,aggregation,False,False,Computes the extent to which a distribution differs from a normal distribution.
1,first,aggregation,False,False,Determines the first value in a list.
2,all,aggregation,True,False,Calculates if all values are 'True' in a list.
3,mode,aggregation,False,False,Determines the most commonly repeated value.
4,avg_time_between,aggregation,False,False,Computes the average number of seconds between consecutive events.
5,entropy,aggregation,False,False,Calculates the entropy for a categorical variable
6,any,aggregation,True,False,Determines if any value is 'True' in a list.
7,n_most_common,aggregation,False,False,Determines the `n` most common elements.
8,sum,aggregation,True,True,"Calculates the total addition, ignoring `NaN`."
9,time_since_first,aggregation,False,False,Calculates the time elapsed since the first datetime (in seconds).


In [25]:
# Show the first ten transform primitives from the catalogue.
primitives.loc[primitives['type'].eq('transform')].head(10)

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description
22,time_since_previous,transform,False,False,Compute the time since the previous entry in a list.
23,cum_max,transform,False,False,Calculates the cumulative maximum.
24,and,transform,True,True,Element-wise logical AND of two lists.
25,add_numeric_scalar,transform,True,True,Add a scalar to each value in the list.
26,add_numeric,transform,True,True,Element-wise addition of two lists.
27,not,transform,True,True,Negates a boolean value.
28,weekday,transform,True,True,Determines the day of the week from a datetime.
29,less_than_equal_to,transform,True,True,Determines if values in one list are less than or equal to another list.
30,divide_by_feature,transform,True,True,Divide a scalar by each value in the list.
31,haversine,transform,False,False,Calculates the approximate haversine distance between two LatLong


In [27]:
# Run Deep Feature Synthesis with the default primitives, producing one row of
# features per client and stacking primitives up to two levels deep.
features, feature_names = ft.dfs(
    target_entity='clients',
    entityset=es,
    max_depth=2,
)

In [28]:
# Show a bounded preview of the feature matrix rather than dumping the whole
# frame into the notebook output.
features.head(10)

Unnamed: 0_level_0,income,credit_score,COUNT(loans),MAX(loans.loan_amount),MAX(loans.rate),MEAN(loans.loan_amount),MEAN(loans.rate),MIN(loans.loan_amount),MIN(loans.rate),MODE(loans.loan_type),...,MODE(payments.loans.repaid),NUM_UNIQUE(payments.loans.client_id),NUM_UNIQUE(payments.loans.loan_type),NUM_UNIQUE(payments.loans.repaid),SKEW(payments.loans.loan_amount),SKEW(payments.loans.rate),STD(payments.loans.loan_amount),STD(payments.loans.rate),SUM(payments.loans.loan_amount),SUM(payments.loans.rate)
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34974,63930,548,17,14460,5.41,7282.882353,2.158235,1120,0.37,credit,...,1,1,4,2,0.19202,0.945537,4625.16855,1.373119,1079619,282.22
41510,49744,716,19,13226,10.05,6427.526316,3.495789,1164,0.09,other,...,0,1,4,2,0.213491,0.840473,3383.564774,2.737305,1086447,568.36
45949,172388,534,31,14547,7.46,7126.0,2.643548,815,0.01,home,...,1,1,4,2,0.269074,0.754471,3859.879437,1.951966,1941046,810.43
27112,216687,701,25,13386,11.12,6626.44,4.1548,1047,0.54,cash,...,0,1,4,2,0.175121,0.708472,4353.573055,2.910543,1282122,756.96
44583,234816,514,22,13549,7.07,6049.545455,2.89,834,0.25,credit,...,1,1,4,2,0.428204,0.38947,3709.74877,1.688805,916707,439.88
48481,41505,709,37,14676,9.96,7726.972973,3.483243,553,0.15,cash,...,0,1,4,2,-0.148486,0.862642,4389.568443,2.260566,3398725,1477.22
42415,63454,717,19,14717,8.7,7805.631579,2.963158,1268,0.28,credit,...,1,1,4,2,0.118909,1.07402,4635.821559,2.46406,1021969,375.74
32898,37124,751,32,13804,6.34,7176.21875,2.66125,681,0.19,cash,...,1,1,4,2,0.317647,0.589542,4026.946312,1.627403,2097399,876.71
27411,99179,603,24,14829,9.2,7211.208333,2.408333,873,0.08,other,...,1,1,4,2,0.017581,2.078315,4779.155585,1.627789,1439264,408.37
27653,148849,610,32,14125,10.64,6965.75,3.415312,625,0.03,other,...,1,1,4,2,-0.008418,0.687682,4267.327258,2.582033,1869705,999.83


In [30]:
# (rows, columns) of the feature matrix: one row per client, one column per feature.
features.shape

(39, 110)