_Lambda School Data Science_

# Make features

Objectives
-  understand the purpose of feature engineering
-  work with strings in pandas
- work with dates and times in pandas

Links
- [Feature Engineering](https://en.wikipedia.org/wiki/Feature_engineering)
- Python Data Science Handbook
  - [Chapter 3.10](https://jakevdp.github.io/PythonDataScienceHandbook/03.10-working-with-strings.html), Vectorized String Operations
  - [Chapter 3.11](https://jakevdp.github.io/PythonDataScienceHandbook/03.11-working-with-time-series.html), Working with Time Series

## Get LendingClub data

In [None]:
!wget https://resources.lendingclub.com/LoanStats_2018Q3.csv.zip

--2019-01-17 10:20:52--  https://resources.lendingclub.com/LoanStats_2018Q3.csv.zip
Resolving resources.lendingclub.com (resources.lendingclub.com)... 64.48.1.20
Connecting to resources.lendingclub.com (resources.lendingclub.com)|64.48.1.20|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘LoanStats_2018Q3.csv.zip.1’

.zip.1                  [    <=>             ]  12.78M  2.10MB/s               

In [1]:
!unzip LoanStats_2018Q3.csv.zip

unzip:  cannot find or open LoanStats_2018Q3.csv.zip, LoanStats_2018Q3.csv.zip.zip or LoanStats_2018Q3.csv.zip.ZIP.


In [2]:
!head LoanStats_2018Q3.csv

Notes offered by Prospectus (https://www.lendingclub.com/info/prospectus.action)
"id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade","sub_grade","emp_title","emp_length","home_ownership","annual_inc","verification_status","issue_d","loan_status","pymnt_plan","url","desc","purpose","title","zip_code","addr_state","dti","delinq_2yrs","earliest_cr_line","inq_last_6mths","mths_since_last_delinq","mths_since_last_record","open_acc","pub_rec","revol_bal","revol_util","total_acc","initial_list_status","out_prncp","out_prncp_inv","total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries","collection_recovery_fee","last_pymnt_d","last_pymnt_amnt","next_pymnt_d","last_credit_pull_d","collections_12_mths_ex_med","mths_since_last_major_derog","policy_code","application_type","annual_inc_joint","dti_joint","verification_status_joint","acc_now_delinq","tot_coll_amt","tot_cur_bal","open_acc_6m","open_act_il","o

## Load LendingClub data

In [3]:
import pandas as pd
df = pd.read_csv('LoanStats_2018Q3.csv', skiprows=1,skipfooter=2,engine='python')

df.shape

(128194, 145)

In [4]:
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
df.head().T

Unnamed: 0,0,1,2,3,4
id,,,,,
member_id,,,,,
loan_amnt,20000,25000,30000,6000,10650
funded_amnt,20000,25000,30000,6000,10650
funded_amnt_inv,20000,25000,30000,6000,10650
term,60 months,60 months,36 months,36 months,36 months
int_rate,17.97%,13.56%,18.94%,7.84%,7.84%
installment,507.55,576.02,1098.78,187.58,332.95
grade,D,C,D,A,A
sub_grade,D1,C1,D2,A4,A4


In [5]:
df['hardship_payoff_balance_amount'].isnull().sum() / len(df)

0.999898591197716

## Work with strings

In [6]:
import numpy as np

def all_numeric(df):
    return all((df.dtypes==np.number) |
               (df.dtypes==bool))

def no_nulls(df):
    return not any(df.isnull().sum())

def ready_for_sklearn(df):
    return all_numeric(df) and no_nulls(df)

In [7]:
ready_for_sklearn(df)

False

In [9]:
all_numeric(df)

False

In [10]:
df.select_dtypes('object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128194 entries, 0 to 128193
Data columns (total 37 columns):
term                         128194 non-null object
int_rate                     128194 non-null object
grade                        128194 non-null object
sub_grade                    128194 non-null object
emp_title                    114757 non-null object
emp_length                   117807 non-null object
home_ownership               128194 non-null object
verification_status          128194 non-null object
issue_d                      128194 non-null object
loan_status                  128194 non-null object
pymnt_plan                   128194 non-null object
purpose                      128194 non-null object
title                        128194 non-null object
zip_code                     128194 non-null object
addr_state                   128194 non-null object
earliest_cr_line             128194 non-null object
revol_util                   128065 non-null object
initi

In [13]:
df.int_rate.head().values

array([' 17.97%', ' 13.56%', ' 18.94%', '  7.84%', '  7.84%'],
      dtype=object)

In [162]:
def remove_percent(string):
    return float(string.strip('%'))

In [15]:
df['int_rate'] = df['int_rate'].apply(remove_percent)

In [16]:
df['int_rate'].head()

0    17.97
1    13.56
2    18.94
3     7.84
4     7.84
Name: int_rate, dtype: float64

In [17]:
df['emp_title'].value_counts().head(20)

Teacher               2294
Manager               2075
Owner                 1231
Driver                1089
Registered Nurse       944
Supervisor             810
RN                     757
Sales                  726
Project Manager        637
General Manager        548
Office Manager         542
Director               482
owner                  398
Engineer               383
Truck Driver           367
Operations Manager     366
President              350
Sales Manager          323
Supervisor             321
Server                 319
Name: emp_title, dtype: int64

In [18]:
df['emp_title'].value_counts().head(20).index

Index(['Teacher', 'Manager', 'Owner', 'Driver', 'Registered Nurse',
       'Supervisor', 'RN', 'Sales', 'Project Manager', 'General Manager',
       'Office Manager', 'Director', 'owner', 'Engineer', 'Truck Driver',
       'Operations Manager', 'President', 'Sales Manager', 'Supervisor ',
       'Server'],
      dtype='object')

In [19]:
df['emp_title'].isnull().sum()

13437

In [26]:
examples = ['owner', 'Supervisor', 'Project manager',
           np.nan]

def clean_title(x):
    if isinstance(x,str):
        return x.strip().title()
    else:
        return 'Unknown'

for example in examples:
    print(clean_title(example))

Owner
Supervisor
Project Manager
Unknown


In [27]:
df['emp_title'] = df['emp_title'].apply(clean_title)

In [28]:
df['emp_title'].value_counts().head(20)

Unknown                     13437
Teacher                      2843
Manager                      2749
Owner                        1856
Driver                       1498
Registered Nurse             1386
Supervisor                   1345
Sales                         980
Truck Driver                  921
Rn                            905
Office Manager                846
Project Manager               835
General Manager               809
Director                      585
Operations Manager            516
Sales Manager                 510
Engineer                      474
Administrative Assistant      466
Store Manager                 466
President                     464
Name: emp_title, dtype: int64

In [29]:
df['emp_title_manager'] = df['emp_title'].str.contains('Manager')

In [30]:
df['emp_title_manager'].value_counts()

False    109498
True      18696
Name: emp_title_manager, dtype: int64

In [31]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,emp_title_manager
0,,,20000,20000,20000,60 months,17.97,507.55,D,D1,Motor Vehicle Operator,7 years,RENT,68000.0,Source Verified,Sep-2018,Current,n,,,debt_consolidation,Debt consolidation,254xx,WV,15.8,0,Feb-2009,1,30.0,118.0,11,1,13483,69.1%,26,w,19580.78,19580.78,975.17,975.17,419.22,555.95,0.0,0.0,0.0,Dec-2018,507.55,Jan-2019,Dec-2018,0,,1,Individual,,,,0,0,194908,0,2,0,2,17.0,45299,93.0,0,1,9815,86.0,19500,1,4,2,5,19491.0,2185.0,81.8,0,0,115.0,94,13,7,4,25.0,,2.0,30.0,0,1,3,2,4,11,8,10,3,11,0.0,0,0,1,92.3,50.0,1,0,205625,58782,12000,48733,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False
1,,,25000,25000,25000,60 months,13.56,576.02,C,C1,Firefighter,7 years,MORTGAGE,61000.0,Verified,Sep-2018,Current,n,,,major_purchase,Major purchase,770xx,TX,16.74,0,Jan-2009,0,,,5,0,6299,48.5%,14,w,24109.45,24109.45,1567.98,1567.98,890.55,677.43,0.0,0.0,0.0,Dec-2018,576.02,Jan-2019,Dec-2018,0,,1,Individual,,,,0,0,204007,1,2,0,1,19.0,13862,37.0,0,0,0,40.0,13000,0,7,4,2,40801.0,1500.0,0.0,0,0,54.0,116,26,3,1,74.0,,4.0,,0,0,1,1,2,5,2,8,1,5,0.0,0,0,1,100.0,0.0,0,0,234573,20161,1500,37273,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False
2,,,30000,30000,30000,36 months,18.94,1098.78,D,D2,Unknown,< 1 year,RENT,100000.0,Source Verified,Sep-2018,Current,n,,,debt_consolidation,Debt consolidation,300xx,GA,16.07,0,Mar-2008,1,,114.0,6,1,14574,70.1%,9,w,28739.57,28739.57,2134.43,2134.43,1260.43,874.0,0.0,0.0,0.0,Dec-2018,1098.78,Jan-2019,Dec-2018,0,,1,Individual,,,,0,0,44048,1,2,1,1,1.0,29474,69.0,1,2,8521,69.0,20800,1,0,2,3,8810.0,5226.0,73.6,0,0,126.0,104,11,1,0,17.0,,1.0,,0,2,2,2,2,4,4,5,2,6,0.0,0,0,2,100.0,0.0,1,0,63636,44048,19800,42836,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False
3,,,6000,6000,6000,36 months,7.84,187.58,A,A4,Unknown,,RENT,30000.0,Not Verified,Sep-2018,Current,n,,,debt_consolidation,Debt consolidation,923xx,CA,5.44,0,Apr-2000,0,,104.0,8,1,5936,34.5%,11,w,5702.27,5702.27,369.93,369.93,297.73,72.2,0.0,0.0,0.0,Dec-2018,187.58,Jan-2019,Dec-2018,0,,1,Individual,,,,0,350,5936,0,0,0,1,23.0,0,,1,4,2913,35.0,17200,2,0,0,5,848.0,7698.0,35.9,0,0,139.0,221,7,7,0,7.0,,18.0,,0,2,3,4,4,2,8,9,3,8,0.0,0,0,1,100.0,33.3,1,0,17200,5936,12000,0,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False
4,,,10650,10650,10650,36 months,7.84,332.95,A,A4,Unknown,,RENT,28000.0,Verified,Sep-2018,Current,n,,,medical,Medical expenses,430xx,OH,16.89,0,Nov-2002,0,,,3,0,37,0.3%,3,w,10121.54,10121.54,656.62,656.62,528.46,128.16,0.0,0.0,0.0,Dec-2018,332.95,Jan-2019,Dec-2018,0,,1,Joint App,43000.0,14.01,Source Verified,0,0,18254,0,1,0,1,16.0,18217,81.0,0,0,0,54.0,11500,1,1,0,1,6085.0,,,0,0,16.0,190,113,16,0,,,16.0,,0,0,1,0,0,1,2,2,2,3,0.0,0,0,0,100.0,,0,0,33876,18254,0,22376,2024.0,Oct-1996,0.0,0.0,8.0,18.7,1.0,9.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False


In [33]:
df.shape

(128194, 146)

## Work with dates

In [34]:
df['issue_d'].head().values

array(['Sep-2018', 'Sep-2018', 'Sep-2018', 'Sep-2018', 'Sep-2018'],
      dtype=object)

In [35]:
df['issue_d'] = pd.to_datetime(df['issue_d'],
                               infer_datetime_format=True)

In [36]:
df['issue_d'].head().values

array(['2018-09-01T00:00:00.000000000', '2018-09-01T00:00:00.000000000',
       '2018-09-01T00:00:00.000000000', '2018-09-01T00:00:00.000000000',
       '2018-09-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [37]:
df['issue_d'].describe()

count                  128194
unique                      3
top       2018-08-01 00:00:00
freq                    46079
first     2018-07-01 00:00:00
last      2018-09-01 00:00:00
Name: issue_d, dtype: object

In [38]:
df['issue_year'] = df['issue_d'].dt.year
df['issue_month']= df['issue_d'].dt.month

In [39]:
df['issue_month'].sample(n=10).values

array([8, 7, 9, 9, 8, 9, 8, 7, 7, 7])

In [40]:
df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'],
                                        infer_datetime_format=True)

In [41]:
df['days_from_earliest_credit_to_issue'] = (
    df['issue_d'] - df['earliest_cr_line']).dt.days

In [43]:
df['days_from_earliest_credit_to_issue'].head().values

array([3499, 3530, 3836, 6727, 5783])

In [44]:
[col for col in df if col.endswith('_d')]

['issue_d', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d']

In [45]:
for col in ['last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d']:
    df[col] = pd.to_datetime(df[col], 
                             infer_datetime_format=True)

In [48]:
df['loan_status'].value_counts()

Current               121082
Fully Paid              4786
In Grace Period          948
Late (31-120 days)       920
Late (16-30 days)        348
Charged Off              110
Name: loan_status, dtype: int64

# ASSIGNMENT

Replicate the lesson code.

Convert the term column from string to integer.

Make a column named loan_status_is_great. It should contain the integer 1 if loan_status is "Current" or "Fully Paid." Else it should contain the integer 0.

Make last_pymnt_d_month and last_pymnt_d_year columns.

In [49]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,emp_title_manager,issue_year,issue_month,days_from_earliest_credit_to_issue
0,,,20000,20000,20000,60 months,17.97,507.55,D,D1,Motor Vehicle Operator,7 years,RENT,68000.0,Source Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,254xx,WV,15.8,0,2009-02-01,1,30.0,118.0,11,1,13483,69.1%,26,w,19580.78,19580.78,975.17,975.17,419.22,555.95,0.0,0.0,0.0,2018-12-01,507.55,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,194908,0,2,0,2,17.0,45299,93.0,0,1,9815,86.0,19500,1,4,2,5,19491.0,2185.0,81.8,0,0,115.0,94,13,7,4,25.0,,2.0,30.0,0,1,3,2,4,11,8,10,3,11,0.0,0,0,1,92.3,50.0,1,0,205625,58782,12000,48733,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False,2018,9,3499
1,,,25000,25000,25000,60 months,13.56,576.02,C,C1,Firefighter,7 years,MORTGAGE,61000.0,Verified,2018-09-01,Current,n,,,major_purchase,Major purchase,770xx,TX,16.74,0,2009-01-01,0,,,5,0,6299,48.5%,14,w,24109.45,24109.45,1567.98,1567.98,890.55,677.43,0.0,0.0,0.0,2018-12-01,576.02,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,204007,1,2,0,1,19.0,13862,37.0,0,0,0,40.0,13000,0,7,4,2,40801.0,1500.0,0.0,0,0,54.0,116,26,3,1,74.0,,4.0,,0,0,1,1,2,5,2,8,1,5,0.0,0,0,1,100.0,0.0,0,0,234573,20161,1500,37273,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,3530
2,,,30000,30000,30000,36 months,18.94,1098.78,D,D2,Unknown,< 1 year,RENT,100000.0,Source Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,300xx,GA,16.07,0,2008-03-01,1,,114.0,6,1,14574,70.1%,9,w,28739.57,28739.57,2134.43,2134.43,1260.43,874.0,0.0,0.0,0.0,2018-12-01,1098.78,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,44048,1,2,1,1,1.0,29474,69.0,1,2,8521,69.0,20800,1,0,2,3,8810.0,5226.0,73.6,0,0,126.0,104,11,1,0,17.0,,1.0,,0,2,2,2,2,4,4,5,2,6,0.0,0,0,2,100.0,0.0,1,0,63636,44048,19800,42836,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,3836
3,,,6000,6000,6000,36 months,7.84,187.58,A,A4,Unknown,,RENT,30000.0,Not Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,923xx,CA,5.44,0,2000-04-01,0,,104.0,8,1,5936,34.5%,11,w,5702.27,5702.27,369.93,369.93,297.73,72.2,0.0,0.0,0.0,2018-12-01,187.58,2019-01-01,2018-12-01,0,,1,Individual,,,,0,350,5936,0,0,0,1,23.0,0,,1,4,2913,35.0,17200,2,0,0,5,848.0,7698.0,35.9,0,0,139.0,221,7,7,0,7.0,,18.0,,0,2,3,4,4,2,8,9,3,8,0.0,0,0,1,100.0,33.3,1,0,17200,5936,12000,0,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False,2018,9,6727
4,,,10650,10650,10650,36 months,7.84,332.95,A,A4,Unknown,,RENT,28000.0,Verified,2018-09-01,Current,n,,,medical,Medical expenses,430xx,OH,16.89,0,2002-11-01,0,,,3,0,37,0.3%,3,w,10121.54,10121.54,656.62,656.62,528.46,128.16,0.0,0.0,0.0,2018-12-01,332.95,2019-01-01,2018-12-01,0,,1,Joint App,43000.0,14.01,Source Verified,0,0,18254,0,1,0,1,16.0,18217,81.0,0,0,0,54.0,11500,1,1,0,1,6085.0,,,0,0,16.0,190,113,16,0,,,16.0,,0,0,1,0,0,1,2,2,2,3,0.0,0,0,0,100.0,,0,0,33876,18254,0,22376,2024.0,Oct-1996,0.0,0.0,8.0,18.7,1.0,9.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,5783


In [50]:
## term from string to int

df['term'].isnull().sum()

0

In [51]:
def remove_months(string):
    return float(string.strip('months'))
df['term'] = df['term'].apply(remove_months)
df['term'].head()

0    60.0
1    60.0
2    36.0
3    36.0
4    36.0
Name: term, dtype: float64

In [89]:
df['loan_status'].value_counts()

Current               121082
Fully Paid              4786
In Grace Period          948
Late (31-120 days)       920
Late (16-30 days)        348
Charged Off              110
Name: loan_status, dtype: int64

In [90]:
df_copy.shape

(128194, 150)

In [70]:
## Make a column named loan_status_is_great. It should contain the integer 1
## if loan_status is Current" or "Fully Paid." 
## Else it should contain the integer 0.

#first approach lmao
# def is_great(status):

# for status in df_copy:
#     if df_copy.contains('Current', case=False):
#         return status.astype(int)
#     if df_copy.contains('Fully Paid', case=False, regex=True):
#         return status.astype(int)
#     else:
#         return status.astype(int)

In [100]:
df_copy = df.copy()

In [108]:
df_copy['loan_status'].replace(to_replace='Fully Paid', value='Current', inplace=True)

In [109]:
df_copy['loan_status'].value_counts()

Current               125868
In Grace Period          948
Late (31-120 days)       920
Late (16-30 days)        348
Charged Off              110
Name: loan_status, dtype: int64

In [114]:
df['loan_status_is_great'] = df_copy['loan_status'].str.contains('Current').astype(int)
# df_copy['loan_status_is_great'] =

df['loan_status_is_great'].value_counts()

1    125868
0      2326
Name: loan_status_is_great, dtype: int64

In [115]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,emp_title_manager,issue_year,issue_month,days_from_earliest_credit_to_issue,loan_status_is_great
0,,,20000,20000,20000,60.0,17.97,507.55,D,D1,Motor Vehicle Operator,7 years,RENT,68000.0,Source Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,254xx,WV,15.8,0,2009-02-01,1,30.0,118.0,11,1,13483,69.1%,26,w,19580.78,19580.78,975.17,975.17,419.22,555.95,0.0,0.0,0.0,2018-12-01,507.55,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,194908,0,2,0,2,17.0,45299,93.0,0,1,9815,86.0,19500,1,4,2,5,19491.0,2185.0,81.8,0,0,115.0,94,13,7,4,25.0,,2.0,30.0,0,1,3,2,4,11,8,10,3,11,0.0,0,0,1,92.3,50.0,1,0,205625,58782,12000,48733,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False,2018,9,3499,1
1,,,25000,25000,25000,60.0,13.56,576.02,C,C1,Firefighter,7 years,MORTGAGE,61000.0,Verified,2018-09-01,Current,n,,,major_purchase,Major purchase,770xx,TX,16.74,0,2009-01-01,0,,,5,0,6299,48.5%,14,w,24109.45,24109.45,1567.98,1567.98,890.55,677.43,0.0,0.0,0.0,2018-12-01,576.02,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,204007,1,2,0,1,19.0,13862,37.0,0,0,0,40.0,13000,0,7,4,2,40801.0,1500.0,0.0,0,0,54.0,116,26,3,1,74.0,,4.0,,0,0,1,1,2,5,2,8,1,5,0.0,0,0,1,100.0,0.0,0,0,234573,20161,1500,37273,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,3530,1
2,,,30000,30000,30000,36.0,18.94,1098.78,D,D2,Unknown,< 1 year,RENT,100000.0,Source Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,300xx,GA,16.07,0,2008-03-01,1,,114.0,6,1,14574,70.1%,9,w,28739.57,28739.57,2134.43,2134.43,1260.43,874.0,0.0,0.0,0.0,2018-12-01,1098.78,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,44048,1,2,1,1,1.0,29474,69.0,1,2,8521,69.0,20800,1,0,2,3,8810.0,5226.0,73.6,0,0,126.0,104,11,1,0,17.0,,1.0,,0,2,2,2,2,4,4,5,2,6,0.0,0,0,2,100.0,0.0,1,0,63636,44048,19800,42836,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,3836,1
3,,,6000,6000,6000,36.0,7.84,187.58,A,A4,Unknown,,RENT,30000.0,Not Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,923xx,CA,5.44,0,2000-04-01,0,,104.0,8,1,5936,34.5%,11,w,5702.27,5702.27,369.93,369.93,297.73,72.2,0.0,0.0,0.0,2018-12-01,187.58,2019-01-01,2018-12-01,0,,1,Individual,,,,0,350,5936,0,0,0,1,23.0,0,,1,4,2913,35.0,17200,2,0,0,5,848.0,7698.0,35.9,0,0,139.0,221,7,7,0,7.0,,18.0,,0,2,3,4,4,2,8,9,3,8,0.0,0,0,1,100.0,33.3,1,0,17200,5936,12000,0,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False,2018,9,6727,1
4,,,10650,10650,10650,36.0,7.84,332.95,A,A4,Unknown,,RENT,28000.0,Verified,2018-09-01,Current,n,,,medical,Medical expenses,430xx,OH,16.89,0,2002-11-01,0,,,3,0,37,0.3%,3,w,10121.54,10121.54,656.62,656.62,528.46,128.16,0.0,0.0,0.0,2018-12-01,332.95,2019-01-01,2018-12-01,0,,1,Joint App,43000.0,14.01,Source Verified,0,0,18254,0,1,0,1,16.0,18217,81.0,0,0,0,54.0,11500,1,1,0,1,6085.0,,,0,0,16.0,190,113,16,0,,,16.0,,0,0,1,0,0,1,2,2,2,3,0.0,0,0,0,100.0,,0,0,33876,18254,0,22376,2024.0,Oct-1996,0.0,0.0,8.0,18.7,1.0,9.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,5783,1


In [118]:
### Make last_pymnt_d_month and last_pymnt_d_year columns. 
df['last_pymnt_d'].describe()

count                  128048
unique                      6
top       2018-12-01 00:00:00
freq                   116465
first     2018-07-01 00:00:00
last      2018-12-01 00:00:00
Name: last_pymnt_d, dtype: object

In [125]:
df['last_pymnt_d'] = pd.to_datetime(df['last_pymnt_d'], format='%Y/%m/%d') 

In [126]:
df['last_pymnt_d'].describe()

count                  128048
unique                      6
top       2018-12-01 00:00:00
freq                   116465
first     2018-07-01 00:00:00
last      2018-12-01 00:00:00
Name: last_pymnt_d, dtype: object

In [123]:
import datetime as dt

In [133]:
df['last_pymnt_d_month'] = df['last_pymnt_d'].dt.month

In [135]:
df['last_pymnt_d_month']

0         12.0
1         12.0
2         12.0
3         12.0
4         12.0
5         12.0
6         12.0
7         12.0
8         12.0
9         12.0
10        12.0
11        12.0
12        12.0
13        12.0
14        12.0
15        12.0
16        12.0
17        12.0
18        12.0
19        12.0
20        11.0
21        12.0
22        12.0
23        12.0
24        12.0
25        12.0
26        12.0
27        12.0
28        12.0
29        12.0
30        12.0
31        12.0
32        12.0
33        12.0
34        12.0
35        12.0
36        12.0
37        12.0
38        12.0
39        12.0
40        12.0
41        12.0
42        12.0
43        12.0
44        12.0
45        12.0
46        12.0
47        12.0
48        12.0
49        12.0
50        12.0
51        12.0
52        12.0
53        12.0
54        12.0
55        12.0
56        12.0
57        12.0
58        12.0
59        12.0
60        12.0
61        12.0
62        12.0
63        12.0
64        12.0
65        12.0
66        

In [136]:
df['last_pymnt_d_year'] = df['last_pymnt_d'].dt.year

In [137]:
df['last_pymnt_d_year']

0         2018.0
1         2018.0
2         2018.0
3         2018.0
4         2018.0
5         2018.0
6         2018.0
7         2018.0
8         2018.0
9         2018.0
10        2018.0
11        2018.0
12        2018.0
13        2018.0
14        2018.0
15        2018.0
16        2018.0
17        2018.0
18        2018.0
19        2018.0
20        2018.0
21        2018.0
22        2018.0
23        2018.0
24        2018.0
25        2018.0
26        2018.0
27        2018.0
28        2018.0
29        2018.0
30        2018.0
31        2018.0
32        2018.0
33        2018.0
34        2018.0
35        2018.0
36        2018.0
37        2018.0
38        2018.0
39        2018.0
40        2018.0
41        2018.0
42        2018.0
43        2018.0
44        2018.0
45        2018.0
46        2018.0
47        2018.0
48        2018.0
49        2018.0
50        2018.0
51        2018.0
52        2018.0
53        2018.0
54        2018.0
55        2018.0
56        2018.0
57        2018.0
58        2018

## Load Instacart data

Let's return to the dataset of [3 Million Instacart Orders](https://tech.instacart.com/3-million-instacart-orders-open-sourced-d40d29ead6f2)

If necessary, uncomment and run the cells below to re-download and extract the data

In [0]:
# !wget https://s3.amazonaws.com/instacart-datasets/instacart_online_grocery_shopping_2017_05_01.tar.gz

In [0]:
# !tar --gunzip --extract --verbose --file=instacart_online_grocery_shopping_2017_05_01.tar.gz

Here's a list of the six CSV filenames

In [0]:
#%cd instacart_2017_05_01

In [0]:
#!ls -lh

#Load the CSV files with pandas

In [0]:
#

## Do feature engineering#

In [138]:
### stretch for the emp_title col
# There's one other column in the dataframe with percent signs. 
#Remove them and convert to floats. You'll need to handle missing values.
# Modify the emp_title column to replace titles with 'Other'
# if the title is not in the top 20.
# Take initiatve and work on your own ideas!
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,emp_title_manager,issue_year,issue_month,days_from_earliest_credit_to_issue,loan_status_is_great,last_pymnt_d_month,last_pymnt_d_year
0,,,20000,20000,20000,60.0,17.97,507.55,D,D1,Motor Vehicle Operator,7 years,RENT,68000.0,Source Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,254xx,WV,15.8,0,2009-02-01,1,30.0,118.0,11,1,13483,69.1%,26,w,19580.78,19580.78,975.17,975.17,419.22,555.95,0.0,0.0,0.0,2018-12-01,507.55,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,194908,0,2,0,2,17.0,45299,93.0,0,1,9815,86.0,19500,1,4,2,5,19491.0,2185.0,81.8,0,0,115.0,94,13,7,4,25.0,,2.0,30.0,0,1,3,2,4,11,8,10,3,11,0.0,0,0,1,92.3,50.0,1,0,205625,58782,12000,48733,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False,2018,9,3499,1,12.0,2018.0
1,,,25000,25000,25000,60.0,13.56,576.02,C,C1,Firefighter,7 years,MORTGAGE,61000.0,Verified,2018-09-01,Current,n,,,major_purchase,Major purchase,770xx,TX,16.74,0,2009-01-01,0,,,5,0,6299,48.5%,14,w,24109.45,24109.45,1567.98,1567.98,890.55,677.43,0.0,0.0,0.0,2018-12-01,576.02,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,204007,1,2,0,1,19.0,13862,37.0,0,0,0,40.0,13000,0,7,4,2,40801.0,1500.0,0.0,0,0,54.0,116,26,3,1,74.0,,4.0,,0,0,1,1,2,5,2,8,1,5,0.0,0,0,1,100.0,0.0,0,0,234573,20161,1500,37273,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,3530,1,12.0,2018.0
2,,,30000,30000,30000,36.0,18.94,1098.78,D,D2,Unknown,< 1 year,RENT,100000.0,Source Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,300xx,GA,16.07,0,2008-03-01,1,,114.0,6,1,14574,70.1%,9,w,28739.57,28739.57,2134.43,2134.43,1260.43,874.0,0.0,0.0,0.0,2018-12-01,1098.78,2019-01-01,2018-12-01,0,,1,Individual,,,,0,0,44048,1,2,1,1,1.0,29474,69.0,1,2,8521,69.0,20800,1,0,2,3,8810.0,5226.0,73.6,0,0,126.0,104,11,1,0,17.0,,1.0,,0,2,2,2,2,4,4,5,2,6,0.0,0,0,2,100.0,0.0,1,0,63636,44048,19800,42836,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,3836,1,12.0,2018.0
3,,,6000,6000,6000,36.0,7.84,187.58,A,A4,Unknown,,RENT,30000.0,Not Verified,2018-09-01,Current,n,,,debt_consolidation,Debt consolidation,923xx,CA,5.44,0,2000-04-01,0,,104.0,8,1,5936,34.5%,11,w,5702.27,5702.27,369.93,369.93,297.73,72.2,0.0,0.0,0.0,2018-12-01,187.58,2019-01-01,2018-12-01,0,,1,Individual,,,,0,350,5936,0,0,0,1,23.0,0,,1,4,2913,35.0,17200,2,0,0,5,848.0,7698.0,35.9,0,0,139.0,221,7,7,0,7.0,,18.0,,0,2,3,4,4,2,8,9,3,8,0.0,0,0,1,100.0,33.3,1,0,17200,5936,12000,0,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,,False,2018,9,6727,1,12.0,2018.0
4,,,10650,10650,10650,36.0,7.84,332.95,A,A4,Unknown,,RENT,28000.0,Verified,2018-09-01,Current,n,,,medical,Medical expenses,430xx,OH,16.89,0,2002-11-01,0,,,3,0,37,0.3%,3,w,10121.54,10121.54,656.62,656.62,528.46,128.16,0.0,0.0,0.0,2018-12-01,332.95,2019-01-01,2018-12-01,0,,1,Joint App,43000.0,14.01,Source Verified,0,0,18254,0,1,0,1,16.0,18217,81.0,0,0,0,54.0,11500,1,1,0,1,6085.0,,,0,0,16.0,190,113,16,0,,,16.0,,0,0,1,0,0,1,2,2,2,3,0.0,0,0,0,100.0,,0,0,33876,18254,0,22376,2024.0,Oct-1996,0.0,0.0,8.0,18.7,1.0,9.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,False,2018,9,5783,1,12.0,2018.0


In [139]:
df['revol_util'].isnull().sum()

129

In [142]:
df['revol_util'].value_counts().head(20)

0%       1060
48%       220
37%       219
32%       217
39%       217
38%       215
35.9%     214
46%       211
36.1%     211
33.5%     208
30%       207
35%       207
35.2%     206
40.9%     206
34.4%     205
43%       205
47%       205
52%       205
56%       204
34.3%     204
Name: revol_util, dtype: int64

In [153]:
# def clean_nan(x):
#     if isinstance(x, np.nan):

# df['revol_util'].fillna(df['revol_util'].mean())
#too soon

# x = '68%'

# def remove_percent(x):
#     return float(x.strip('%'))


In [163]:
df['revol_util'] = df['revol_util'].apply(remove_percent)

AttributeError: 'float' object has no attribute 'strip'