In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn import tree

## Preprocessing data

In [11]:
def drop_columns(df, empty_columns):
  for column in empty_columns:
    df=df.drop([column], axis=1)  
  return df


In [12]:
df =  pd.read_csv('https://raw.githubusercontent.com/cjflanagan/cs68/master/lending_club_training_data_subset.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,65230,92451797,,26450.0,26450.0,26450.0,60 months,18.99%,685.99,D,...,,,,N,,,,,,
1,36521,94187830,,12000.0,12000.0,12000.0,36 months,11.44%,395.37,B,...,,,,N,,,,,,
2,25712,93882482,,1000.0,1000.0,1000.0,36 months,15.99%,35.16,C,...,,,,N,,,,,,


In [13]:
df.columns.values

array(['Unnamed: 0', 'id', 'member_id', 'loan_amnt', 'funded_amnt',
       'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade',
       'sub_grade', 'emp_title', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code',
       'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'fico_range_low', 'fico_range_high', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
       'last_credit_pull_d', 'last_fico_range_high',
       'last_fico_range_low', 'collections_12_mths_ex_med',
       'mths_since_last_maj

In [14]:
df.loan_status = df.loan_status.replace({'Charged Off':1, 'Fully Paid':0})
y = df.pop('loan_status')

In [15]:
df.shape

(20000, 150)

In [16]:
from pandas.core.common import is_true_slices
# deleting columns with ~100% missing data
empty_columns=((df
 .isna()
 .mean()
 *100)
 .pipe(lambda ser:ser[ser>95])
).reset_index()
empty_columns=empty_columns['index'].values.tolist()
print('The following columns were > 95% empty and therefore eliminatd: ', empty_columns)
df=drop_columns(df, empty_columns)

The following columns were > 95% empty and therefore eliminatd:  ['member_id', 'desc', 'next_pymnt_d', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il', 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount', 'hardship_last_payment_amount']


In [17]:
#numerical columns
numerical_cols = df.columns[df.dtypes=='int64'].tolist()
numerical_cols

['Unnamed: 0', 'id']

In [18]:
#categorical column
categorical_cols = df.columns[df.dtypes=='object'].tolist()
categorical_cols

['term',
 'int_rate',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'verification_status',
 'issue_d',
 'pymnt_plan',
 'url',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'earliest_cr_line',
 'revol_util',
 'initial_list_status',
 'last_pymnt_d',
 'last_credit_pull_d',
 'application_type',
 'hardship_flag',
 'debt_settlement_flag',
 'debt_settlement_flag_date',
 'settlement_status',
 'settlement_date']

In [19]:
#multi_valued
multi_valued_colmmns = df[categorical_cols].nunique()[df[categorical_cols].nunique() > 2].index.tolist()
multi_valued_colmmns

['int_rate',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'verification_status',
 'issue_d',
 'url',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'earliest_cr_line',
 'revol_util',
 'last_pymnt_d',
 'last_credit_pull_d',
 'debt_settlement_flag_date',
 'settlement_status',
 'settlement_date']

In [20]:
#binary
binary_colmmns = df[categorical_cols].nunique()[df[categorical_cols].nunique() == 2].index.tolist()
binary_colmmns

['term',
 'initial_list_status',
 'application_type',
 'hardship_flag',
 'debt_settlement_flag']

In [21]:
#Single value
single_valued_colmmns = df[categorical_cols].nunique()[df[categorical_cols].nunique() == 1].index.tolist()
single_valued_colmmns

['pymnt_plan']

##Data Cleaning

In [22]:
#drop columns for model
columns_to_drop=['Unnamed: 0', 'id','pymnt_plan','last_pymnt_d' ]
if len(columns_to_drop)==0:
  pass
else: 
  for item in columns_to_drop:
    if item in multi_valued_colmmns: multi_valued_colmmns.remove(item)
    if item in categorical_cols: categorical_cols.remove(item)

  df=drop_columns(df,columns_to_drop)

In [24]:
# # Transform binary_columns
# for column in binary_colmmns :
#   df[column] = LabelEncoder().fit_transform(df[column])

In [25]:
# Transform multivalued columns
columns_to_skip=[]

for item in columns_to_skip:
  if item in multi_valued_colmmns: multi_valued_colmmns.remove(item)
  print(multi_valued_colmmns)

In [26]:
#transforming columns before changing data types
df.revol_util=(df.revol_util
 .str.slice(0,-1)
)

In [27]:
dic_values={'NaN':-1, '< 1 year': 0.1,'1 year': 1,'2 years': 2, '3 years':3, '4 years':4, '5 years':5, '6 years':6, '7 years':7, '8 years':8, '9 years':9, '10+ years':10}
df.emp_length.replace(dic_values).value_counts()


10.0    6633
2.0     1910
3.0     1623
0.1     1449
1.0     1331
5.0     1241
4.0     1199
6.0      933
8.0      761
9.0      743
7.0      641
Name: emp_length, dtype: int64

In [28]:
#change types 

#to float
dic={'dti':float, 'all_util': float}
for key,value in dic.items():
  df[key].astype(value)


#to date
columns_to_date=['settlement_date', 'settlement_date'] 
#, 'debt_settlement_flag_date','last_pymnt_d','debt_settlement_flag_date'
for column in columns_to_date:
  df[column] = pd.to_datetime(df[column], errors='coerce')

with pd.option_context('display.max_columns', 2, 'display.min_rows',None): display(df.dtypes)


loan_amnt                            float64
funded_amnt                          float64
funded_amnt_inv                      float64
term                                   int64
int_rate                              object
installment                          float64
grade                                 object
sub_grade                             object
emp_title                             object
emp_length                            object
home_ownership                        object
annual_inc                           float64
verification_status                   object
issue_d                               object
url                                   object
purpose                               object
title                                 object
zip_code                              object
addr_state                            object
dti                                  float64
delinq_2yrs                          float64
earliest_cr_line                      object
fico_range

In [29]:
def fill_na_columns(df, column_name, new_na_value):
  df[column_name]=(df[column_name]
                  .fillna(new_na_value)
                  )
def drop_na_columns(df, column_name):
  df[column_name]=(df[column_name]
                .dropna
                )

#replace values
fill_na_columns(df,'il_util',0)

fill_na_columns(df,'mths_since_last_delinq',-1)
fill_na_columns(df,'mths_since_last_record',-1)

fill_na_columns(df,'settlement_amount',-1)
fill_na_columns(df,'num_tl_120dpd_2m',-1)

fill_na_columns(df,'emp_title', 'Unknown')
fill_na_columns(df,'title', 'Unknown')
fill_na_columns(df,'settlement_status', 'Unknown')
fill_na_columns(df,'hardship_flag', 'Unknown')

#fill with mean
columns=['dti', 'mths_since_last_major_derog', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct','mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'settlement_amount', 'settlement_term', 'settlement_percentage','bc_open_to_buy'] 
for column in columns:
  df[column].fillna(df[column].mean())

#drop na
drop_na_columns(df,'emp_length')

#drop column
columns_to_drop=['revol_util']
df=drop_columns(df, columns_to_drop)


In [30]:
#Identifying problematic columns
columns_to_study=(
    (df
    .isna()
    .mean()
    *100
    )
    .pipe(lambda ser:ser[ser>0]).reset_index()
  ) .iloc[:,0]
columns_to_study

0                                dti
1        mths_since_last_major_derog
2                 mths_since_rcnt_il
3                           all_util
4                     bc_open_to_buy
5                            bc_util
6                 mo_sin_old_il_acct
7               mths_since_recent_bc
8           mths_since_recent_bc_dlq
9              mths_since_recent_inq
10    mths_since_recent_revol_delinq
11                  percent_bc_gt_75
12         debt_settlement_flag_date
13                   settlement_date
14             settlement_percentage
15                   settlement_term
Name: index, dtype: object

In [31]:
#categorical column
categorical_cols = df.columns[df.dtypes=='object'].tolist()
categorical_cols

['int_rate',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'verification_status',
 'issue_d',
 'url',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'earliest_cr_line',
 'last_credit_pull_d',
 'hardship_flag',
 'debt_settlement_flag',
 'debt_settlement_flag_date',
 'settlement_status']

In [32]:
def limit_values(val, list_val):
    if val in list_val:
        return val
    return 'Another'

## Decising on the Data

This data has so many columns, variables and it is SO dirty, that I am decising following Charlie's model and making an uneducated guess on the variables that will go into the model.  

In [33]:
#curated categorical columns
categorical_cols = [

#from teacher's code to debug                    
#  'term',
#  'grade',
#  'sub_grade',
#  'home_ownership',
#  'verification_status',
#  'issue_d'

 'grade',
 'sub_grade',
 'emp_title',
 'home_ownership',
 'verification_status',
 'issue_d',
 'purpose',
 'title',
 'settlement_status', 
 'term'
 ]
categorical_cols

['grade',
 'sub_grade',
 'emp_title',
 'home_ownership',
 'verification_status',
 'issue_d',
 'purpose',
 'title',
 'settlement_status',
 'term']

In [34]:
df2=df[categorical_cols]
df2.head(3)
df_dummies = pd.get_dummies(data = df2, drop_first=False)
df_dummies

Unnamed: 0,term,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,sub_grade_A1,sub_grade_A2,...,title_Major purchase,title_Medical expenses,title_Moving and relocation,title_Other,title_Unknown,title_Vacation,settlement_status_ACTIVE,settlement_status_BROKEN,settlement_status_COMPLETE,settlement_status_Unknown
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
19996,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
19997,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
19998,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [35]:
numerical_columns = [
                     
#from teacher's code to debug
  # 'loan_amnt',
  # 'installment',
  # 'delinq_2yrs',
  # 'fico_range_low',
  # 'fico_range_high',
  # 'total_acc'

  'acc_open_past_24mths',
  'annual_inc', 
  'avg_cur_bal',
  'delinq_2yrs',
  'fico_range_low',
  'fico_range_high',
  'funded_amnt_inv',
  'funded_amnt',
  'funded_amnt',
  'installment', 
  'inq_last_6mths',
  'loan_amnt',
  'num_accts_ever_120_pd',
  'num_actv_bc_tl',
  'num_actv_rev_tl',
  'num_bc_sats',
  'num_bc_tl',
  'num_il_tl',
  'num_op_rev_tl',
  'num_rev_accts',
  'num_rev_tl_bal_gt_0',
  'num_sats',
  'num_tl_op_past_12m',
  'open_acc_6m',
  'open_act_il',
  'open_acc',
  'total_pymnt',
  'total_pymnt_inv',
  'total_rec_prncp',
  'total_rec_int',
  'tot_cur_bal',
  'total_acc',
  'tot_hi_cred_lim',
  'total_bal_ex_mort',
  'total_bc_limit',
  'total_il_high_credit_limit',
  'revol_bal',
  #these gave me errors
  # 'dti'
  # 'pct_tl_nvr_dlq',
  # 'percent_bc_gt_75',
  # 'pub_rec_bankruptcies',
]
numerical_columns

['acc_open_past_24mths',
 'annual_inc',
 'avg_cur_bal',
 'delinq_2yrs',
 'fico_range_low',
 'fico_range_high',
 'funded_amnt_inv',
 'funded_amnt',
 'funded_amnt',
 'installment',
 'inq_last_6mths',
 'loan_amnt',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_rev_tl_bal_gt_0',
 'num_sats',
 'num_tl_op_past_12m',
 'open_acc_6m',
 'open_act_il',
 'open_acc',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'tot_cur_bal',
 'total_acc',
 'tot_hi_cred_lim',
 'total_bal_ex_mort',
 'total_bc_limit',
 'total_il_high_credit_limit',
 'revol_bal']

In [36]:
df_model = pd.concat([df[numerical_columns], df_dummies], axis=1)
df_model.head()

Unnamed: 0,acc_open_past_24mths,annual_inc,avg_cur_bal,delinq_2yrs,fico_range_low,fico_range_high,funded_amnt_inv,funded_amnt,funded_amnt.1,installment,...,title_Major purchase,title_Medical expenses,title_Moving and relocation,title_Other,title_Unknown,title_Vacation,settlement_status_ACTIVE,settlement_status_BROKEN,settlement_status_COMPLETE,settlement_status_Unknown
0,3.0,63000.0,1026.0,2.0,675.0,679.0,26450.0,26450.0,26450.0,685.99,...,0,0,0,0,0,0,0,0,0,1
1,5.0,85000.0,3869.0,0.0,680.0,684.0,12000.0,12000.0,12000.0,395.37,...,0,0,0,0,0,0,0,0,0,1
2,2.0,90000.0,4836.0,0.0,695.0,699.0,1000.0,1000.0,1000.0,35.16,...,0,0,0,1,0,0,0,0,0,1
3,5.0,40000.0,2895.0,0.0,680.0,684.0,10000.0,10000.0,10000.0,327.34,...,0,0,0,0,0,0,0,0,0,1
4,4.0,30000.0,7191.0,0.0,660.0,664.0,4500.0,4500.0,4500.0,181.89,...,0,0,0,1,0,0,0,0,0,1


In [37]:
#piece of code to review columns and help decide if we should add them to the model

# columns_to_skip=['emp_length']
# columns_to_study=df.columns.values
# for column in columns_to_study:
#   if column not in columns_to_skip and column not in df_model.columns.values:
#     print(column, type(column))
#     print('Column "', column, '" has ', df[column].isnull().sum(), ' nan values')
#     print(df[column].value_counts(dropna=False))
#     print(' ')
#     print('------------------------------------------- ')

In [38]:
X = df_model

## Splitting data

In [39]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size = 0.33)

In [40]:
print('Train fraction: {}'.format((X_train.shape[0]) / X.shape[0]))
print('Validation fraction: {}'.format((X_valid.shape[0]  / X.shape[0])))
print('Test fraction: {}'.format((X_test.shape[0]  / X.shape[0])))

Train fraction: 0.7
Validation fraction: 0.201
Test fraction: 0.099


In [41]:
X_train.head()

Unnamed: 0,acc_open_past_24mths,annual_inc,avg_cur_bal,delinq_2yrs,fico_range_low,fico_range_high,funded_amnt_inv,funded_amnt,funded_amnt.1,installment,...,title_Major purchase,title_Medical expenses,title_Moving and relocation,title_Other,title_Unknown,title_Vacation,settlement_status_ACTIVE,settlement_status_BROKEN,settlement_status_COMPLETE,settlement_status_Unknown
3615,2.0,30780.0,1390.0,0.0,690.0,694.0,7000.0,7000.0,7000.0,217.72,...,0,0,0,0,0,0,0,0,1,0
1273,13.0,90000.0,12080.0,0.0,670.0,674.0,2000.0,2000.0,2000.0,67.14,...,1,0,0,0,0,0,0,0,0,1
2414,4.0,110000.0,4377.0,0.0,675.0,679.0,24000.0,24000.0,24000.0,867.54,...,0,0,0,0,0,0,0,0,0,1
683,5.0,108000.0,3398.0,1.0,665.0,669.0,2800.0,2800.0,2800.0,99.82,...,0,0,0,1,0,0,0,0,0,1
12695,3.0,80000.0,8604.0,2.0,665.0,669.0,35000.0,35000.0,35000.0,1037.38,...,0,0,0,0,0,0,0,0,1,0


## Decision Tree

In [42]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(max_depth=3)

print(dt_model)

dt_model = dt_model.fit(X_train,y_train)
pred_dt = dt_model.predict_proba(X_valid)[:, 1]

DecisionTreeClassifier(max_depth=3)


In [43]:
from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(dt_model, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = X_train.columns.values.tolist(), 
               class_names=['Paid', 'Charged Off'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

ModuleNotFoundError: No module named 'pydotplus'

In [None]:
from sklearn.metrics import classification_report
pred_dt_binary = dt_model.predict(X_valid)
print(classification_report(y_valid, pred_dt_binary))

In [None]:
# calculate scores
auc_dt = roc_auc_score(y_valid, pred_dt)

# calculate roc curves
fpr_dt, tpr_dt, _ = roc_curve(y_valid, pred_dt)

plt.figure(figsize=(15, 10))
# plot horizontal line 
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr_dt, tpr_dt, label='DT (AUC = %0.2f)' % auc_dt)

# axis labels
plt.xlabel('FPR')
plt.ylabel('TPR')
# show the legend
plt.legend(loc='lower right')
# show the plot
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
print(rf_model)

rf_model = rf_model.fit(X_train, y_train)
pred_rf = rf_model.predict_proba(X_valid)[:, 1]

In [None]:
print(classification_report(y_valid, pred_rf.round(0)))

## XGBoost

In [None]:
#code to fix error taken from: https://stackoverflow.com/questions/43579180/feature-names-must-be-unique-xgboost
X_train = X_train.loc[:,~X_train.columns.duplicated()]
X_valid = X_valid.loc[:,~X_valid.columns.duplicated()]

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier()

xgb_model = xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict_proba(X_valid)[:, 1]

## Evaluating Model

In [None]:
def create_roc_plot(name, predictions):
  auc = roc_auc_score(y_valid, predictions).round(2)
  fpr, tpr, _ = roc_curve(y_valid, predictions)

  plt.figure(figsize=(5, 3))
  plt.plot([0, 1], [0, 1], linestyle='--')  # plot horizontal line 
  plt.plot(fpr, tpr, label='{} AUC = {}'.format(name, auc)) # plot the roc curve for the model
  plt.xlabel('FPR')
  plt.ylabel('TPR')
  plt.legend(loc='lower right')  # show the legend
  plt.show() # show the plot
  return None

In [None]:
create_roc_plot('Decision Tree', pred_dt)
create_roc_plot('Random Forest', pred_rf)
create_roc_plot('XGBoost', pred_xgb)

In [None]:
# calculate scores
auc_dt = roc_auc_score(y_valid, pred_dt)
auc_rf = roc_auc_score(y_valid, pred_rf)
auc_xgb = roc_auc_score(y_valid, pred_xgb)

# calculate roc curves
fpr_dt, tpr_dt, _ = roc_curve(y_valid, pred_dt)
fpr_rf, tpr_rf, _ = roc_curve(y_valid, pred_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_valid, pred_xgb)

plt.figure(figsize=(10, 8))
# plot horizontal line 
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr_dt, tpr_dt, label='DT (AUC = %0.2f)' % auc_dt)
plt.plot(fpr_rf, tpr_rf, label='RF (AUC = %0.2f)' % auc_rf)
plt.plot(fpr_xgb, tpr_xgb, label='XGB (AUC = %0.2f)' % auc_xgb)

# axis labels
plt.xlabel('FPR')
plt.ylabel('TPR')
# show the legend
plt.legend(loc='lower right')
# show the plot
plt.show()

## Drafts code on cleaning (skip)

In [None]:
column='bc_open_to_buy'
df[column].fillna(df[column].mean())

In [None]:
columns_to_skip=['dti', 'all_util']
columns_date=['mths_since_last_major_derog', 'mths_since']

columns_witn_na=[]
for column in columns_to_study:
  if column in columns_to_skip or column in columns_date:
    pass
  elif (df[column].isnull().sum()*-1+df.shape[0])!=0:
      print(column, type(column))
      print(df[column].value_counts(dropna=False))
      print(' ')
      print('------------------------------------------- ')

In [None]:

def nan_values_in_column(df,column):
  empty=df[column].isnull().sum()*-1+df.shape[0]
  print('Column "', column, '" has ', empty, ' nan values')

In [None]:
#Code to zoom in on a particular column
column='dti'

nan_values_in_column(df, column)
df[column].describe()
df[column].value_counts(dropna=False)

In [None]:
def limit_values(val, list_val):
    if val in list_val:
        return val
    return 'Another'

In [None]:
#changing type date

# pd.to_datetime(df['settlement_date'],format='%b-%Y')

# print(type(df['settlement_date'][1]))

# df2=(
#     df['settlement_date'].value_counts(dropna=True)
#     ).reset_index()
    
# df2=df2.iloc[:,0]
# pd.to_datetime(df2, format='%b-%Y')
# df2.value_counts()