In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.simplefilter('ignore')

- Submission_1 : Single XGB with random state 0 with SMOTE
- Submission_2 : Combination of 5 XGB and 5 Random Forest with Random state 0 with SMOTE
- Submission_3 : Combination of 5 Random Forest with Random state 0 with SMOTE
- Submission_4 : Submission 3 with Annual Income Dropped
- Submission_5 : Submission 2 with Annual Income Dropped
- Submission_6 : Submission 3 with Annual Income, Stay Duration of Current Residence and Tenure of Oldest Credit Line Dropped and random states 10 everywhere

In [2]:
# Load the training file and the test file from the working directory.
train = pd.read_csv('TrainingData.csv')
test = pd.read_csv('testX.csv')

# Stack train on top of test so the cleaning steps that follow are applied to both at once.
# Each file keeps its own 0-based row labels here, so index labels repeat until the index is reset.
df = pd.concat([train, test], axis = 0)

df.shape

(130000, 53)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130000 entries, 0 to 46999
Data columns (total 53 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   application_key  130000 non-null  int64  
 1   mvar1            130000 non-null  object 
 2   mvar2            120787 non-null  float64
 3   mvar3            129171 non-null  float64
 4   mvar4            129171 non-null  float64
 5   mvar5            129171 non-null  float64
 6   mvar6            130000 non-null  object 
 7   mvar7            130000 non-null  object 
 8   mvar8            130000 non-null  object 
 9   mvar9            130000 non-null  object 
 10  mvar10           130000 non-null  object 
 11  mvar11           130000 non-null  object 
 12  mvar12           130000 non-null  object 
 13  mvar13           130000 non-null  object 
 14  mvar14           130000 non-null  int64  
 15  mvar15           130000 non-null  object 
 16  mvar16           130000 non-null  objec

In [4]:
# Descriptive replacements for the 53 raw column names, listed in file order.
# The list is assigned positionally to df.columns, so the order and the length must match the frame.
# 'mvar51' and 'default_ind' keep their raw names.
# NOTE: 'Payements' below is a typo; it is left as-is because changing it changes the column name.
columns = ['ID',
           'Credit Worthiness Score',
           'Number and Riskiness of Credit Enquiry',
           'Default Severity (Any Loan)',
           'Default Severity (Auto Loan)',
           'Default Severity (Education Loan)',
           'Min Cred Available (Revolving Credit Cards)',
           'Max Cred Available (Active Credit Lines)',
           'Max Cred Available (Active Revolving Credit Cards)',
           'Total Available Credit (Cards with 1 missed payment)',
           'Total Credit Available',
           'Dues Collected Post Default',
           'Total Amount Due (Active Cards)',
           'Credit Amount Paid (Prev Year)',
           'Annual Income',
           'Property Value',
           'Full Credit Utilisation (Revolving Credit Cards)',
           'Full Credit Utilisation (Credit Cards)',
           'Full Credit Utilisation (Credit Lines)',
           '>75 percent Credit Utilisation (Credit Cards)',
           '>75 percent Credit Utilisation (Credit Lines)',
           'Average Utilisation (Revolving Credit Cards)',
           'Average Utilisation Last 2 Years (All Credit Lines)',
           'Average Utilisation Last Year (All Credit Cards)',
           'Average Utilisation Last 6 Months with 1 Missed Payment (Credit Cards)',
           'Average Tenure (Active Revolving Credit Cards)',
           'Tenure of Oldest Active Credit Card',
           'Tenure of Oldest Active Revolving Credit Card',
           'Days Since Last Missed Payment',
           'Tenure of Oldest Credit Line',
           'Max Tenure (Auto Loans)',
           'Max Tenure (Education Loans)',
           'Sum of Tenures (Active Credit Cards)',
           'Stay Duration of Current Residence',
           'Credit Lines with 1 Missed Payment (Last 6 Months)',
           'Revolving Credit Cards with 1 Missed Payment (Last 2 Years)',
           'Active Credit Lines',
           'Credit Cards with >2Y Tenure',
           'Credit Lines activated Prev 2 Years',
           'Credit Lines with Current Delinquency',
           'Utilization on Active Education Loans',
           'Utilization on Active Auto Loans',
           'Financial Stress Index',
           'Credit Lines with No Missed Payments (Prev 2 Years)',
           'Ratio(Max Amount Due : Sum of Amounts Due)',
           'Mortgage Loans (With 2 Missed Payements)',
           'Auto Loans (With 2 Missed Payments)',
           'Type of Product',
           'Int Value for an Application',
           'Bucketized Credit Worthiness Score',
           'Compound of mvar49 and mvar48',
           'mvar51',
           'default_ind']

In [5]:
# Positional rename: the i-th name in `columns` replaces the i-th column label.
# pandas raises ValueError if the list length differs from the number of columns.
df.columns = columns

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130000 entries, 0 to 46999
Data columns (total 53 columns):
 #   Column                                                                  Non-Null Count   Dtype  
---  ------                                                                  --------------   -----  
 0   ID                                                                      130000 non-null  int64  
 1   Credit Worthiness Score                                                 130000 non-null  object 
 2   Number and Riskiness of Credit Enquiry                                  120787 non-null  float64
 3   Default Severity (Any Loan)                                             129171 non-null  float64
 4   Default Severity (Auto Loan)                                            129171 non-null  float64
 5   Default Severity (Education Loan)                                       129171 non-null  float64
 6   Min Cred Available (Revolving Credit Cards)                          

### Feature Understanding (Explanations and Stuff to Look into)

- Auto Loans: Loans where Vehicles are kept as collateral
- Revolving Credit: You can borrow up to a fixed amount within a fixed time frame, after which the amount you can borrow replenishes in full, subject to an overall maximum limit. For example, suppose you can borrow at most 100 rupees a month and the overall limit on that credit line is 500 rupees. If your total borrowed amount is 460 rupees and you haven't borrowed anything so far this month, you will be able to borrow only 40 rupees, even though the monthly limit is 100.
- Difference between Credit Worthiness Score and its bucketized version?
- Difference in statistics between train and test (Need for Adversarial Validation?)

### Feature Ideas and Experiments
- Binary Variable for checking if the person has moved into current residence after issuing any credit lines
- Credit Amount Paid / Number of Credit Lines with No Missed Payments
- Max Cred Available vs Default Severity
- Separate modeling for 0 Default Severity vs Non-Zero Default Severity
- KNN Missing Values Fill In
- Property Value vs Annual Income (Some Indicator of whether or not a person spends too much for their Income)
- 75% utilisation / Full Utilisation

In [7]:
df.head(10)

Unnamed: 0,ID,Credit Worthiness Score,Number and Riskiness of Credit Enquiry,Default Severity (Any Loan),Default Severity (Auto Loan),Default Severity (Education Loan),Min Cred Available (Revolving Credit Cards),Max Cred Available (Active Credit Lines),Max Cred Available (Active Revolving Credit Cards),Total Available Credit (Cards with 1 missed payment),Total Credit Available,Dues Collected Post Default,Total Amount Due (Active Cards),Credit Amount Paid (Prev Year),Annual Income,Property Value,Full Credit Utilisation (Revolving Credit Cards),Full Credit Utilisation (Credit Cards),Full Credit Utilisation (Credit Lines),>75 percent Credit Utilisation (Credit Cards),>75 percent Credit Utilisation (Credit Lines),Average Utilisation (Revolving Credit Cards),Average Utilisation Last 2 Years (All Credit Lines),Average Utilisation Last Year (All Credit Cards),Average Utilisation Last 6 Months with 1 Missed Payment (Credit Cards),Average Tenure (Active Revolving Credit Cards),Tenure of Oldest Active Credit Card,Tenure of Oldest Active Revolving Credit Card,Days Since Last Missed Payment,Tenure of Oldest Credit Line,Max Tenure (Auto Loans),Max Tenure (Education Loans),Sum of Tenures (Active Credit Cards),Stay Duration of Current Residence,Credit Lines with 1 Missed Payment (Last 6 Months),Revolving Credit Cards with 1 Missed Payment (Last 2 Years),Active Credit Lines,Credit Cards with >2Y Tenure,Credit Lines activated Prev 2 Years,Credit Lines with Current Delinquency,Utilization on Active Education Loans,Utilization on Active Auto Loans,Financial Stress Index,Credit Lines with No Missed Payments (Prev 2 Years),Ratio(Max Amount Due : Sum of Amounts Due),Mortgage Loans (With 2 Missed Payements),Auto Loans (With 2 Missed Payments),Type of Product,Int Value for an Application,Bucketized Credit Worthiness Score,Compound of mvar49 and mvar48,mvar51,default_ind
0,230032,1696,1.6541,0.0,0.0,0.0,0,6015,322,40369,18414,missing,6423,3067,123875,missing,1,1,1,3,3,94.78,8987.18,,72.25,1462,4532,2890,61,4532,1095,3376,625,1.1667,1,0,8,10,4,1,73.78,82.547,0.08696,10,0.63899,na,0,C,10,770,4,3080,0.0
1,230033,1846,0.8095,0.0,0.0,0.0,102,7532,3171,18234,13664,missing,765,1931,42613,missing,0,0,0,0,0,74.25,953.06,953.06,4.8,1028,2099,2099,30386,2281,missing,2251,169,0.4167,0,0,8,0,2,0,99.129,missing,0,13,0.63836,na,na,L,732,437,5,2185,1.0
2,230034,1745,0.4001,0.0,0.0,0.0,missing,2536,missing,missing,2536,missing,missing,missing,76109,missing,na,na,na,0,0,,,,,missing,missing,missing,669,4623,3772,missing,missing,25.0833,0,na,1,na,1,0,missing,29.29,0,1,1.0,na,0,C,89,795,4,3180,1.0
3,230035,1739,0.2193,0.0,0.0,0.0,1982,26440,4955,20316,37013,missing,0,0,84235,missing,0,0,0,0,0,,0.0,,0.0,1308,2525,791,91,5992,missing,3741,215,10.3333,0,0,3,3,2,0,96.272,missing,0.15385,3,0.53241,0,0,L,3,755,4,3020,0.0
4,230036,1787,0.0118,0.225,0.0,0.0,5451,5494,5494,7987,4696,missing,2257,27815,123875,524848,0,0,0,0,0,20.51,796.67,,28.72,801,2281,2281,487,2707,missing,1947,158,0.9167,0,0,2,3,2,0,115.019,missing,0,1,0.92665,na,na,L,5,425,4,1700,0.0
5,230037,1579,,3.502,0.0,0.0,missing,missing,missing,missing,0,199,missing,missing,77298,missing,na,na,na,0,0,,,,,missing,missing,missing,183,973,missing,missing,missing,1.3333,0,na,2,na,2,na,missing,missing,1.5,0,,na,na,C,35,455,4,1820,1.0
6,230038,1818,0.4001,0.0,0.0,0.0,missing,1088,missing,1536,1498,missing,448,2331,123875,missing,na,0,0,0,0,,,,29.16,1034,760,missing,30386,3437,missing,791,34,9.3333,0,na,2,0,0,0,88.171,missing,0,2,0.87224,na,0,C,2,392,5,1960,1.0
7,230039,na,,,,,missing,missing,missing,missing,missing,missing,missing,missing,198200,1263525,na,na,na,0,na,,,,,missing,missing,missing,missing,missing,missing,missing,missing,0.0833,na,na,na,na,na,na,missing,missing,missing,na,,na,na,C,2,615,#VALUE!,#VALUE!,0.0
8,230040,1836,0.1358,0.0,0.0,0.0,347,38964,17828,70729,65843,missing,16298,18877,109010,missing,0,0,0,1,1,32.02,0.0,0.0,32.92,3180,8943,8943,1490,12075,missing,missing,1673,5.9167,0,na,6,12,2,0,missing,missing,0,10,0.89868,0,0,L,5,821,5,4105,1.0
9,230041,1839,0.1981,0.0,0.0,0.0,793,6131,6045,48959,31640,missing,10565,14282,99100,146668,0,0,0,1,1,46.17,,,49.07,1701,8912,2707,61,11193,4593,missing,615,1.75,0,na,10,7,0,0,missing,45.59,0.08824,14,0.33834,na,0,L,3247,408,5,2040,0.0


In [8]:
# The raw files mark missing values with these three strings; turn all of them into NaN in one pass.
placeholder_tokens = ['missing', 'na', '#VALUE!']
df.replace(placeholder_tokens, np.nan, inplace = True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130000 entries, 0 to 46999
Data columns (total 53 columns):
 #   Column                                                                  Non-Null Count   Dtype  
---  ------                                                                  --------------   -----  
 0   ID                                                                      130000 non-null  int64  
 1   Credit Worthiness Score                                                 124039 non-null  object 
 2   Number and Riskiness of Credit Enquiry                                  120787 non-null  float64
 3   Default Severity (Any Loan)                                             129171 non-null  float64
 4   Default Severity (Auto Loan)                                            129171 non-null  float64
 5   Default Severity (Education Loan)                                       129171 non-null  float64
 6   Min Cred Available (Revolving Credit Cards)                          

In [10]:
df.isnull().sum() * 100 / len(df)

ID                                                                         0.000000
Credit Worthiness Score                                                    4.585385
Number and Riskiness of Credit Enquiry                                     7.086923
Default Severity (Any Loan)                                                0.637692
Default Severity (Auto Loan)                                               0.637692
Default Severity (Education Loan)                                          0.637692
Min Cred Available (Revolving Credit Cards)                               23.844615
Max Cred Available (Active Credit Lines)                                   9.348462
Max Cred Available (Active Revolving Credit Cards)                        23.854615
Total Available Credit (Cards with 1 missed payment)                      14.226154
Total Credit Available                                                     0.638462
Dues Collected Post Default                                               56

In [11]:
df['Type of Product'].value_counts()

C    82535
L    47465
Name: Type of Product, dtype: int64

In [12]:
df.reset_index(drop = True, inplace = True)

In [13]:
def convert_to_int(entry) -> float:
    """Convert a single cell value to ``float``.

    Despite the name, the result is a float, not an int. NaN inputs stay NaN,
    and numeric strings such as ``'1696'`` become ``1696.0``.

    Raises:
        ValueError: if ``entry`` is a string that does not parse as a number.
    """
    return float(entry)

# Cast every column except the categorical product type and the target to float64.
# pd.to_numeric converts a whole column at once instead of calling float() on each cell,
# and it still raises ValueError if a column contains a value that is not numeric.
# The explicit astype keeps integer columns (e.g. ID) as float64, exactly as before.
for col in df.columns:
    if col not in ['Type of Product', 'default_ind']:
        df[col] = pd.to_numeric(df[col]).astype('float64')

# Sanity check of one converted value; .loc avoids chained indexing (df[col][0]).
df.loc[0, 'Credit Worthiness Score']

1696.0

In [14]:
# Use -1 as the missing-value sentinel in every column. This also fills default_ind for the
# rows that have no label, which is why -1.0 appears as a third value in its value counts.
df.fillna(-1, inplace = True)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130000 entries, 0 to 129999
Data columns (total 53 columns):
 #   Column                                                                  Non-Null Count   Dtype  
---  ------                                                                  --------------   -----  
 0   ID                                                                      130000 non-null  float64
 1   Credit Worthiness Score                                                 130000 non-null  float64
 2   Number and Riskiness of Credit Enquiry                                  130000 non-null  float64
 3   Default Severity (Any Loan)                                             130000 non-null  float64
 4   Default Severity (Auto Loan)                                            130000 non-null  float64
 5   Default Severity (Education Loan)                                       130000 non-null  float64
 6   Min Cred Available (Revolving Credit Cards)                         

In [16]:
df['default_ind'].value_counts()

 0.0    59145
-1.0    47000
 1.0    23855
Name: default_ind, dtype: int64

In [17]:
# Encode the product type as integers. LabelEncoder orders its classes by sorting,
# so 'C' becomes 0 and 'L' becomes 1.
le = LabelEncoder()
le.fit(df['Type of Product'])
df['Type of Product'] = le.transform(df['Type of Product'])

In [18]:
import re

# Remove every character that is not a letter, digit or underscore from the column names,
# e.g. 'Type of Product' becomes 'TypeofProduct'.
non_identifier_chars = re.compile('[^A-Za-z0-9_]+')
df = df.rename(columns = lambda name: non_identifier_chars.sub('', name))

Leftover column list (apparently candidate features to drop): 'AnnualIncome', 'StayDurationofCurrentResidence', 'TenureofOldestCreditLine',
             'NumberandRiskinessofCreditEnquiry', 'CreditAmountPaidPrevYear', 'BucketizedCreditWorthinessScore',
             'mvar51', 'MaxTenureEducationLoans', 'PropertyValue'

In [19]:
# The first 83000 rows of the combined frame are the rows that came from the training file.
train = df.iloc[:83000, :]

# Columns that must not be used as model features.
drop_cols = ['ID', 'default_ind', 'TypeofProduct']

# Build one feature matrix / target pair per encoded product type.
is_product0 = train['TypeofProduct'] == 0
is_product1 = train['TypeofProduct'] == 1

X_data1 = train.loc[is_product0].drop(columns = drop_cols)
y_data1 = train.loc[is_product0, 'default_ind']

X_data2 = train.loc[is_product1].drop(columns = drop_cols)
y_data2 = train.loc[is_product1, 'default_ind']

In [20]:
train['TypeofProduct'].value_counts()

0    52043
1    30957
Name: TypeofProduct, dtype: int64

In [21]:
train[train['TypeofProduct'] == 0]['default_ind'].value_counts()

0.0    33217
1.0    18826
Name: default_ind, dtype: int64

In [22]:
train[train['TypeofProduct'] == 1]['default_ind'].value_counts()

0.0    25928
1.0     5029
Name: default_ind, dtype: int64

Best Parameters: {'n_estimators': 180, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 32, 'criterion': 'gini'}

In [23]:
# 5-fold stratified cross-validation for the product-type-0 model.
# One CatBoost model is trained per fold and kept in cat1_models for the test-time ensemble.
kfold = StratifiedKFold(random_state = 10, n_splits = 5, shuffle = True)
splits1 = kfold.split(X_data1, y_data1)

# Positive-class weight = negatives / positives. It is computed from the labels rather than
# hard-coded (it equals 33217/18826 on the current data), so it cannot go stale if the data change.
pos_weight1 = float((y_data1 == 0).sum() / (y_data1 == 1).sum())

cat1_acc_scores = []
cat1_f1_scores = []
cat1_models = []

print("======== Training Model for Split 1 ========")

# Class imbalance is handled through scale_pos_weight; SMOTE resampling is not applied here.
for i, (train_idx, val_idx) in enumerate(splits1, start = 1):
    X_Train, X_Test = X_data1.iloc[train_idx], X_data1.iloc[val_idx]
    Y_Train, Y_Test = y_data1.iloc[train_idx], y_data1.iloc[val_idx]

    cat1 = CatBoostClassifier(n_estimators = 10000, max_depth = 8, eval_metric = 'CrossEntropy',
                              bootstrap_type = 'Bernoulli',
                              learning_rate = 0.01, random_state = 0, early_stopping_rounds = 1000,
                              scale_pos_weight = pos_weight1)

    # NOTE(review): the validation fold is also passed in eval_set, so early stopping may be
    # tuned on the same rows that are scored below; the CV scores are then likely optimistic.
    cat1.fit(X_Train, Y_Train, eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)], verbose = False)

    pred = cat1.predict(X_Test)

    cat1_f1_scores.append(f1_score(Y_Test, pred))
    cat1_acc_scores.append(accuracy_score(Y_Test, pred)*100)
    cat1_models.append(cat1)

    print("Iteration {} Completed".format(i))

Iteration 1 Completed
Iteration 2 Completed
Iteration 3 Completed
Iteration 4 Completed
Iteration 5 Completed


In [30]:
# Stratified cross-validation for the product-type-1 model, reusing the `kfold` splitter
# defined in the previous training cell. One CatBoost model per fold is kept in cat2_models.
splits2 = kfold.split(X_data2, y_data2)

# Positive-class weight = negatives / positives. It is computed from the labels rather than
# hard-coded (it equals 25928/5029 on the current data), so it cannot go stale if the data change.
pos_weight2 = float((y_data2 == 0).sum() / (y_data2 == 1).sum())

cat2_acc_scores = []
cat2_f1_scores = []
cat2_models = []

print("======== Training Model for Split 2 ========")

# Class imbalance is handled through scale_pos_weight; SMOTE resampling is not applied here.
for i, (train_idx, val_idx) in enumerate(splits2, start = 1):
    X_Train, X_Test = X_data2.iloc[train_idx], X_data2.iloc[val_idx]
    Y_Train, Y_Test = y_data2.iloc[train_idx], y_data2.iloc[val_idx]

    cat2 = CatBoostClassifier(n_estimators = 10000, max_depth = 8, eval_metric = 'CrossEntropy',
                              bootstrap_type = 'Bernoulli',
                              learning_rate = 0.01, random_state = 0, early_stopping_rounds = 1000,
                              scale_pos_weight = pos_weight2)
    #cat2 = LGBMClassifier(n_estimators = 10000, max_depth = 8, n_jobs = -1,
    #                    learning_rate = 0.01, random_state = 0, early_stopping_round = 1000,
    #                    scale_pos_weight = 59145/23855)

    # NOTE(review): the validation fold is also passed in eval_set, so early stopping may be
    # tuned on the same rows that are scored below; the CV scores are then likely optimistic.
    cat2.fit(X_Train, Y_Train, eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)], verbose = False)

    pred = cat2.predict(X_Test)

    cat2_f1_scores.append(f1_score(Y_Test, pred))
    cat2_acc_scores.append(accuracy_score(Y_Test, pred)*100)
    cat2_models.append(cat2)

    print("Iteration {} Completed".format(i))

Iteration 1 Completed
Iteration 2 Completed
Iteration 3 Completed
Iteration 4 Completed
Iteration 5 Completed


In [31]:
# Report the mean F1 and mean accuracy over the folds for each product-type model.
fold_results = (
    ("CAT1", cat1_f1_scores, cat1_acc_scores),
    ("CAT2", cat2_f1_scores, cat2_acc_scores),
)
for label, f1_values, acc_values in fold_results:
    print(label + " F1 Score: ", sum(f1_values)/len(f1_values))
    print(label + " Accuracy: ", sum(acc_values)/len(acc_values), "%")

CAT1 F1 Score:  0.6287971859703518
CAT1 Accuracy:  71.29488416464525 %
CAT2 F1 Score:  0.4259521729788747
CAT2 Accuracy:  73.58596572836204 %


In [None]:
'''
0.59616
SMOTE neighbours = 5
xgb = CatBoostClassifier(n_estimators = 10000, max_depth = 8, eval_metric = 'CrossEntropy',
                             bootstrap_type='Bernoulli',
                        learning_rate = 0.01, random_state = 0, early_stopping_rounds = 1000,
                        scale_pos_weight = 59145/23855)

Bayesian/MVS- 0.59497
Bernoulli - 0.59616

model = xgb_models[0]

plt.figure(figsize = (10,10))
plot_importance(model, max_num_features = 5)
plt.show()'''

In [32]:
# Rows from position 83000 onwards are the rows that came from the test file.
test = df.iloc[83000:, :]

# For each product type: keep the IDs aside, then drop the non-feature columns.
# drop() returns a new frame here instead of mutating the filtered frame in place.
test1 = test.loc[test['TypeofProduct'] == 0]
test1_ID = test1['ID']
test1 = test1.drop(columns = drop_cols)

test2 = test.loc[test['TypeofProduct'] == 1]
test2_ID = test2['ID']
test2 = test2.drop(columns = drop_cols)

In [33]:
# Soft-voting ensemble: add up the class probabilities from every fold model and take the
# class with the largest total. Iterating over the model lists (instead of indexing 0..4)
# keeps this correct if the number of folds changes; the summation order is unchanged.
preds1 = np.argmax(sum(model.predict_proba(test1) for model in cat1_models), axis = 1)
preds2 = np.argmax(sum(model.predict_proba(test2) for model in cat2_models), axis = 1)


In [34]:
# Wrap the predicted class labels in a DataFrame; its single column is labelled 0.
preds1 = pd.DataFrame(preds1)
preds1.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
30487,0
30488,1
30489,0
30490,1


In [41]:
# Attach the application IDs positionally (row order matches test1) and store them as integers;
# they became floats when every column was cast to float earlier.
preds1['ID'] = test1_ID.to_numpy().astype('int64')
preds1.head()


Unnamed: 0,0,ID
0,0,578069
1,0,578070
2,0,578071
3,0,578072
4,0,578074


In [43]:
# Same as for preds1: the predictions go in column 0, and the integer application IDs are
# attached positionally (row order matches test2).
preds2 = pd.DataFrame(preds2)
preds2['ID'] = test2_ID.to_numpy().astype('int64')
preds2.head()

Unnamed: 0,0,ID
0,1,578073
1,0,578075
2,1,578077
3,0,578078
4,0,578081


In [45]:
# Stack the predictions of both product types. Each part keeps its own 0-based row labels,
# so index labels repeat in the combined frame.
preds = pd.concat([preds1, preds2])
preds.shape

(47000, 2)

In [50]:
# Order the predictions by application ID (in place).
preds.sort_values('ID', inplace = True)

In [54]:
preds.head()

Unnamed: 0,0,ID
9687,0,280038
9688,1,280039
9689,0,280040
9690,0,280041
9691,0,280042


In [35]:
#preds = best_clf_rf.best_estimator_.predict(test)

In [48]:
# The submission file has no header row, so its columns are labelled 0 and 1;
# column 0 holds the application IDs (see the output).
sample = pd.read_csv('submission.csv', header = None)
sample.head()

Unnamed: 0,0,1
0,578069,0
1,578070,1
2,578071,1
3,578072,1
4,578073,0


In [51]:
# Order the submission template by its ID column (column 0), in place; row labels are kept.
sample.sort_values(0, inplace = True)

In [56]:
# Positional re-labelling: row i of preds receives the label of row i of sample.
# This only pairs up the right rows if BOTH frames are already sorted by ID and contain the
# same IDs; running the cells in a different order misaligns the rows without any error.
preds.index = sample.index

In [57]:
# Write the predictions into the submission frame by matching on application ID
# (sample column 0 against preds['ID']) instead of relying on the row index.
# The result is therefore independent of how either frame is sorted or indexed.
pred_by_id = preds.set_index('ID')[0]
sample[1] = sample[0].map(pred_by_id)

# Fail loudly if any submission ID has no prediction, rather than writing NaN.
assert sample[1].notna().all(), "some submission IDs have no matching prediction"
sample[1] = sample[1].astype('int64')
sample.head()

Unnamed: 0,0,1
17006,280038,0
17007,280039,1
17008,280040,0
17009,280041,0
17010,280042,0


In [58]:
sample[1].value_counts()

0    29277
1    17723
Name: 1, dtype: int64

In [59]:
#sample.to_csv('Submissions/ThirdDegreeBurn_15.csv', index = False, header=False)