In [30]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
import warnings
warnings.simplefilter('ignore')

- Submission_1 : Single XGB with random state 0 with SMOTE
- Submission_2 : Combination of 5 XGB and 5 Random Forest with Random state 0 with SMOTE
- Submission_3 : Combination of 5 Random Forest with Random state 0 with SMOTE
- Submission_4 : Submission 3 with Annual Income Dropped
- Submission_5 : Submission 2 with Annual Income Dropped

In [4]:
train = pd.read_csv('TrainingData.csv')
test = pd.read_csv('testX.csv')

# Stack both files row-wise (the default axis) so that all cleaning steps are
# applied once to a single frame. Each file keeps its own row labels here.
df = pd.concat([train, test])

df.shape

  train = pd.read_csv('TrainingData.csv')
  test = pd.read_csv('testX.csv')


(130000, 53)

In [5]:
# Inspect dtypes and non-null counts of the combined train + test frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130000 entries, 0 to 46999
Data columns (total 53 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   application_key  130000 non-null  int64  
 1   mvar1            130000 non-null  object 
 2   mvar2            120787 non-null  float64
 3   mvar3            129171 non-null  float64
 4   mvar4            129171 non-null  float64
 5   mvar5            129171 non-null  float64
 6   mvar6            130000 non-null  object 
 7   mvar7            130000 non-null  object 
 8   mvar8            130000 non-null  object 
 9   mvar9            130000 non-null  object 
 10  mvar10           130000 non-null  object 
 11  mvar11           130000 non-null  object 
 12  mvar12           130000 non-null  object 
 13  mvar13           130000 non-null  object 
 14  mvar14           130000 non-null  int64  
 15  mvar15           130000 non-null  object 
 16  mvar16           130000 non-null  objec

In [6]:
# Human-readable replacements for the raw column labels. Order matters: the
# list is meant to be assigned positionally, so entry i names the i-th column.
columns = [
    # identifier and headline score
    "ID",
    "Credit Worthiness Score",
    "Number and Riskiness of Credit Enquiry",
    # default severity
    "Default Severity (Any Loan)",
    "Default Severity (Auto Loan)",
    "Default Severity (Education Loan)",
    # credit amounts
    "Min Cred Available (Revolving Credit Cards)",
    "Max Cred Available (Active Credit Lines)",
    "Max Cred Available (Active Revolving Credit Cards)",
    "Total Available Credit (Cards with 1 missed payment)",
    "Total Credit Available",
    "Dues Collected Post Default",
    "Total Amount Due (Active Cards)",
    "Credit Amount Paid (Prev Year)",
    # income and property
    "Annual Income",
    "Property Value",
    # utilisation
    "Full Credit Utilisation (Revolving Credit Cards)",
    "Full Credit Utilisation (Credit Cards)",
    "Full Credit Utilisation (Credit Lines)",
    ">75 percent Credit Utilisation (Credit Cards)",
    ">75 percent Credit Utilisation (Credit Lines)",
    "Average Utilisation (Revolving Credit Cards)",
    "Average Utilisation Last 2 Years (All Credit Lines)",
    "Average Utilisation Last Year (All Credit Cards)",
    "Average Utilisation Last 6 Months with 1 Missed Payment (Credit Cards)",
    # tenures and durations
    "Average Tenure (Active Revolving Credit Cards)",
    "Tenure of Oldest Active Credit Card",
    "Tenure of Oldest Active Revolving Credit Card",
    "Days Since Last Missed Payment",
    "Tenure of Oldest Credit Line",
    "Max Tenure (Auto Loans)",
    "Max Tenure (Education Loans)",
    "Sum of Tenures (Active Credit Cards)",
    "Stay Duration of Current Residence",
    # credit line / card counts
    "Credit Lines with 1 Missed Payment (Last 6 Months)",
    "Revolving Credit Cards with 1 Missed Payment (Last 2 Years)",
    "Active Credit Lines",
    "Credit Cards with >2Y Tenure",
    "Credit Lines activated Prev 2 Years",
    "Credit Lines with Current Delinquency",
    # loan utilisation, stress and ratios
    "Utilization on Active Education Loans",
    "Utilization on Active Auto Loans",
    "Financial Stress Index",
    "Credit Lines with No Missed Payments (Prev 2 Years)",
    "Ratio(Max Amount Due : Sum of Amounts Due)",
    "Mortgage Loans (With 2 Missed Payements)",
    "Auto Loans (With 2 Missed Payments)",
    # product type and remaining fields
    "Type of Product",
    "Int Value for an Application",
    "Bucketized Credit Worthiness Score",
    "Compound of mvar49 and mvar48",
    "mvar51",
    # target
    "default_ind",
]

In [7]:
# Replace the raw column labels with the descriptive names, matched by position.
df.columns = columns

In [8]:
# Re-inspect the frame now that the descriptive column names are in place.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130000 entries, 0 to 46999
Data columns (total 53 columns):
 #   Column                                                                  Non-Null Count   Dtype  
---  ------                                                                  --------------   -----  
 0   ID                                                                      130000 non-null  int64  
 1   Credit Worthiness Score                                                 130000 non-null  object 
 2   Number and Riskiness of Credit Enquiry                                  120787 non-null  float64
 3   Default Severity (Any Loan)                                             129171 non-null  float64
 4   Default Severity (Auto Loan)                                            129171 non-null  float64
 5   Default Severity (Education Loan)                                       129171 non-null  float64
 6   Min Cred Available (Revolving Credit Cards)                          

### Feature Understanding (Explanations and Stuff to Look into)

- Auto Loans: Loans where Vehicles are kept as collateral
- Revolving Credit: You have a fixed amount that you can borrow within a fixed time frame, after which the amount you can borrow replenishes to the full amount, subject to an overall maximum limit. For example, suppose you can borrow at most 100 rupees a month and the overall limit on that credit line is 500 rupees. If your overall borrowed amount is 460 rupees and you haven't borrowed anything so far this month, you will be able to borrow only 40 rupees even though the monthly limit is 100.
- Difference between Credit Worthiness Score and its bucketized version?
- Difference in statistics between train and test (need for adversarial validation?)

### Feature Ideas and Experiments
- Binary Variable for checking if the person has moved into current residence after issuing any credit lines
- Credit Amount Paid / Number of Credit Lines with No Missed Payments
- Max Cred Available vs Default Severity
- Separate modeling for 0 Default Severity vs Non-Zero Default Severity
- KNN Missing Values Fill In
- Property Value vs Annual Income (Some Indicator of whether or not a person spends too much for their Income)
- 75% utilisation / Full Utilisation

In [9]:
# Preview the first 10 rows to see how missing values are encoded in the raw data.
df.head(10)

Unnamed: 0,ID,Credit Worthiness Score,Number and Riskiness of Credit Enquiry,Default Severity (Any Loan),Default Severity (Auto Loan),Default Severity (Education Loan),Min Cred Available (Revolving Credit Cards),Max Cred Available (Active Credit Lines),Max Cred Available (Active Revolving Credit Cards),Total Available Credit (Cards with 1 missed payment),Total Credit Available,Dues Collected Post Default,Total Amount Due (Active Cards),Credit Amount Paid (Prev Year),Annual Income,Property Value,Full Credit Utilisation (Revolving Credit Cards),Full Credit Utilisation (Credit Cards),Full Credit Utilisation (Credit Lines),>75 percent Credit Utilisation (Credit Cards),>75 percent Credit Utilisation (Credit Lines),Average Utilisation (Revolving Credit Cards),Average Utilisation Last 2 Years (All Credit Lines),Average Utilisation Last Year (All Credit Cards),Average Utilisation Last 6 Months with 1 Missed Payment (Credit Cards),Average Tenure (Active Revolving Credit Cards),Tenure of Oldest Active Credit Card,Tenure of Oldest Active Revolving Credit Card,Days Since Last Missed Payment,Tenure of Oldest Credit Line,Max Tenure (Auto Loans),Max Tenure (Education Loans),Sum of Tenures (Active Credit Cards),Stay Duration of Current Residence,Credit Lines with 1 Missed Payment (Last 6 Months),Revolving Credit Cards with 1 Missed Payment (Last 2 Years),Active Credit Lines,Credit Cards with >2Y Tenure,Credit Lines activated Prev 2 Years,Credit Lines with Current Delinquency,Utilization on Active Education Loans,Utilization on Active Auto Loans,Financial Stress Index,Credit Lines with No Missed Payments (Prev 2 Years),Ratio(Max Amount Due : Sum of Amounts Due),Mortgage Loans (With 2 Missed Payements),Auto Loans (With 2 Missed Payments),Type of Product,Int Value for an Application,Bucketized Credit Worthiness Score,Compound of mvar49 and mvar48,mvar51,default_ind
0,230032,1696,1.6541,0.0,0.0,0.0,0,6015,322,40369,18414,missing,6423,3067,123875,missing,1,1,1,3,3,94.78,8987.18,,72.25,1462,4532,2890,61,4532,1095,3376,625,1.1667,1,0,8,10,4,1,73.78,82.547,0.08696,10,0.63899,na,0,C,10,770,4,3080,0.0
1,230033,1846,0.8095,0.0,0.0,0.0,102,7532,3171,18234,13664,missing,765,1931,42613,missing,0,0,0,0,0,74.25,953.06,953.06,4.8,1028,2099,2099,30386,2281,missing,2251,169,0.4167,0,0,8,0,2,0,99.129,missing,0,13,0.63836,na,na,L,732,437,5,2185,1.0
2,230034,1745,0.4001,0.0,0.0,0.0,missing,2536,missing,missing,2536,missing,missing,missing,76109,missing,na,na,na,0,0,,,,,missing,missing,missing,669,4623,3772,missing,missing,25.0833,0,na,1,na,1,0,missing,29.29,0,1,1.0,na,0,C,89,795,4,3180,1.0
3,230035,1739,0.2193,0.0,0.0,0.0,1982,26440,4955,20316,37013,missing,0,0,84235,missing,0,0,0,0,0,,0.0,,0.0,1308,2525,791,91,5992,missing,3741,215,10.3333,0,0,3,3,2,0,96.272,missing,0.15385,3,0.53241,0,0,L,3,755,4,3020,0.0
4,230036,1787,0.0118,0.225,0.0,0.0,5451,5494,5494,7987,4696,missing,2257,27815,123875,524848,0,0,0,0,0,20.51,796.67,,28.72,801,2281,2281,487,2707,missing,1947,158,0.9167,0,0,2,3,2,0,115.019,missing,0,1,0.92665,na,na,L,5,425,4,1700,0.0
5,230037,1579,,3.502,0.0,0.0,missing,missing,missing,missing,0,199,missing,missing,77298,missing,na,na,na,0,0,,,,,missing,missing,missing,183,973,missing,missing,missing,1.3333,0,na,2,na,2,na,missing,missing,1.5,0,,na,na,C,35,455,4,1820,1.0
6,230038,1818,0.4001,0.0,0.0,0.0,missing,1088,missing,1536,1498,missing,448,2331,123875,missing,na,0,0,0,0,,,,29.16,1034,760,missing,30386,3437,missing,791,34,9.3333,0,na,2,0,0,0,88.171,missing,0,2,0.87224,na,0,C,2,392,5,1960,1.0
7,230039,na,,,,,missing,missing,missing,missing,missing,missing,missing,missing,198200,1263525,na,na,na,0,na,,,,,missing,missing,missing,missing,missing,missing,missing,missing,0.0833,na,na,na,na,na,na,missing,missing,missing,na,,na,na,C,2,615,#VALUE!,#VALUE!,0.0
8,230040,1836,0.1358,0.0,0.0,0.0,347,38964,17828,70729,65843,missing,16298,18877,109010,missing,0,0,0,1,1,32.02,0.0,0.0,32.92,3180,8943,8943,1490,12075,missing,missing,1673,5.9167,0,na,6,12,2,0,missing,missing,0,10,0.89868,0,0,L,5,821,5,4105,1.0
9,230041,1839,0.1981,0.0,0.0,0.0,793,6131,6045,48959,31640,missing,10565,14282,99100,146668,0,0,0,1,1,46.17,,,49.07,1701,8912,2707,61,11193,4593,missing,615,1.75,0,na,10,7,0,0,missing,45.59,0.08824,14,0.33834,na,0,L,3247,408,5,2040,0.0


In [10]:
# The raw data marks missing values with several string sentinels; turn each
# of them into a real NaN, one sentinel at a time.
for placeholder in ('missing', 'na', '#VALUE!'):
    df = df.replace(placeholder, np.nan)

In [11]:
# Preview the same rows again to confirm the string placeholders are gone.
df.head(10)

Unnamed: 0,ID,Credit Worthiness Score,Number and Riskiness of Credit Enquiry,Default Severity (Any Loan),Default Severity (Auto Loan),Default Severity (Education Loan),Min Cred Available (Revolving Credit Cards),Max Cred Available (Active Credit Lines),Max Cred Available (Active Revolving Credit Cards),Total Available Credit (Cards with 1 missed payment),Total Credit Available,Dues Collected Post Default,Total Amount Due (Active Cards),Credit Amount Paid (Prev Year),Annual Income,Property Value,Full Credit Utilisation (Revolving Credit Cards),Full Credit Utilisation (Credit Cards),Full Credit Utilisation (Credit Lines),>75 percent Credit Utilisation (Credit Cards),>75 percent Credit Utilisation (Credit Lines),Average Utilisation (Revolving Credit Cards),Average Utilisation Last 2 Years (All Credit Lines),Average Utilisation Last Year (All Credit Cards),Average Utilisation Last 6 Months with 1 Missed Payment (Credit Cards),Average Tenure (Active Revolving Credit Cards),Tenure of Oldest Active Credit Card,Tenure of Oldest Active Revolving Credit Card,Days Since Last Missed Payment,Tenure of Oldest Credit Line,Max Tenure (Auto Loans),Max Tenure (Education Loans),Sum of Tenures (Active Credit Cards),Stay Duration of Current Residence,Credit Lines with 1 Missed Payment (Last 6 Months),Revolving Credit Cards with 1 Missed Payment (Last 2 Years),Active Credit Lines,Credit Cards with >2Y Tenure,Credit Lines activated Prev 2 Years,Credit Lines with Current Delinquency,Utilization on Active Education Loans,Utilization on Active Auto Loans,Financial Stress Index,Credit Lines with No Missed Payments (Prev 2 Years),Ratio(Max Amount Due : Sum of Amounts Due),Mortgage Loans (With 2 Missed Payements),Auto Loans (With 2 Missed Payments),Type of Product,Int Value for an Application,Bucketized Credit Worthiness Score,Compound of mvar49 and mvar48,mvar51,default_ind
0,230032,1696.0,1.6541,0.0,0.0,0.0,0.0,6015.0,322.0,40369.0,18414.0,,6423.0,3067.0,123875,,1.0,1.0,1.0,3,3.0,94.78,8987.18,,72.25,1462.0,4532.0,2890.0,61.0,4532.0,1095.0,3376.0,625.0,1.1667,1.0,0.0,8.0,10.0,4.0,1.0,73.78,82.547,0.08696,10.0,0.63899,,0.0,C,10,770,4.0,3080.0,0.0
1,230033,1846.0,0.8095,0.0,0.0,0.0,102.0,7532.0,3171.0,18234.0,13664.0,,765.0,1931.0,42613,,0.0,0.0,0.0,0,0.0,74.25,953.06,953.06,4.8,1028.0,2099.0,2099.0,30386.0,2281.0,,2251.0,169.0,0.4167,0.0,0.0,8.0,0.0,2.0,0.0,99.129,,0.0,13.0,0.63836,,,L,732,437,5.0,2185.0,1.0
2,230034,1745.0,0.4001,0.0,0.0,0.0,,2536.0,,,2536.0,,,,76109,,,,,0,0.0,,,,,,,,669.0,4623.0,3772.0,,,25.0833,0.0,,1.0,,1.0,0.0,,29.29,0.0,1.0,1.0,,0.0,C,89,795,4.0,3180.0,1.0
3,230035,1739.0,0.2193,0.0,0.0,0.0,1982.0,26440.0,4955.0,20316.0,37013.0,,0.0,0.0,84235,,0.0,0.0,0.0,0,0.0,,0.0,,0.0,1308.0,2525.0,791.0,91.0,5992.0,,3741.0,215.0,10.3333,0.0,0.0,3.0,3.0,2.0,0.0,96.272,,0.15385,3.0,0.53241,0.0,0.0,L,3,755,4.0,3020.0,0.0
4,230036,1787.0,0.0118,0.225,0.0,0.0,5451.0,5494.0,5494.0,7987.0,4696.0,,2257.0,27815.0,123875,524848.0,0.0,0.0,0.0,0,0.0,20.51,796.67,,28.72,801.0,2281.0,2281.0,487.0,2707.0,,1947.0,158.0,0.9167,0.0,0.0,2.0,3.0,2.0,0.0,115.019,,0.0,1.0,0.92665,,,L,5,425,4.0,1700.0,0.0
5,230037,1579.0,,3.502,0.0,0.0,,,,,0.0,199.0,,,77298,,,,,0,0.0,,,,,,,,183.0,973.0,,,,1.3333,0.0,,2.0,,2.0,,,,1.5,0.0,,,,C,35,455,4.0,1820.0,1.0
6,230038,1818.0,0.4001,0.0,0.0,0.0,,1088.0,,1536.0,1498.0,,448.0,2331.0,123875,,,0.0,0.0,0,0.0,,,,29.16,1034.0,760.0,,30386.0,3437.0,,791.0,34.0,9.3333,0.0,,2.0,0.0,0.0,0.0,88.171,,0.0,2.0,0.87224,,0.0,C,2,392,5.0,1960.0,1.0
7,230039,,,,,,,,,,,,,,198200,1263525.0,,,,0,,,,,,,,,,,,,,0.0833,,,,,,,,,,,,,,C,2,615,,,0.0
8,230040,1836.0,0.1358,0.0,0.0,0.0,347.0,38964.0,17828.0,70729.0,65843.0,,16298.0,18877.0,109010,,0.0,0.0,0.0,1,1.0,32.02,0.0,0.0,32.92,3180.0,8943.0,8943.0,1490.0,12075.0,,,1673.0,5.9167,0.0,,6.0,12.0,2.0,0.0,,,0.0,10.0,0.89868,0.0,0.0,L,5,821,5.0,4105.0,1.0
9,230041,1839.0,0.1981,0.0,0.0,0.0,793.0,6131.0,6045.0,48959.0,31640.0,,10565.0,14282.0,99100,146668.0,0.0,0.0,0.0,1,1.0,46.17,,,49.07,1701.0,8912.0,2707.0,61.0,11193.0,4593.0,,615.0,1.75,0.0,,10.0,7.0,0.0,0.0,,45.59,0.08824,14.0,0.33834,,0.0,L,3247,408,5.0,2040.0,0.0


In [12]:
# Discard the row labels inherited from the two source files and renumber the
# combined frame from 0, so positions and labels agree.
df = df.reset_index(drop=True)

In [13]:
# Use -1 as the sentinel for every missing value in the frame. This is applied
# to all columns, so missing target values become -1 as well.
df = df.fillna(-1)

In [14]:
def convert_to_int(entry) -> float:
    """Coerce a single value to ``float``.

    Despite the name, no integer conversion takes place: the value is handed
    to the built-in ``float`` and the result is returned unchanged.

    Parameters
    ----------
    entry
        Anything ``float()`` accepts, e.g. a number or a numeric string.

    Returns
    -------
    float
        ``float(entry)``.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``float()`` when ``entry`` cannot be converted.
    """
    as_float = float(entry)
    return as_float

# Coerce every column to float, value by value. 'Type of Product' is skipped
# and left as-is.
for col in df.columns:
    if col == 'Type of Product':
        continue
    df[col] = df[col].map(convert_to_int)

# Spot-check one converted value.
df['Credit Worthiness Score'][0]

1696.0

In [15]:
# Confirm the dtypes and non-null counts after filling and float conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130000 entries, 0 to 129999
Data columns (total 53 columns):
 #   Column                                                                  Non-Null Count   Dtype  
---  ------                                                                  --------------   -----  
 0   ID                                                                      130000 non-null  float64
 1   Credit Worthiness Score                                                 130000 non-null  float64
 2   Number and Riskiness of Credit Enquiry                                  130000 non-null  float64
 3   Default Severity (Any Loan)                                             130000 non-null  float64
 4   Default Severity (Auto Loan)                                            130000 non-null  float64
 5   Default Severity (Education Loan)                                       130000 non-null  float64
 6   Min Cred Available (Revolving Credit Cards)                         

In [16]:
# Class balance of the target. The -1 bucket holds rows whose label was missing
# before fillna(-1) -- presumably the unlabelled test rows; verify against testX.csv.
df['default_ind'].value_counts()

 0.0    59145
-1.0    47000
 1.0    23855
Name: default_ind, dtype: int64

In [17]:
# Encode the product type as integer class codes. The fitted encoder is kept
# in `le` so the code -> label mapping stays available.
le = LabelEncoder()
le.fit(df['Type of Product'])
df['Type of Product'] = le.transform(df['Type of Product'])

In [18]:
import re

# Strip every character that is not a letter, digit or underscore from the
# column labels (e.g. 'Annual Income' becomes 'AnnualIncome').
df = df.rename(columns={label: re.sub('[^A-Za-z0-9_]+', '', label) for label in df.columns})

In [19]:
# The first 83,000 rows are taken as the training portion -- presumably the
# rows that came from TrainingData.csv; confirm against that file's length.
train = df.iloc[:83000]

# Target vector, and the feature matrix without the identifier, the target
# itself and the annual income column.
y_data = train['default_ind']
X_data = train.drop(columns=['ID', 'default_ind', 'AnnualIncome'])

In [45]:
# Stratified 85/15 split so both parts keep the class ratio of y_data.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X_data,
    y_data,
    test_size=0.15,
    stratify=y_data,
    random_state=0,
)

# NOTE(review): per the scikit-learn docs `learning_rate` only applies to
# solver='sgd'; no solver is passed here, so 'adaptive' likely has no effect
# -- confirm. The features are also fed in unscaled (no scaler is applied in
# the visible cells), which may explain the large loss values.
nn = MLPClassifier(
    hidden_layer_sizes=(16, 8),
    learning_rate='adaptive',
    learning_rate_init=1e-5,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=25,
    random_state=0,
    verbose=True,
)
nn.fit(X_Train, Y_Train)

Iteration 1, loss = 14.01947988
Validation score: 0.600992
Iteration 2, loss = 14.06049578
Validation score: 0.601276
Iteration 3, loss = 14.12413610
Validation score: 0.597449
Iteration 4, loss = 14.20216158
Validation score: 0.596456
Iteration 5, loss = 14.24928705
Validation score: 0.598441
Iteration 6, loss = 14.23369374
Validation score: 0.598866
Iteration 7, loss = 14.14374769
Validation score: 0.603685
Iteration 8, loss = 13.94603901
Validation score: 0.608505
Iteration 9, loss = 13.79953441
Validation score: 0.610631
Iteration 10, loss = 13.63430329
Validation score: 0.616017
Iteration 11, loss = 13.44011252
Validation score: 0.619419
Iteration 12, loss = 13.22330253
Validation score: 0.623671
Iteration 13, loss = 13.05463904
Validation score: 0.629624
Iteration 14, loss = 12.83028220
Validation score: 0.636145
Iteration 15, loss = 12.64280097
Validation score: 0.641106
Iteration 16, loss = 12.47654599
Validation score: 0.649327
Iteration 17, loss = 12.34645656
Validation score

In [46]:
# Loss value stored on the fitted classifier after training stopped.
nn.loss_

11.27847054212126

In [47]:
# Score the fitted network on the 15% hold-out split.
preds = nn.predict(X_Test)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy and the
# binary F1 score come out the same either way, but the documented order is
# used so that swapping in an asymmetric metric later stays correct.
print("Accuracy Score: ", accuracy_score(Y_Test, preds)*100)
print("F1 Score: ", f1_score(Y_Test, preds))

Accuracy Score:  67.34939759036145
F1 Score:  0.3851157162305249


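At this point I've hit a wall with scikit-learn's MLPClassifier: it is not clear what early stopping is actually being computed on. (With `early_stopping=True` it is documented to monitor the score on the internal `validation_fraction` hold-out rather than the training loss.) On top of that, neither the loss nor the validation score shows a minimum or maximum at the point where it claims to have reached an optimum. This approach turned out to be unproductive and cost a lot of time, so I'm switching to PyTorch.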

In [48]:
# Everything from position 83000 onward, with the same three columns removed
# as for the training features.
test = df.iloc[83000:].drop(columns=['ID', 'default_ind', 'AnnualIncome'])

In [None]:
# Logic for Prediction
# NOTE(review): placeholder only -- this cell holds no executable code.

In [56]:
# Load the submission template without a header row, so its two columns are
# labelled 0 and 1.
sample = pd.read_csv('submission.csv', header = None)
sample.head()

Unnamed: 0,0,1
0,578069,0
1,578070,1
2,578071,1
3,578072,1
4,578073,0


In [57]:
# `preds` holds the predictions for the validation split (X_Test), which has a
# different number of rows than the submission template. Assigning it here only
# worked through leftover kernel state and breaks on Restart & Run All.
# Predict on the `test` frame explicitly instead, and cast to int so the labels
# match the 0/1 format of the template.
# NOTE(review): assumes the rows of `test` are in the same order as the IDs in
# submission.csv -- confirm.
sample[1] = nn.predict(test).astype(int)
sample.head()

Unnamed: 0,0,1
0,578069,0
1,578070,0
2,578071,0
3,578072,0
4,578073,0


In [58]:
# Distribution of the labels written into the submission frame.
sample[1].value_counts()

0    34808
1    12192
Name: 1, dtype: int64

In [59]:
# Export of the submission file is disabled (commented out); uncomment to write the CSV.
#sample.to_csv('Submissions/ThirdDegreeBurn_5.csv', index = False, header=False)