### Balance only the training data, never the test data (to avoid bias in the model)

In [1]:
# importing libraries
import pandas as pd
from collections import Counter
from itertools import combinations
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import StandardScaler

In [2]:
# load loan_train.csv from the local machine
# NOTE(review): this is an absolute, machine-specific path - adjust it when running elsewhere
LOAN_CSV_PATH = "C:\\Users\\Administrator\\Desktop\\ML 360\\loan_train.csv"
dataset = pd.read_csv(LOAN_CSV_PATH)

In [3]:
# dataset summary: column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Loan_ID             614 non-null    object 
 1   Gender              603 non-null    object 
 2   Married             611 non-null    object 
 3   Dependents          599 non-null    object 
 4   Education           614 non-null    object 
 5   Self_Employed       582 non-null    object 
 6   Applicant_Income    614 non-null    int64  
 7   Coapplicant_Income  614 non-null    float64
 8   Loan_Amount         592 non-null    float64
 9   Loan_Amount_Term    600 non-null    float64
 10  Credit_History      564 non-null    float64
 11  Property_Area       614 non-null    object 
 12  Loan_Status         614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
# preview the first rows of the raw data
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Loan_ID is a row identifier, not a feature, so it is removed.
# Reassigning (instead of inplace=True) and ignoring a missing column keeps this
# cell re-runnable: a second execution no longer raises KeyError.
dataset = dataset.drop(columns = ['Loan_ID'], errors = 'ignore')

In [6]:
# number of missing values in each column
dataset.isna().sum()

Gender                11
Married                3
Dependents            15
Education              0
Self_Employed         32
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount           22
Loan_Amount_Term      14
Credit_History        50
Property_Area          0
Loan_Status            0
dtype: int64

In [7]:
# impute categorical features with their most frequent value (mode)
# NOTE(review): Credit_History is imputed here as a category but is later scaled as a numeric column - confirm intent
mode_imputed_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
for column in mode_imputed_cols:
    # Assign the filled column back to the frame. Calling fillna(..., inplace=True) on
    # dataset[column] is chained assignment: it warns on recent pandas and leaves
    # `dataset` unmodified when Copy-on-Write is enabled.
    dataset[column] = dataset[column].fillna(dataset[column].mode()[0])

In [8]:
# impute the remaining (numerical) features with their column mean
# numeric_only=True restricts the mean to numeric columns; without it, recent pandas
# raises TypeError because the frame still contains object (string) columns.
dataset = dataset.fillna(dataset.mean(numeric_only = True))

In [9]:
# find NA value across columns
# dataset.isnull().sum()

In [10]:
# relative frequency of each level of the target variable
target_share = dataset['Loan_Status'].value_counts(normalize = True)
target_share

Y    0.687296
N    0.312704
Name: Loan_Status, dtype: float64

In [11]:
# input variables only: every column except the target
dup_dataset = dataset.drop(columns = ['Loan_Status'])

In [12]:
# names of the input columns, as a plain list
cols = dup_dataset.columns.tolist()

In [13]:
# split the input columns by dtype: int64/float64 count as numerical, everything else as non-numerical
numeric_dtypes = ['int64', 'float64']
num_cols, non_num_cols = [], []
for col_name in dup_dataset.columns:
    if dup_dataset[col_name].dtype in numeric_dtypes:
        num_cols.append(col_name)
    else:
        non_num_cols.append(col_name)

# one frame per group, each keeping the original column order
num_dataset = dup_dataset.drop(columns = non_num_cols) # numerical dataset
non_num_dataset = dup_dataset.drop(columns = num_cols) # non-numerical dataset

In [14]:
# Scaling numerical dataset (can also try normalization)
scaler = StandardScaler()
num_dataset_scaled = scaler.fit_transform(num_dataset) # format is array
# Rebuild the dataframe on the ORIGINAL index: pd.concat(axis = 1) aligns rows by index,
# so a fresh 0..n-1 index would misalign (and introduce NaN) whenever the data
# does not carry a default RangeIndex, e.g. after filtering or a train/test split.
num_dataset_scaled = pd.DataFrame(columns = num_cols, data = num_dataset_scaled, index = num_dataset.index)
scaled_dataset = pd.concat([num_dataset_scaled, non_num_dataset], axis = 1) # concatenating with non-numerical features
# scaled_dataset.head()

In [15]:
# positions of the non-numerical (categorical) columns inside scaled_dataset
non_num_col_indices = [
    scaled_dataset.columns.get_loc(col_name)
    for col_name in scaled_dataset.columns
    if col_name in non_num_cols
]

In [16]:
# model inputs (scaled features) and output (target column)
X, y = scaled_dataset, dataset['Loan_Status']

In [17]:
# SMOTENC oversampler; sampling_strategy = 0.7 requests a minority/majority ratio of 0.7 after resampling
smote_nc = SMOTENC(
    categorical_features = non_num_col_indices,
    sampling_strategy = 0.7,
    random_state = 123,
)

# resample inputs and target together
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

In [18]:
# class counts after resampling, sorted by class label
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[('N', 295), ('Y', 422)]


In [19]:
# input column names in the order used by scaled_dataset: numerical first, then non-numerical
input_cols = [*num_cols, *non_num_cols]
# input_cols

In [20]:
# resampled input features, with the columns named and ordered as in input_cols
features_resampled = pd.DataFrame(data = X_resampled, columns = input_cols)
# resampled target as a one-column frame
target_resampled = pd.DataFrame(y_resampled)
# join features and target side by side; a target column labelled 0 is renamed to the target's name
new_dataset = pd.concat([features_resampled, target_resampled], axis = 1)
new_dataset = new_dataset.rename(columns = {0: y.name})

In [21]:
# preview the first rows of the resampled dataset (scaled numerical columns first, target last)
new_dataset.head()

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,0.072991,-0.554487,0.0,0.279851,0.411733,Male,No,0,Graduate,No,Urban,Y
1,-0.134412,-0.038732,-0.219273,0.279851,0.411733,Male,Yes,1,Graduate,No,Rural,N
2,-0.393747,-0.554487,-0.957641,0.279851,0.411733,Male,Yes,0,Graduate,Yes,Urban,Y
3,-0.462062,0.25198,-0.314547,0.279851,0.411733,Male,Yes,0,Not Graduate,No,Urban,Y
4,0.097728,-0.554487,-0.064454,0.279851,0.411733,Male,No,0,Graduate,No,Urban,Y


In [22]:
"""
SMOTE-NC Description:
    1. SMOTE-NC as it appears, is specifically used for Nominal and Continuous data types. 
    2. A data point is randomly selected from the minority class (the class being oversampled) and another sample is randomly selected from its k nearest neighbours (default k = 5). Distances between samples are computed using the Euclidean distance measure.
    3. The synthetic instance is generated as a convex combination of the two chosen instances a and b, i.e. a point on the line segment between them: a + gap * (b - a), with gap drawn from [0, 1]. Nominal features are not interpolated; they take the most frequent category among the nearest neighbours.
    4. Please refer the below links for detailed information. 
        - https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node15.html#SECTION00061000000000000000
        - https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
      
class_imbalance Description:
    1. Class imbalance handling cannot be performed on data with missing values; therefore, imputed data needs to be passed to the function
    2. This function at the moment cannot handle datetime variables
    3. First, the proportion of levels of the target feature is computed 
        3.1 If the difference of proportions for any two levels is > 0.15, only then we deal with class imbalance problem
    4. As SMOTENC works on nominal and continuous features, Scaling is performed on the numerical features before oversampling; 
       Normalization (on rows) can also be tried to compare the results
        4.1 Extract numerical columns & non-numerical columns
        4.2 Scale the numerical columns and then join them with the non-numerical columns
    5. Capture indices of non-numerical columns: parameter to SMOTENC 
    6. SMOTENC is fitted with sampling_strategy = 0.7 (minority/majority ratio of 0.7 after resampling) rather than 1.0 (fully balanced); different values can be tried
"""


def class_imbalance(self, target, threshold=0.15, sampling_strategy=0.7, random_state=123):
    """Oversample the minority class with SMOTENC when the target is imbalanced.

    Parameters
    ----------
    self : pandas.DataFrame
        Input features. Despite its name this is an ordinary function argument.
        Numeric columns are standard-scaled; all other columns are handed to
        SMOTENC as categorical features.
    target : pandas.Series
        Target variable, row-for-row with ``self``.
    threshold : float, default 0.15
        Resampling is only attempted when the proportions of any two target
        levels differ by more than this value.
    sampling_strategy : float, default 0.7
        Forwarded to SMOTENC. As a float it is the desired minority/majority
        ratio after resampling.
    random_state : int, default 123
        Forwarded to SMOTENC.

    Returns
    -------
    pandas.DataFrame
        * When resampling happens: the scaled numeric columns, followed by the
          non-numeric columns, followed by the target column.
        * Otherwise: ``self`` and ``target`` concatenated as they are
          (unscaled, original column order).

    Notes
    -----
    NOTE(review): SMOTENC is expected to need both numeric and categorical
    columns and, for a float ``sampling_strategy``, a binary target - confirm
    against the installed imbalanced-learn version.
    """
    # checking proportion of target variable across levels
    prop_list = target.value_counts(normalize=True).tolist()
    imbalanced = any(abs(a - b) > threshold for a, b in combinations(prop_list, 2))
    if not imbalanced:
        return pd.concat([self, target], axis=1)

    # With a numeric ratio on a binary target, the sampler has to create
    # int(n_majority * ratio - n_minority) rows. If that is not positive, the data
    # already meets the requested ratio and the sampler would raise ValueError,
    # so the data is returned untouched instead.
    class_counts = target.value_counts()
    if (
        isinstance(sampling_strategy, (int, float))
        and len(class_counts) == 2
        and int(class_counts.max() * sampling_strategy - class_counts.min()) <= 0
    ):
        return pd.concat([self, target], axis=1)

    # numerical vs. non-numerical columns (any numeric width, not only int64/float64)
    num_cols = self.select_dtypes(include='number').columns.tolist()
    non_num_cols = [col for col in self.columns if col not in num_cols]
    num_dataset = self[num_cols]
    non_num_dataset = self[non_num_cols]

    # Scaling numerical dataset (can also try normalization).
    # The scaled frame keeps the caller's index: pd.concat(axis=1) aligns rows by
    # index, so a fresh 0..n-1 index would misalign rows for any non-default index.
    scaler = StandardScaler()
    num_dataset_scaled = pd.DataFrame(
        scaler.fit_transform(num_dataset), columns=num_cols, index=self.index
    )
    scaled_dataset = pd.concat([num_dataset_scaled, non_num_dataset], axis=1)

    # positions of the categorical features inside scaled_dataset
    non_num_col_indices = [scaled_dataset.columns.get_loc(col) for col in non_num_cols]

    # Instantiating and fitting SMOTENC with the requested minority/majority ratio
    smote_nc = SMOTENC(
        categorical_features=non_num_col_indices,
        random_state=random_state,
        sampling_strategy=sampling_strategy,
    )
    X_resampled, y_resampled = smote_nc.fit_resample(scaled_dataset, target)

    # rebuild a single dataframe: resampled inputs followed by the resampled target
    input_cols = num_cols + non_num_cols
    new_dataset = pd.DataFrame(columns=input_cols, data=X_resampled)
    new_dataset = pd.concat([new_dataset, pd.DataFrame(y_resampled)], axis=1)
    # if the target came back unnamed (column 0), give it the original target name
    new_dataset.rename(columns={0: target.name}, inplace=True)
    return new_dataset
        

In [30]:
# preview of the imputed dataset (Loan_ID removed, missing values filled in)
dataset.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [26]:
# split the dataset into input features and the target column
output = dataset['Loan_Status']
inputs = dataset.drop(columns = "Loan_Status")

In [27]:
# run the helper on the imputed inputs and target; the returned dataframe is shown as the cell output
class_imbalance(inputs, output)

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,0.072991,-0.554487,0.000000,0.279851,0.411733,Male,No,0,Graduate,No,Urban,Y
1,-0.134412,-0.038732,-0.219273,0.279851,0.411733,Male,Yes,1,Graduate,No,Rural,N
2,-0.393747,-0.554487,-0.957641,0.279851,0.411733,Male,Yes,0,Graduate,Yes,Urban,Y
3,-0.462062,0.251980,-0.314547,0.279851,0.411733,Male,Yes,0,Not Graduate,No,Urban,Y
4,0.097728,-0.554487,-0.064454,0.279851,0.411733,Male,No,0,Graduate,No,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
712,-0.562379,-0.554487,-1.184483,0.279851,0.411733,Male,No,0,Graduate,No,Urban,N
713,-0.267382,-0.554487,-0.616413,0.279851,-2.428760,Female,No,0,Graduate,No,Semiurban,N
714,-0.082678,-0.034872,0.140444,0.279851,-2.428760,Male,Yes,0,Graduate,No,Urban,N
715,-0.220638,-0.370755,-0.098280,0.279851,0.411733,Male,Yes,2,Graduate,No,Urban,N
