In [37]:
options(warn=-1)
rm(list=ls())
library(ranger)
library(MASS)
library(readxl)
library(class)
library(gmodels)
library(dplyr)
library(SDMTools)
library(eqs2lavaan)
library(reshape2)
library(purrr)
library(tidyr)
library(ggplot2)
require(caTools)
library(fastDummies)
library(crossval)

In [38]:
df <- read.csv("small_cleaned_data.csv")
df <- select(df, -X)
names(df)

In [39]:
for (i in names(df %>% keep(is.factor))){
    print(i)
    print(length(unique(df[[i]])))
}

[1] "LoanTenor"
[1] 6
[1] "SectorDescription"
[1] 5
[1] "MariatlStatus"
[1] 4
[1] "MailThrough"
[1] 2
[1] "Qualification"
[1] 7
[1] "ProfessionDesc"
[1] 3
[1] "Month_and_Year"
[1] 83


In [40]:
# loan type is actualy a factor, so we change its type to factor
df$loantype <- as.factor(df$loantype)

# Now we crate dummy variables for factor variables
df <- fastDummies::dummy_columns(df)


# Now we remove original factor variables
df <- df[!names(df)  %in% names(df %>% keep(is.factor))]

In [41]:
df <- df[!names(df) == 'Month_and_Year']

In [42]:
table(df$DPD30)


     0      1 
101510  12283 

In [43]:
prop.table(table(df$DPD30))


        0         1 
0.8920584 0.1079416 

In [44]:
str(df)

'data.frame':	113793 obs. of  124 variables:
 $ LoanAmount                                  : num  50000 100000 120000 100000 50000 100000 50000 40000 40000 50000 ...
 $ EstablishedSinceMonths                      : int  36 72 204 144 24 120 36 36 120 60 ...
 $ CurrentAddressSinceMonths                   : int  240 84 240 36 60 120 12 48 60 120 ...
 $ DocumentCharge                              : int  1000 3000 3000 3000 1000 2500 1600 1000 1000 1600 ...
 $ EMI                                         : int  5420 10670 10800 10670 5420 10667 5417 4330 4330 5417 ...
 $ InterestRate                                : num  30 28 28 28 30 28 30 30 30 30 ...
 $ InsuranceRate                               : num  1.1 1.1 1.1 1.35 1.1 0 0 1.1 1.35 0 ...
 $ InsuranceAmount                             : int  716 1408 2228 1350 715 0 0 572 540 0 ...
 $ NoOfDependents                              : int  5 2 3 2 4 4 3 5 4 6 ...
 $ NetBusinessIncome                           : num  30315 40325 39750 22

In [45]:
# standardization numerical variables(except dummy variables)
for (i in names(df)){
    if (!mean(names(table(df[[i]])) == c('0', '1')) == 1){
        df[i] <- scale(df[i])
    }
}

# LDA

In [53]:
sample <-  sample.split(df$DPD30, SplitRatio = 0.8)
train <<- subset(df, sample == TRUE)
test  <<- subset(df, sample == FALSE)

# modeling (LDA)
a = 0.8
b = 0.2
r <<- lda(formula = DPD30 ~ ., data = train, prior=c(a,b))
plda <<-  predict(object = r, newdata = test)
base_line_accuracy <<- max(table(df$DPD30))/sum(table(df$DPD30))
pridiction_accuracy <<- sum(plda$class == test$DPD30)/nrow(test)

In [55]:
a = confusionMatrix(test$DPD30, plda$class)
a

In [10]:
zero_weight =  c()
one_weight =  c()
accuracy =  c()
FP  <- c()
TP <- c()
TN <- c()
FN <- c()
for (i in seq(0.001, 0.009, by = 0.005)){
    b = 1-i
        try({
            split_and_train(i, b)
            a = confusionMatrix(test$DPD30, plda$class)
#             P <- a["TP"] / (a["TP"] + a["FP"])
#             R <- a["TP"] / (a["TP"] + a['FN'])
    #         print(paste("Pricision: ", P))
    #         print(paste("Recall: ", R))
    #         print(paste("Accuracy: ", pridiction_accuracy))
            Pricisions <- append(Pricisions, P)
            Recalls <- append(Recalls, R)
            zero_weight <- append(zero_weight, i)
            one_weight <- append(one_weight, b)
            accuracy  <- append(accuracy, pridiction_accuracy)
            FP <- append(FP,a[1] )
            TP <- append(TP, a[2])
            TN <- append(TN, a[3])
            FN <- append(FN, a[4])
            
            silent = T})
    
}

Error in append(Pricisions, P) : object 'Pricisions' not found
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in lda.default(x, grouping, ...) : 
  variable 118 appears to be constant within groups
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in lda.default(x, grouping, ...) : 
  variable 119 appears to be constant within groups
Error in lda.default(x, grouping, ...) : 
  variables 118 119 appear to be constant within groups
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in append(Pricisions, P) : object 'Pricisions' not found
Error in lda.default(x, grouping, ...) : 
  variable 119 appears to be constant with

In [11]:
d = data.frame(cbind(zero_weight, one_weight, accuracy, FP, TP, TN, FN))
d['Precision'] = d$TP /(d$TP + d$FP)
d['Recall'] = d$TP / (d$TP + d$FN)

In [12]:
var_to_bins <- c("LoanAmount", "EstablishedSinceMonths", "CurrentAddressSinceMonths", "DocumentCharge", 
                 "EMI", "InterestRate","InsuranceRate", "InsuranceAmount", "NoOfDependents", 
                 "NetBusinessIncome", "NetDisposableIncome")
for (i in var_to_bins){
    df[i] <- cut(df[[i]], 3, include.lowest=TRUE)
}
str(df)

'data.frame':	113793 obs. of  124 variables:
 $ LoanAmount                                  : Factor w/ 3 levels "[-1.41,-0.0636]",..: 1 2 3 2 1 2 1 1 1 1 ...
 $ EstablishedSinceMonths                      : Factor w/ 3 levels "[-1.68,-0.089]",..: 1 1 3 2 1 2 1 1 2 1 ...
 $ CurrentAddressSinceMonths                   : Factor w/ 3 levels "[-2.01,-0.732]",..: 3 1 3 1 1 2 1 1 1 2 ...
 $ DocumentCharge                              : Factor w/ 3 levels "[-1.15,0.912]",..: 1 2 2 2 1 2 1 1 1 1 ...
 $ EMI                                         : Factor w/ 3 levels "[-1.57,2.04]",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ InterestRate                                : Factor w/ 3 levels "[-4.46,-1.71]",..: 3 2 2 2 3 2 3 3 3 3 ...
 $ InsuranceRate                               : Factor w/ 3 levels "[-1.16,-0.146]",..: 2 2 2 3 2 1 1 2 3 1 ...
 $ InsuranceAmount                             : Factor w/ 3 levels "[-0.914,1.1]",..: 1 2 2 2 1 1 1 1 1 1 ...
 $ NoOfDependents                              : Factor 

In [13]:
# m = read.csv(file = "clipboard", sep = "\t")
# names(m) <- c('Pricisions','Recalls','zero_weight','one_weight','accuracy','FP','TP','TN','FN')
# spliting the data into train and test
split_and_train <- function(a, b){
    sample <-  sample.split(df$DPD30, SplitRatio = 0.8)
    train <<- subset(df, sample == TRUE)
    test  <<- subset(df, sample == FALSE)

    # modeling (LDA)
    r <<- lda(formula = DPD30 ~ ., data = train, prior=c(a,b))
    plda <<-  predict(object = r, newdata = test)
    base_line_accuracy <<- max(table(df$DPD30))/sum(table(df$DPD30))
    pridiction_accuracy <<- sum(plda$class == test$DPD30)/nrow(test)
#     print(paste("Our test accuracy is ", round(pridiction_accuracy, 4)))
#     print(paste("Base line accuracy", round(base_line_accuracy, 4)))
#     print(paste("Test accuracy", round(pridiction_accuracy, 4)))
#     print(paste("Our model is batter than base line (without modeling) by ", 
#                 round((pridiction_accuracy - base_line_accuracy)*100, 4), "%", sep=""))
}
zero_weight =  c()
one_weight =  c()
accuracy =  c()
FP  <- c()
TP <- c()
TN <- c()
FN <- c()
library(crossval)
for (i in seq(0.01, 0.99, by = 0.05)){
    b = 1-i
        try({
            split_and_train(i, b)
            a = confusion.matrix(test$DPD30, plda$class, threshold = 0.5)
#             P <- a["TP"] / (a["TP"] + a["FP"])
#             R <- a["TP"] / (a["TP"] + a['FN'])
    #         print(paste("Pricision: ", P))
    #         print(paste("Recall: ", R))
    #         print(paste("Accuracy: ", pridiction_accuracy))
            zero_weight <- append(zero_weight, i)
            one_weight <- append(one_weight, b)
            accuracy  <- append(accuracy, pridiction_accuracy)
            FP <- append(FP,a[1] )
            TP <- append(TP, a[2])
            TN <- append(TN, a[3])
            FN <- append(FN, a[4])            
            silent = T})
    
}

Error in lda.default(x, grouping, ...) : 
  variable 21 appears to be constant within groups
Error in lda.default(x, grouping, ...) : 
  variables 129 130 appear to be constant within groups
Error in lda.default(x, grouping, ...) : 
  variable 130 appears to be constant within groups
Error in append(Recalls, R) : object 'Recalls' not found
Error in append(Recalls, R) : object 'Recalls' not found
Error in lda.default(x, grouping, ...) : 
  variable 134 appears to be constant within groups
Error in lda.default(x, grouping, ...) : 
  variable 21 appears to be constant within groups
Error in append(Recalls, R) : object 'Recalls' not found
Error in append(Recalls, R) : object 'Recalls' not found
Error in append(Recalls, R) : object 'Recalls' not found
Error in lda.default(x, grouping, ...) : 
  variable 130 appears to be constant within groups
Error in lda.default(x, grouping, ...) : 
  variable 21 appears to be constant within groups
Error in lda.default(x, grouping, ...) : 
  variable 129

In [14]:
d = data.frame(cbind(zero_weight, one_weight, accuracy, FP, TP, TN, FN))
d['Precision'] = d$TP /(d$TP + d$FP)
d['Recall'] = d$TP / (d$TP + d$FN)
d

Precision,Recall
