# SBAloans Big Data Modelling

In [1]:
#Checking file placement
import os 
os.getcwd()

'C:\\Users\\wowan\\OneDrive\\Skrivebord Bærebar\\BigData Exam\\bigdata'

## Import

In [2]:
#Importing modules and the processed .csv file, as done in the preprocessing phase.

# Data handling
import pandas as pd
import numpy as np

#training and test set
from sklearn.model_selection import train_test_split

# Evaluating classifiers
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

#balancing dataset with SMOTE
from imblearn.over_sampling import SMOTE 

#Dummy classifier
from sklearn.dummy import DummyClassifier

# Accuracy score for training- and test set
from sklearn.metrics import accuracy_score


# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# For Scaling the dataset
from sklearn.preprocessing import MinMaxScaler

#Logistic regression // Cross validation
from sklearn.linear_model import LogisticRegressionCV

# Visulaization of data
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt

#Classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


In [3]:
#Importing the processed .csv file

SBAloan_df = pd.read_csv('SBAnational_processed.csv', sep=',', dtype='unicode')

#Displaying data table
SBAloan_df.head()

Unnamed: 0.1,Unnamed: 0,State,BankState,NAICS,ApprovalFY,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross,MIS_Status
0,143841,20,25,45,2007,60,6,1,0,1,0,79500,0
1,143842,14,53,44,2007,60,1,2,0,1,0,35000,0
2,143845,14,53,44,2007,78,1,2,0,1,0,39737,0
3,143846,1,29,42,2007,58,20,1,0,1,0,100000,1
4,143851,45,2,51,2007,13,3,1,0,2,0,50000,0


### Defining target variable and baseline

In [4]:
#We start by creating a new DataFrame 'X' by selecting the columsn from the other DataFrame SBAloan_df
#We are leaving out Variable 'ApprovalFY' as it is not relevant

#Defining x
x = SBAloan_df[['State', 'BankState', 'NAICS', 'Term', 'NoEmp', 'NewExist', 'FranchiseCode', 'UrbanRural', 'LowDoc', 'DisbursementGross']]

#Defining Y (target variable) for machine learning
y = SBAloan_df['MIS_Status']

#Showing attributes
x.columns

Index(['State', 'BankState', 'NAICS', 'Term', 'NoEmp', 'NewExist',
       'FranchiseCode', 'UrbanRural', 'LowDoc', 'DisbursementGross'],
      dtype='object')

### Creating training- and test split

In [40]:
#Creating training- and test split of the data, with function from sklearn.model library

#splitting code intro training and test set, with 70% of data used for training and 30% for testing
#random seed is set to 42, for repoducibillity
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

x_train
x_test

Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross
83743,21,29,23,144,1,1,0,2,0,49965
123822,7,16,42,36,3,1,0,2,0,689737
161120,6,17,11,12,5,2,0,1,0,3000000
131852,9,8,54,120,6,1,0,1,0,615000
74928,9,36,81,60,0,2,0,1,0,40000
...,...,...,...,...,...,...,...,...,...,...
2901,6,17,44,47,1,1,0,1,0,136496
134806,3,40,62,78,3,1,0,2,0,35000
128280,38,19,23,36,12,1,0,2,0,21400
5627,10,26,48,60,4,2,0,2,0,135644


###  Baselinemodel and evaluating accuracy of training- and test set

In [41]:
#Creating Dummy classificer, that trains on the training data, for use to predict target variable values on the test data.
#Predicted values are stored in DummyClass_y_prediction

DummyClass = DummyClassifier()
DummyClass.fit(x_train, y_train)
DummyClass_y_prediction = DummyClass.predict(x_test)

#Predicted values are stored in DummyClass_y_prediction



In [42]:
#Printing accuracy of the model on both training set and the test set.
#Accuracy indicates the proportion of correctly predicted instances in the model.

DummyClass_y_train_prediction = DummyClass.predict(x_train)
DummyClass_y_test_prediction = DummyClass.predict(x_test)

print(f"Accuracy training set: {accuracy_score(y_train, DummyClass_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, DummyClass_y_test_prediction):.3f}")


Accuracy training set: 0.675
Accuracy test set: 0.681


* As seen above, training set = 67,5% of the instances in the training set were predicted correctly by the dummy classifier

* As seen above, Test set = 68.1% of the instances in the test set were predicted correctly by the dummy classifier


In [43]:
#Now checking the balance of the data set
print(y.dtypes)

# Converting any NaN to numeric since dtype was an object
y = pd.to_numeric(y, errors='coerce')

# Calculate the sum of numeric values and divide by the total number of element
result = sum(y) / len(y)

print(result)

int64
0.3234005739108184


* The result shows, that 32.34%  of the instances in the dataset belong to positive class

* This indicates that that the dataset needs to be balance, since it could impact the performance of the model


In [44]:
#First creating confusion_matrix, to compute matrix for classification model
conf_matrix = confusion_matrix(y_test, DummyClass_y_prediction)
print(conf_matrix)


#[TN,    FP]
#[FN     TP]

#High number of True Positives and True Negatives = good performance
#low number of False Positives and False Negatives = good performance


[[33538     0]
 [15704     0]]


As seen:

* True Negatives (TN): 27958 instances were correctly predicted as the negative class (0).

* False Negatives (FN): 13077 instances were incorrectly predicted as the negative class (0) when they were actually the positive class (1).



### Balancing dataset with SMOTE

In [45]:
smote = SMOTE(random_state=42)
x_SMOTE, y_SMOTE = smote.fit_resample(x,y)

result = sum(y_SMOTE)/len(y_SMOTE)
print(result)

0.5


SMOTE was applied to oversample, by generating synthetic examples.
the result of '0.5' suggest that the class distribution is now balanced
* With approxmitly equal number of instances for both positive and negative classes.
* This helps prevent the model from being biased toward the majority class

In [46]:
#Now where SMOTE has balanced the dataset, new training and data set, will be made

x_train, x_test, y_train, y_test = train_test_split(x_SMOTE, y_SMOTE, test_size=0.30, random_state=42)

x_train

Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross
45010,33.0,17.0,53.0,42.0,1.0,1.0,0.0,2.0,0.0,49505.0
48434,13.0,48.0,52.0,64.0,3.0,1.0,0.0,1.0,0.0,59297.0
29816,12.0,30.0,81.0,18.0,3.0,1.0,0.0,2.0,0.0,45964.0
140273,14.0,53.0,23.0,24.0,2.0,2.0,0.0,1.0,0.0,350000.0
84152,2.0,25.0,23.0,84.0,2.0,2.0,0.0,1.0,0.0,5000.0
...,...,...,...,...,...,...,...,...,...,...
119879,10.0,24.0,44.0,84.0,2.0,2.0,0.0,2.0,0.0,58500.0
103694,30.0,20.0,54.0,36.0,4.0,1.0,0.0,2.0,0.0,25000.0
131932,11.0,39.0,44.0,40.0,4.0,1.0,0.0,2.0,0.0,35000.0
146867,8.0,22.0,62.0,84.0,5.0,1.0,0.0,1.0,0.0,70000.0


In [47]:
#removing decimals
x_train = x_train.astype(int)

#displaying data again
x_train


Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross
45010,33,17,53,42,1,1,0,2,0,49505
48434,13,48,52,64,3,1,0,1,0,59297
29816,12,30,81,18,3,1,0,2,0,45964
140273,14,53,23,24,2,2,0,1,0,350000
84152,2,25,23,84,2,2,0,1,0,5000
...,...,...,...,...,...,...,...,...,...,...
119879,10,24,44,84,2,2,0,2,0,58500
103694,30,20,54,36,4,1,0,2,0,25000
131932,11,39,44,40,4,1,0,2,0,35000
146867,8,22,62,84,5,1,0,1,0,70000


In [48]:
#Creating Dummy classificer, that trains on the training data with new SMOTE dataset
DummyClass = DummyClassifier()
DummyClass.fit(x_train, y_train)
DummyClass_y_prediction = DummyClass.predict(x_test)

#Printing accuracy of the model on both training set and the test set after applying SMOTE
DummyClass_y_train_prediction = DummyClass.predict(x_train)
DummyClass_y_test_prediction = DummyClass.predict(x_test)

print(f"Accuracy training set: {accuracy_score(y_train, DummyClass_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, DummyClass_y_test_prediction):.3f}")


Accuracy training set: 0.501
Accuracy test set: 0.497


* As seen above, training set after SMOTE = 50% of the instances in the training set were predicted correctly by the dummy classifier

* As seen above, Test set after SMOTE= 49% of the instances in the test set were predicted correctly by the dummy classifier

## Binary classification

### LogisticRegression Class

In [49]:
#Creating an instance of LogisticRegression
LogR = LogisticRegression()

#Fitting the model on the training data
LogR.fit(x_train, y_train)

LogR_y_train_prediction = LogR.predict(x_train)
LogR_y_test_prediction = LogR.predict(x_test)

#Printing accuracy of the LogisticRegression model on both training set and the test set
print(f"Accuracy training set: {accuracy_score(y_train, LogR_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, LogR_y_test_prediction):.3f}")


Accuracy training set: 0.667
Accuracy test set: 0.667


* Accuracy on training set (0.674): Approximately 67.4% of the instances in training set were correctly predicted by the logistic regression model.

* Accuracy on test set (0.671): Approximately 67.1% of the instances test were correctly predicted by the logistic regression model.

* This is higher accuracy than the dummy classificer of 0.48 and 0.51

In [50]:
#Crossvalidating with K-folds, for creating the most optimal machine learning model.
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV

LogR_Cv = LogisticRegressionCV(

#cv: Stratified K-Folds. The default cross-validation generator used.
cv=10,
    
#random state: random seed is set to 42, for repoducibillity
random_state=42,

#max_iter: Maximum number of iterations of the optimization algorithm.
max_iter=10000)

LogR_Cv.fit(x_train, y_train)

print(LogR_Cv.C_)

#Defining
LogR_Cv_y_train_prediction = LogR_Cv.predict(x_train)
LogR_Cv_y_test_prediction = LogR_Cv.predict(x_test)

#Printing accuracy of the LogisticRegression model on both training set and the test set after Crossvalidating
print(f"Accuracy training set: {accuracy_score(y_train, LogR_Cv_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, LogR_Cv_y_test_prediction):.3f}")


[0.35938137]
Accuracy training set: 0.672
Accuracy test set: 0.671


Interpret:

* The model performs reasonably well on both the training and test sets, with accuracies around 67%. The similarity between the training and test accuracies suggests that the model is generalizing well to new, unseen data.

* The high value of 'C' indicates that the model is allowed to be more complex. However higher complexity might lead to overfitting.

**We did not gain any accuracy on the training and test set than before, after crossvalidating**




In [51]:
#Trying to increase accuracy with SickitGrid
LogR_Gr = LogisticRegression()

#Applying new parameters, where C is set to instances of values, instead of default 1
#Iterations is set to maximum 200.000 instead of 10000
LogR_Gr_Parameters = {
    'C': [1, 10, 100, 1000],
    'max_iter':[150000],
    'random_state':[42]
}    

#GridSearch
LogR_Grid = GridSearchCV(estimator=LogR_Gr, param_grid=LogR_Gr_Parameters, cv = 10, verbose=3)
    
#Fitting Data and assessing the score

LogR_Grid.fit(x_train, y_train)
LogR_Grid.best_score_, LogR_Grid.best_params_





Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV 1/10] END C=1, max_iter=150000, random_state=42;, score=0.664 total time=   0.4s
[CV 2/10] END C=1, max_iter=150000, random_state=42;, score=0.672 total time=   0.3s
[CV 3/10] END C=1, max_iter=150000, random_state=42;, score=0.666 total time=   0.4s
[CV 4/10] END C=1, max_iter=150000, random_state=42;, score=0.667 total time=   0.3s
[CV 5/10] END C=1, max_iter=150000, random_state=42;, score=0.673 total time=   0.5s
[CV 6/10] END C=1, max_iter=150000, random_state=42;, score=0.665 total time=   0.4s
[CV 7/10] END C=1, max_iter=150000, random_state=42;, score=0.673 total time=   0.6s
[CV 8/10] END C=1, max_iter=150000, random_state=42;, score=0.669 total time=   0.4s
[CV 9/10] END C=1, max_iter=150000, random_state=42;, score=0.673 total time=   0.3s
[CV 10/10] END C=1, max_iter=150000, random_state=42;, score=0.665 total time=   0.4s
[CV 1/10] END C=10, max_iter=150000, random_state=42;, score=0.664 total time=   0.4s
[C

(0.6686648238875459, {'C': 1, 'max_iter': 150000, 'random_state': 42})

In [52]:
LogR_grid_prediction = LogR_Grid.predict(x_test)

# Printing result of the grid search
print("Accuracy training set: {:.3f}".format(LogR_Grid.score(x_train, y_train)))
print("Accuracy test set: {:.3f}".format(LogR_Grid.score(x_test, y_test)))

Accuracy training set: 0.667
Accuracy test set: 0.667


GridSearchCV() Was used to improve the accuracy of the model. Cross validation was applied, with different C values. The new result was as close, as before using GridSearch() function, so it does not show which one was better than the other **(LogisticRegressionCV vs GridSearchCV)**


**Comparing Logistic Regression Report**

In [53]:
print(classification_report(y_test, LogR_y_test_prediction))

              precision    recall  f1-score   support

           0       0.67      0.68      0.67     33492
           1       0.67      0.66      0.66     33141

    accuracy                           0.67     66633
   macro avg       0.67      0.67      0.67     66633
weighted avg       0.67      0.67      0.67     66633



In [54]:
print(classification_report(y_test, LogR_Cv_y_test_prediction))

              precision    recall  f1-score   support

           0       0.67      0.69      0.68     33492
           1       0.67      0.66      0.67     33141

    accuracy                           0.67     66633
   macro avg       0.67      0.67      0.67     66633
weighted avg       0.67      0.67      0.67     66633



In [55]:
print(classification_report(y_test, LogR_grid_prediction))

              precision    recall  f1-score   support

           0       0.67      0.68      0.67     33492
           1       0.67      0.66      0.66     33141

    accuracy                           0.67     66633
   macro avg       0.67      0.67      0.67     66633
weighted avg       0.67      0.67      0.67     66633



**Dummy report**

In [56]:
print(classification_report(y_test, DummyClass_y_prediction, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     33492
           1       0.50      1.00      0.66     33141

    accuracy                           0.50     66633
   macro avg       0.25      0.50      0.33     66633
weighted avg       0.25      0.50      0.33     66633



**Conclusion on comparement CHANGE**
* Model was a little better at predicting defaults for LogisticRegression (LogR Variable) 

* Precision: Score is 0.68 for defaults and 0.67 for success
* Recall: For the predicted defaults, the ratio is worse than success. Model is better at predicting success rather than defaults.
* F-1 Score: Combination of Precision and recall

Conclusion: Theres indications that the model is slightly better at predicting success. The difference i s very minial. The logistic regression does a rather decent job --> But other models still needs to be assessed, in order to conclude which model is the best for implementation


## Creating Decision Tree
