# SBAloans Big Data Modelling

In [1]:
#Checking file placement
import os 
os.getcwd()

'C:\\Users\\wowan\\OneDrive\\Skrivebord Bærebar\\BigData Exam\\bigdata'

## Import

In [2]:
#Importing modules and the processed .csv file, as done in the preprocessing phase.

# Data handling
import pandas as pd
import numpy as np

#training and test set
from sklearn.model_selection import train_test_split

# Evaluating classifiers
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

#balancing dataset with SMOTE
from imblearn.over_sampling import SMOTE 

#Dummy classifier
from sklearn.dummy import DummyClassifier

# Accuracy score for training- and test set
from sklearn.metrics import accuracy_score


# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# For Scaling the dataset
from sklearn.preprocessing import MinMaxScaler

#Logistic regression // Cross validation
from sklearn.linear_model import LogisticRegressionCV

# Visulaization of data
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt

#Classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


In [3]:
#Importing the processed .csv file

SBAloan_df = pd.read_csv('SBAnational_processed.csv', sep=',', dtype='unicode')

#Displaying data table
SBAloan_df.head()

Unnamed: 0.1,Unnamed: 0,State,BankState,NAICS,ApprovalFY,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross,MIS_Status
0,143841,20,25,45,2007,60,6,1,0,1,0,79500,0
1,143842,14,53,44,2007,60,1,2,0,1,0,35000,0
2,143845,14,53,44,2007,78,1,2,0,1,0,39737,0
3,143846,1,29,42,2007,58,20,1,0,1,0,100000,1
4,143851,45,2,51,2007,13,3,1,0,2,0,50000,0


### Defining target variable and baseline

In [4]:
#We start by creating a new DataFrame 'X' by selecting the columsn from the other DataFrame SBAloan_df
#We are leaving out Variable 'ApprovalFY' as it is not relevant

#Defining x
x = SBAloan_df[['State', 'BankState', 'NAICS', 'Term', 'NoEmp', 'NewExist', 'FranchiseCode', 'UrbanRural', 'LowDoc', 'DisbursementGross']]

#Defining Y (target variable) for machine learning
y = SBAloan_df['MIS_Status']

#Showing attributes
x.columns

Index(['State', 'BankState', 'NAICS', 'Term', 'NoEmp', 'NewExist',
       'FranchiseCode', 'UrbanRural', 'LowDoc', 'DisbursementGross'],
      dtype='object')

### Creating training- and test split

In [5]:
#Creating training- and test split of the data, with function from sklearn.model library

#splitting code intro training and test set, with 75% of data used for training and 25% for testing
#random seed is set to 42, for repoducibillity
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

x_train
x_test

Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross
83743,21,29,23,144,1,1,0,2,0,49965
123822,7,16,42,36,3,1,0,2,0,689737
161120,6,17,11,12,5,2,0,1,0,3000000
131852,9,8,54,120,6,1,0,1,0,615000
74928,9,36,81,60,0,2,0,1,0,40000
...,...,...,...,...,...,...,...,...,...,...
137604,18,12,62,114,2,2,0,1,0,10000
31955,15,25,54,23,1,2,0,1,0,10000
105785,37,50,23,50,2,1,0,2,0,12800
122604,18,21,54,60,1,2,1,1,0,92609


###  Baselinemodel and evaluating accuracy of training- and test set

In [6]:
#Creating Dummy classificer, that trains on the training data, for use to predict target variable values on the test data.
#Predicted values are stored in DummyClass_y_prediction

DummyClass = DummyClassifier()
DummyClass.fit(x_train, y_train)
DummyClass_y_prediction = DummyClass.predict(x_test)

#Predicted values are stored in DummyClass_y_prediction



In [7]:
#Printing accuracy of the model on both training set and the test set.
#Accuracy indicates the proportion of correctly predicted instances in the model.

DummyClass_y_train_prediction = DummyClass.predict(x_train)
DummyClass_y_test_prediction = DummyClass.predict(x_test)

print(f"Accuracy training set: {accuracy_score(y_train, DummyClass_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, DummyClass_y_test_prediction):.3f}")


Accuracy training set: 0.675
Accuracy test set: 0.681


* As seen above, training set = 67,5% of the instances in the training set were predicted correctly by the dummy classifier

* As seen above, Test set = 68.1% of the instances in the test set were predicted correctly by the dummy classifier


In [8]:
#Now checking the balance of the data set
print(y.dtypes)

# Converting any NaN to numeric since dtype was an object
y = pd.to_numeric(y, errors='coerce')

# Calculate the sum of numeric values and divide by the total number of element
result = sum(y) / len(y)

print(result)

object
0.3234005739108184


* The result shows, that 32.34%  of the instances in the dataset belong to positive class

* This indicates that that the dataset needs to be balance, since it could impact the performance of the model


In [9]:
#First creating confusion_matrix, to compue matrix for classification model
conf_matrix = confusion_matrix(y_test, DummyClass_y_prediction)
print(conf_matrix)


#[TN,    FP]
#[FN     TP]

#High number of True Positives and True Negatives = good performance
#low number of False Positives and False Negatives = good performance


[[27958     0]
 [13077     0]]


As seen:

* True Negatives (TN): 27958 instances were correctly predicted as the negative class (0).

* False Negatives (FN): 13077 instances were incorrectly predicted as the negative class (0) when they were actually the positive class (1).



### Balancing dataset with SMOTE

In [10]:
smote = SMOTE(random_state=42)
x_SMOTE, y_SMOTE = smote.fit_resample(x,y)

result = sum(y_SMOTE)/len(y_SMOTE)
print(result)

0.5


SMOTE was applied to oversample, by generating synthetic examples.
the result of '0.5' suggest that the class distribution is now balanced
* With approxmitly equal number of instances for both positive and negative classes.
* This helps prevent the model from being biased toward the majority class

In [11]:
#Now where SMOTE has balanced the dataset, new training and data set, will be made

x_train, x_test, y_train, y_test = train_test_split(x_SMOTE, y_SMOTE, test_size=0.25, random_state=42)

x_train

Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross
184328,3.793375,42.140376,66.570188,56.095031,2.904969,2.0,0.0,1.0,0.0,131004.524843
93982,1.0,12.0,81.0,64.0,3.0,1.0,0.0,1.0,0.0,35000.0
68511,1.0,29.0,81.0,72.0,8.0,1.0,0.0,1.0,0.0,100000.0
182165,20.0,25.0,22.993229,61.995486,4.002821,1.0,0.0,1.000564,0.0,140000.0
11140,2.0,26.0,71.0,72.0,1.0,2.0,0.0,1.0,0.0,48712.0
...,...,...,...,...,...,...,...,...,...,...
119879,10.0,24.0,44.0,84.0,2.0,2.0,0.0,2.0,0.0,58500.0
103694,30.0,20.0,54.0,36.0,4.0,1.0,0.0,2.0,0.0,25000.0
131932,11.0,39.0,44.0,40.0,4.0,1.0,0.0,2.0,0.0,35000.0
146867,8.0,22.0,62.0,84.0,5.0,1.0,0.0,1.0,0.0,70000.0


In [18]:
#removing decimals
x_train = x_train.astype(int)

#displaying data again
x_train


Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,FranchiseCode,UrbanRural,LowDoc,DisbursementGross
184328,3,42,66,56,2,2,0,1,0,131004
93982,1,12,81,64,3,1,0,1,0,35000
68511,1,29,81,72,8,1,0,1,0,100000
182165,20,25,22,61,4,1,0,1,0,140000
11140,2,26,71,72,1,2,0,1,0,48712
...,...,...,...,...,...,...,...,...,...,...
119879,10,24,44,84,2,2,0,2,0,58500
103694,30,20,54,36,4,1,0,2,0,25000
131932,11,39,44,40,4,1,0,2,0,35000
146867,8,22,62,84,5,1,0,1,0,70000


In [13]:
#Creating Dummy classificer, that trains on the training data with new SMOTE dataset
DummyClass = DummyClassifier()
DummyClass.fit(x_train, y_train)
DummyClass_y_prediction = DummyClass.predict(x_test)

#Printing accuracy of the model on both training set and the test set after applying SMOTE
DummyClass_y_train_prediction = DummyClass.predict(x_train)
DummyClass_y_test_prediction = DummyClass.predict(x_test)

print(f"Accuracy training set: {accuracy_score(y_train, DummyClass_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, DummyClass_y_test_prediction):.3f}")


Accuracy training set: 0.501
Accuracy test set: 0.498


* As seen above, training set after SMOTE = 50% of the instances in the training set were predicted correctly by the dummy classifier

* As seen above, Test set after SMOTE= 49% of the instances in the test set were predicted correctly by the dummy classifier

## Binary classification

### LogisticRegression Class

In [19]:
#Creating an instance of LogisticRegression
LogR = LogisticRegression()

#Fitting the model on the training data
LogR.fit(x_train, y_train)

LogR_y_train_prediction = LogR.predict(x_train)
LogR_y_test_prediction = LogR.predict(x_test)

#Printing accuracy of the LogisticRegression model on both training set and the test set
print(f"Accuracy training set: {accuracy_score(y_train, LogR_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, LogR_y_test_prediction):.3f}")


Accuracy training set: 0.674
Accuracy test set: 0.671


* Accuracy on training set (0.674): Approximately 67.4% of the instances in training set were correctly predicted by the logistic regression model.

* Accuracy on test set (0.671): Approximately 67.1% of the instances test were correctly predicted by the logistic regression model.

In [40]:
#Crossvalidating with K-folds, for creating the most optimal machine learning model.

LogR_Cv = LogisticRegressionCV(

#cv: Stratified K-Folds. The default cross-validation generator used.
cv=10,

#random state: random seed is set to 42, for repoducibillity
random_state=42,

#max_iter: Maximum number of iterations of the optimization algorithm.
max_iter=30000)

LogR_Cv.fit(x_train, y_train)

print(LogR_Cv.C_)

#Defining
LogR_Cv_y_train_prediction = LogR_Cv.predict(x_train)
LogR_Cv_y_test_prediction = LogR_Cv.predict(x_test)

#Printing accuracy of the LogisticRegression model on both training set and the test set after Crossvalidating
print(f"Accuracy training set: {accuracy_score(y_train, LogR_Cv_y_train_prediction):.3f}")
print(f"Accuracy test set: {accuracy_score(y_test, LogR_Cv_y_test_prediction):.3f}")


[10000.]
Accuracy training set: 0.673
Accuracy test set: 0.670


Interpret:

* The model performs reasonably well on both the training and test sets, with accuracies around 67%. The similarity between the training and test accuracies suggests that the model is generalizing well to new, unseen data.

* The high value of 'C' indicates that the model is allowed to be more complex. However higher complexity might lead to overfitting.