### Libraries and Modules

In [1]:
from Input_Values.input_vars import loan_input_data_location, geojson_input_data_location
from Read_Data.read_data_files import Read_Data
from Data_Cleaning.clean_data import Clean_Data
from Feature_Engineering.create_features import Create_Features
from EDA.eda_outputs import EDA_Outputs
from XGBoost_Regression_Model.model_data_transformation import Model_Data_Transformations
from XGBoost_Regression_Model.data_split import Data_Split
from XGBoost_Regression_Model.create_model import Create_Model
from Residual_Analysis.residual_analysis import Residual_Analysis

### Instantiating Pipeline Classes

In [2]:
# One instance per pipeline stage; the cells below call methods on these objects.
read_data = Read_Data()
clean_data = Clean_Data()
create_features = Create_Features()
eda_outputs = EDA_Outputs()
model_data_transformations = Model_Data_Transformations()
data_split = Data_Split()
create_model = Create_Model()
residual_analysis = Residual_Analysis()

### Read In Data

In [3]:
# Read the loan data and the GeoJSON data from the locations imported from
# Input_Values.input_vars.
original_data_df = read_data.loan_data_to_pandas_df(
    loan_input_data_location=loan_input_data_location,
)
geojson_data = read_data.read_in_geojson_data(
    geojson_input_data_location=geojson_input_data_location,
)

# Preview the first rows of the raw loan data.
original_data_df.head()

Unnamed: 0,LoanNumber,DateApproved,SBAOfficeCode,ProcessingMethod,BorrowerName,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip,LoanStatusDate,...,BusinessType,OriginatingLenderLocationID,OriginatingLender,OriginatingLenderCity,OriginatingLenderState,Gender,Veteran,NonProfit,ForgivenessAmount,ForgivenessDate
0,9547507704,2020-05-01,464,PPP,"SUMTER COATINGS, INC.",2410 Highway 15 South,Sumter,,29150-9662,2020-12-18,...,Corporation,19248,Synovus Bank,COLUMBUS,GA,Unanswered,Unanswered,,773553.37,2020-11-20
1,9777677704,2020-05-01,464,PPP,"PLEASANT PLACES, INC.",7684 Southrail Road,North Charleston,,29420-9000,2021-09-28,...,Sole Proprietorship,19248,Synovus Bank,COLUMBUS,GA,Male Owned,Non-Veteran,,746336.24,2021-08-12
2,5791407702,2020-05-01,1013,PPP,BOYER CHILDREN'S CLINIC,1850 BOYER AVE E,SEATTLE,,98112-2922,2021-03-17,...,Non-Profit Organization,9551,"Bank of America, National Association",CHARLOTTE,NC,Unanswered,Unanswered,Y,696677.49,2021-02-10
3,6223567700,2020-05-01,920,PPP,KIRTLEY CONSTRUCTION INC,1661 MARTIN RANCH RD,SAN BERNARDINO,,92407-1740,2021-10-16,...,Corporation,9551,"Bank of America, National Association",CHARLOTTE,NC,Male Owned,Non-Veteran,,395264.11,2021-09-10
4,9662437702,2020-05-01,101,PPP,AERO BOX LLC,,,,,2021-08-17,...,,57328,The Huntington National Bank,COLUMBUS,OH,Unanswered,Unanswered,,370819.35,2021-04-08


### Removing Rows with Null Values and USA Territories

In [4]:
print(f'The original dataframe has the following shape: {original_data_df.shape}')

# Two filters are applied in sequence: the null-based row removal first, then
# the USA-territories removal on its result.
remove_nulls_df = clean_data.remove_nulls_based_on_columns(data=original_data_df)
remove_territories_df = clean_data.remove_usa_territories(data=remove_nulls_df)

# NOTE: the shape printed here is taken after BOTH filters, not only the null removal.
print(f'After removing Null values the new dataframe has the following shape: {remove_territories_df.shape}')

The original dataframe has the following shape: (968531, 53)
After removing Null values the new dataframe has the following shape: (935124, 53)


### Feature Engineering

In [5]:
# Feature-engineering steps, applied in the order listed; each step receives
# the frame returned by the previous one via the `data` keyword.
feature_steps = (
    create_features.mapping_industries,       # Industry Mapping
    create_features.number_of_loans,          # Number of Loans
    create_features.amount_of_loan_forgiven,  # Amount of Loan Forgiven
    create_features.revised_loan_amount,      # Revised Loan Approval
    create_features.days_with_loan_approval,  # Days With Loan
    create_features.create_zip5,              # Zip5 Feature
)

# Start from the filtered frame and thread it through every step.
clean_data_df = remove_territories_df
for feature_step in feature_steps:
    clean_data_df = feature_step(data=clean_data_df)

### EDA (all EDA images are saved to the folder `Plots_Storage/EDA_Plots`)

In [None]:
# Generate every EDA plot from the feature-engineered frame `clean_data_df`.
# Per the section header, the images are written to Plots_Storage/EDA_Plots.
# NOTE(review): the groupings below are inferred from the method names only;
# see EDA.eda_outputs for what each plot actually contains.

# Missing values and correlations
eda_outputs.eda_plots_missing_values_heatmap(data=clean_data_df)
eda_outputs.eda_correlation_heatmap(data=clean_data_df)

# Industry / gender / indicator breakdowns
eda_outputs.eda_mapping_industries_and_count(data=clean_data_df)
eda_outputs.eda_spread_by_gender(data=clean_data_df)
eda_outputs.eda_average_loan_amount_by_industry_and_gender(data=clean_data_df)
eda_outputs.eda_average_loan_amount_by_lmi_indicator_by_industry(data=clean_data_df)
eda_outputs.eda_average_loan_amount_by_hubzone_indicator_by_industry(data=clean_data_df)

# Geographic views; the two zip-level plots also receive the GeoJSON data
# through the `counties` keyword.
eda_outputs.eda_state_loan_count(data=clean_data_df)
eda_outputs.eda_state_loan_avg_amount(data=clean_data_df)
eda_outputs.eda_zip_loan_count(data=clean_data_df, counties=geojson_data)
eda_outputs.eda_zip_loan_avg(data=clean_data_df, counties=geojson_data)

# Time series and spend
eda_outputs.eda_time_series_gender_loan_amount(data=clean_data_df)
eda_outputs.eda_time_series_loan_amount(data=clean_data_df)
eda_outputs.eda_spend_amount_per_category(data=clean_data_df)
# NOTE(review): "indsutry" is misspelled in the method name; presumably it
# matches the definition in EDA.eda_outputs — confirm before renaming either side.
eda_outputs.eda_daily_spend_per_indsutry(data=clean_data_df)

### Selecting Modeling Features

In [6]:
# Build the modeling dataset from the feature-engineered frame.
ml_dataset_df = model_data_transformations.select_and_impute_features(
    data=clean_data_df,
)

### Sklearn Pipeline For Modeling Features

In [7]:
# Pass the modeling dataset through the project's sklearn data pipelines.
ml_sparse_df = model_data_transformations.sklearn_data_pipelines(
    data=ml_dataset_df,
)

### Splitting Into X and Y Variables

In [8]:
# The split result is indexable: element 0 is bound to X and element 1 to y.
x_y_variables = data_split.split_data_x_y(data=ml_sparse_df)
X, y = x_y_variables[0], x_y_variables[1]

### XGBoost Regression Model

In [11]:
# Create the XGBoost model from X and y.
# The recorded output of this cell reports 5 folds for each of 3 candidates (15 fits).
best_xgboost_model = create_model.create_xgboost_model(
    X=X,
    y=y,
)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




### Standardized Residuals

In [15]:
# Toggle for which model the residual analysis uses:
#   True  -> call with load_model=True and no model object (presumably a
#            previously saved model is loaded inside Residual_Analysis — confirm there);
#   False -> call with load_model=False and the model held in `best_xgboost_model`.
LOAD_SAVED_MODEL = True

standardized_residuals = residual_analysis.analyze_residual_data(
    load_model=LOAD_SAVED_MODEL,
    X=X,
    y=y,
    # The conditional expression only evaluates `best_xgboost_model` when the
    # flag is False, so this cell does not require that name when loading.
    actual_model=None if LOAD_SAVED_MODEL else best_xgboost_model,
)

### Calculating Fraud Based on Residuals