### Libraries and Modules

In [1]:
from Input_Values.input_vars import loan_input_data_location, geojson_input_data_location, \
                                    eda_plot_location

from Read_Data.read_data_files import Read_Data
from Data_Cleaning.clean_data import Clean_Data
from Feature_Engineering.create_features import Create_Features
from EDA.eda_outputs import EDA_Outputs
from XGBoost_Regression_Model.model_data_transformation import Model_Data_Transformations
from XGBoost_Regression_Model.data_split import Data_Split
from XGBoost_Regression_Model.create_model import Create_Model
from Model_Feature_Importance.feature_importance import Model_Feature_Importance
from Residual_Analysis.residual_analysis import Residual_Analysis
from Residual_Analysis.standardized_residuals_fraud import Fraud_Detection
from Create_Final_DF.final_df import Final_DF

### Instantiating Pipeline Classes

In [2]:
# Build one instance of every pipeline stage up front; the cells below call
# methods on these objects.

# Ingestion, cleaning and feature creation
read_data = Read_Data()
clean_data = Clean_Data()
create_features = Create_Features()

# Exploratory plots (constructed with the configured plot location)
eda_outputs = EDA_Outputs(plot_location=eda_plot_location)

# Modelling: feature transformation, X/y split, model creation
model_data_transformations = Model_Data_Transformations()
data_split = Data_Split()
create_model = Create_Model()

# Post-model analysis and final output assembly
model_feature_importance = Model_Feature_Importance()
residual_analysis = Residual_Analysis()
fraud_detection = Fraud_Detection()
final_df = Final_DF()

### Read In Data

In [3]:
# Load the loan records from the configured input location.
original_data_df = read_data.loan_data_to_pandas_df(
    loan_input_data_location=loan_input_data_location,
)

# Load the GeoJSON data; it is passed as `counties` to the ZIP-level EDA maps.
geojson_data = read_data.read_in_geojson_data(
    geojson_input_data_location=geojson_input_data_location,
)

# Preview the first rows of the raw loan data.
original_data_df.head()

Unnamed: 0,LoanNumber,DateApproved,SBAOfficeCode,ProcessingMethod,BorrowerName,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip,LoanStatusDate,...,BusinessType,OriginatingLenderLocationID,OriginatingLender,OriginatingLenderCity,OriginatingLenderState,Gender,Veteran,NonProfit,ForgivenessAmount,ForgivenessDate
0,9547507704,2020-05-01,464,PPP,"SUMTER COATINGS, INC.",2410 Highway 15 South,Sumter,,29150-9662,2020-12-18,...,Corporation,19248,Synovus Bank,COLUMBUS,GA,Unanswered,Unanswered,,773553.37,2020-11-20
1,9777677704,2020-05-01,464,PPP,"PLEASANT PLACES, INC.",7684 Southrail Road,North Charleston,,29420-9000,2021-09-28,...,Sole Proprietorship,19248,Synovus Bank,COLUMBUS,GA,Male Owned,Non-Veteran,,746336.24,2021-08-12
2,5791407702,2020-05-01,1013,PPP,BOYER CHILDREN'S CLINIC,1850 BOYER AVE E,SEATTLE,,98112-2922,2021-03-17,...,Non-Profit Organization,9551,"Bank of America, National Association",CHARLOTTE,NC,Unanswered,Unanswered,Y,696677.49,2021-02-10
3,6223567700,2020-05-01,920,PPP,KIRTLEY CONSTRUCTION INC,1661 MARTIN RANCH RD,SAN BERNARDINO,,92407-1740,2021-10-16,...,Corporation,9551,"Bank of America, National Association",CHARLOTTE,NC,Male Owned,Non-Veteran,,395264.11,2021-09-10
4,9662437702,2020-05-01,101,PPP,AERO BOX LLC,,,,,2021-08-17,...,,57328,The Huntington National Bank,COLUMBUS,OH,Unanswered,Unanswered,,370819.35,2021-04-08


### Removing Rows with Null Values and USA Territories

In [4]:
# Report the shape before any filtering so the effect of cleaning is visible.
print(f'The original dataframe has the following shape: {original_data_df.shape}')

# Stage 1: filter rows with Clean_Data.remove_nulls_based_on_columns.
remove_nulls_df=clean_data.remove_nulls_based_on_columns(data=original_data_df)

# Stage 2: apply Clean_Data.remove_usa_territories to the null-filtered frame.
remove_territories_df=clean_data.remove_usa_territories(data=remove_nulls_df)

# The shape printed here is taken AFTER both filters, so the message names both
# of them (the previous wording attributed the whole row drop to null removal).
print(f'After removing Null values and USA territories the new dataframe has the following shape: {remove_territories_df.shape}')

The original dataframe has the following shape: (968531, 53)
After removing Null values the new dataframe has the following shape: (935126, 53)


### Feature Engineering

In [5]:
# Feature-engineering steps, applied in this exact order. Each step receives the
# current frame through the `data=` keyword and its return value feeds the next.
feature_steps = (
    create_features.mapping_industries,       # Industry Mapping
    create_features.number_of_loans,          # Number of Loans
    create_features.amount_of_loan_forgiven,  # Amount of Loan Forgiven
    create_features.revised_loan_amount,      # Revised Loan Approval
    create_features.days_with_loan_approval,  # Days With Loan
    create_features.create_borrower_zip5,     # ZIP5 Borrower Feature
    create_features.create_lender_zip5,       # ZIP5 Lender Feature
)

# Start from the territory-filtered frame and thread it through every step.
engineered_df = remove_territories_df
for feature_step in feature_steps:
    engineered_df = feature_step(data=engineered_df)

# Downstream cells (EDA, modelling, final output) read `clean_data_df`.
clean_data_df = engineered_df

### EDA

In [6]:
# EDA outputs are produced in the same order as before; the calls are grouped by
# signature only to avoid repeating the identical keyword arguments.

# Plots that need only the engineered loan frame.
for eda_plot in (
    eda_outputs.eda_plots_missing_values_heatmap,
    eda_outputs.eda_correlation_heatmap,
    eda_outputs.eda_mapping_industries_and_count,
    eda_outputs.eda_spread_by_gender,
    eda_outputs.eda_average_loan_amount_by_industry_and_gender,
    eda_outputs.eda_average_loan_amount_by_lmi_indicator_by_industry,
    eda_outputs.eda_average_loan_amount_by_hubzone_indicator_by_industry,
    eda_outputs.eda_state_loan_count,
    eda_outputs.eda_state_loan_avg_amount,
):
    eda_plot(data=clean_data_df)

# ZIP-level plots additionally receive the GeoJSON data as `counties`.
for eda_map in (
    eda_outputs.eda_zip_loan_count,
    eda_outputs.eda_zip_loan_avg,
):
    eda_map(data=clean_data_df, counties=geojson_data)

# Time-series and spend-category plots.
for eda_plot in (
    eda_outputs.eda_time_series_gender_loan_amount,
    eda_outputs.eda_time_series_loan_amount,
    eda_outputs.eda_spend_amount_per_category,
):
    eda_plot(data=clean_data_df)

# Kept as the cell's bare last expression so any value it returns is still
# displayed exactly as before.
eda_outputs.eda_daily_spend_per_indsutry(data=clean_data_df)

### Selecting Modeling Features

In [8]:
# Derive the modelling dataset from the engineered frame via
# Model_Data_Transformations.select_and_impute_features.
ml_dataset_df = model_data_transformations.select_and_impute_features(
    data=clean_data_df,
)

# Preview the selected modelling columns.
ml_dataset_df.head()

Unnamed: 0,LoanNumber,ProcessingMethod,Term,CurrentApprovalAmount,RuralUrbanIndicator,HubzoneIndicator,LMIIndicator,JobsReported,UTILITIES_PROCEED,PAYROLL_PROCEED,MORTGAGE_INTEREST_PROCEED,RENT_PROCEED,REFINANCE_EIDL_PROCEED,HEALTH_CARE_PROCEED,DEBT_INTEREST_PROCEED,Loan_Count,Loan_Amount_Owed,Revised_Loan_Amount,Days_With_Loan
0,5502308207,PPP,60,9538531.0,U,Y,Y,385.0,0.0,9538531.0,0.0,0.0,0.0,0.0,0.0,1,77353.57,-32866.0,307
1,6110847106,PPP,24,7666768.0,R,N,N,295.0,0.0,7666768.0,0.0,0.0,0.0,0.0,0.0,1,94556.81,0.0,455
2,4539098204,PPP,60,7398947.89,U,N,N,500.0,0.0,7398947.89,0.0,0.0,0.0,0.0,0.0,1,-2507517.52,0.0,685
3,5120868804,PPP,60,7223025.0,U,N,N,500.0,0.0,7223025.0,0.0,0.0,0.0,0.0,0.0,1,98945.55,0.0,514
4,6650277102,PPP,24,6528631.4,R,Y,N,439.0,0.0,5037890.65,0.0,0.0,0.0,1490740.75,0.0,1,69757.98,0.0,399


### Sklearn Pipeline For Modeling Features

In [9]:
# Run the modelling dataset through the sklearn data pipelines; the recorded
# output shows `categorical__`, `numerical__` and `remainder__` prefixed columns.
ml_sparse_df = model_data_transformations.sklearn_data_pipelines(
    data=ml_dataset_df,
)

# Preview the transformed feature matrix.
ml_sparse_df.head()

Unnamed: 0,categorical__ProcessingMethod_PPP,categorical__ProcessingMethod_PPS,categorical__RuralUrbanIndicator_R,categorical__RuralUrbanIndicator_U,categorical__HubzoneIndicator_N,categorical__HubzoneIndicator_Y,categorical__LMIIndicator_N,categorical__LMIIndicator_Y,numerical__Term,numerical__JobsReported,...,numerical__MORTGAGE_INTEREST_PROCEED,numerical__RENT_PROCEED,numerical__REFINANCE_EIDL_PROCEED,numerical__HEALTH_CARE_PROCEED,numerical__DEBT_INTEREST_PROCEED,numerical__Loan_Count,numerical__Loan_Amount_Owed,numerical__Revised_Loan_Amount,numerical__Days_With_Loan,remainder__LoanNumber
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.384442,4.96266,...,-0.063464,-0.146762,-0.015551,-0.102478,-0.045488,-0.376526,1.025054,-0.302056,-0.322718,5502308207
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.718713,3.62216,...,-0.063464,-0.146762,-0.015551,-0.102478,-0.045488,-0.376526,1.249415,0.015724,0.983125,6110847106
2,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.384442,6.675521,...,-0.063464,-0.146762,-0.015551,-0.102478,-0.045488,-0.376526,-32.686285,0.015724,3.012475,4539098204
3,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.384442,6.675521,...,-0.063464,-0.146762,-0.015551,-0.102478,-0.045488,-0.376526,1.306652,0.015724,1.503697,5120868804
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-0.718713,5.76696,...,-0.063464,-0.146762,-0.015551,54.315012,-0.045488,-0.376526,0.925994,0.015724,0.489022,6650277102


### Splitting Into X and Y Variables

In [None]:
# split_data_x_y returns an indexable result: position 0 is used as the feature
# set X and position 1 as the target y.
x_y_variables = data_split.split_data_x_y(data=ml_sparse_df)
X, y = x_y_variables[0], x_y_variables[1]

### XGBoost Regression Model

In [None]:
# Build the XGBoost regression model from the X / y split and keep the returned
# model object.
best_xgboost_model = create_model.create_xgboost_model(
    X=X,
    y=y,
)

### Model Feature Importance

In [None]:
# Produce the feature-importance output for the XGBoost model.
# NOTE(review): this is called with no arguments and is not handed
# `best_xgboost_model`, so it presumably obtains the fitted model on its own
# (e.g. from a saved artifact) — TODO confirm against Model_Feature_Importance.
model_feature_importance.xgboost_feature_importance()

### Standardized Residuals

In [None]:
# Analyse model residuals using the X / y split together with the engineered
# loan frame; the result feeds the fraud-percentile step.
standardized_residuals_df = residual_analysis.analyze_residual_data(
    X=X,
    y=y,
    clean_data_df=clean_data_df,
)

### Calculating Fraud Based on Residuals

In [None]:
# Apply Fraud_Detection.standardized_residuals_percentiles to the residual
# analysis output.
fraud_df = fraud_detection.standardized_residuals_percentiles(
    data=standardized_residuals_df,
)

### Creating Final DataFrame

In [None]:
# Combine the engineered loan frame with the fraud results into the final
# output frame.
final_df_output = final_df.create_final_dataframe(
    clean_data_df=clean_data_df,
    fraud_df=fraud_df,
)

# Preview the first rows of the final output.
final_df_output.head()