### Libraries and Modules

In [1]:
from Input_Values.input_vars import loan_input_data_location, geojson_input_data_location
from Read_Data.read_data_files import Read_Data
from Data_Cleaning.clean_data import Clean_Data
from Feature_Engineering.create_features import Create_Features
from EDA.eda_outputs import EDA_Outputs
from XGBoost_Regression_Model.model_data_transformation import Model_Data_Transformations
from XGBoost_Regression_Model.data_split import Data_Split
from XGBoost_Regression_Model.create_model import Create_Model
from Model_Feature_Importance.feature_importance import Model_Feature_Importance
from Residual_Analysis.residual_analysis import Residual_Analysis
from Residual_Analysis.standardized_residuals_fraud import Fraud_Detection
from Create_Final_DF.final_df import Final_DF

### Instantiating Module Classes

In [2]:
# One helper instance per pipeline stage; the cells below call methods on these objects.
read_data = Read_Data()
clean_data = Clean_Data()
create_features = Create_Features()
eda_outputs = EDA_Outputs()
model_data_transformations = Model_Data_Transformations()
data_split = Data_Split()
create_model = Create_Model()
model_feature_importance = Model_Feature_Importance()
residual_analysis = Residual_Analysis()
fraud_detection = Fraud_Detection()
final_df = Final_DF()

### Read In Data

In [3]:
# Both input locations come from the Input_Values.input_vars import at the top of the notebook.
original_data_df = read_data.loan_data_to_pandas_df(
    loan_input_data_location=loan_input_data_location,
)
geojson_data = read_data.read_in_geojson_data(
    geojson_input_data_location=geojson_input_data_location,
)

# Preview the first rows of the loaded frame.
original_data_df.head()

Unnamed: 0,LoanNumber,DateApproved,SBAOfficeCode,ProcessingMethod,BorrowerName,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip,LoanStatusDate,...,BusinessType,OriginatingLenderLocationID,OriginatingLender,OriginatingLenderCity,OriginatingLenderState,Gender,Veteran,NonProfit,ForgivenessAmount,ForgivenessDate
0,9547507704,2020-05-01,464,PPP,"SUMTER COATINGS, INC.",2410 Highway 15 South,Sumter,,29150-9662,2020-12-18,...,Corporation,19248,Synovus Bank,COLUMBUS,GA,Unanswered,Unanswered,,773553.37,2020-11-20
1,9777677704,2020-05-01,464,PPP,"PLEASANT PLACES, INC.",7684 Southrail Road,North Charleston,,29420-9000,2021-09-28,...,Sole Proprietorship,19248,Synovus Bank,COLUMBUS,GA,Male Owned,Non-Veteran,,746336.24,2021-08-12
2,5791407702,2020-05-01,1013,PPP,BOYER CHILDREN'S CLINIC,1850 BOYER AVE E,SEATTLE,,98112-2922,2021-03-17,...,Non-Profit Organization,9551,"Bank of America, National Association",CHARLOTTE,NC,Unanswered,Unanswered,Y,696677.49,2021-02-10
3,6223567700,2020-05-01,920,PPP,KIRTLEY CONSTRUCTION INC,1661 MARTIN RANCH RD,SAN BERNARDINO,,92407-1740,2021-10-16,...,Corporation,9551,"Bank of America, National Association",CHARLOTTE,NC,Male Owned,Non-Veteran,,395264.11,2021-09-10
4,9662437702,2020-05-01,101,PPP,AERO BOX LLC,,,,,2021-08-17,...,,57328,The Huntington National Bank,COLUMBUS,OH,Unanswered,Unanswered,,370819.35,2021-04-08


### Removing Rows with Null Values and USA Territories

In [4]:
print(f'The original dataframe has the following shape: {original_data_df.shape}')

# Two successive filters: the null-based filter runs first, then the
# USA-territories filter is applied to its result.
remove_nulls_df = clean_data.remove_nulls_based_on_columns(data=original_data_df)
remove_territories_df = clean_data.remove_usa_territories(data=remove_nulls_df)

# NOTE(review): the message below mentions only Null removal, but the shape it
# prints is taken after the territories filter as well.
print(f'After removing Null values the new dataframe has the following shape: {remove_territories_df.shape}')

The original dataframe has the following shape: (968531, 53)
After removing Null values the new dataframe has the following shape: (935126, 53)


### Feature Engineering

In [5]:
# Feature-engineering steps, applied in exactly this order. Each step receives the
# value returned by the previous one through the `data` keyword.
_feature_steps = (
    create_features.mapping_industries,       # Industry Mapping
    create_features.number_of_loans,          # Number of Loans
    create_features.amount_of_loan_forgiven,  # Amount of Loan Forgiven
    create_features.revised_loan_amount,      # Revised Loan Approval
    create_features.days_with_loan_approval,  # Days With Loan
    create_features.create_borrower_zip5,     # ZIP5 Borrower Feature
    create_features.create_lender_zip5,       # ZIP5 Lender Feature
)

# Always start from the filtered frame so re-running this cell is repeatable.
_features_df = remove_territories_df
for _apply_step in _feature_steps:
    _features_df = _apply_step(data=_features_df)

clean_data_df = _features_df

### EDA

In [6]:
# Run every EDA output against the feature-engineered frame, in a fixed order.
# What each call renders is defined in the EDA.eda_outputs module, not in this notebook.
eda_outputs.eda_plots_missing_values_heatmap(data=clean_data_df)
eda_outputs.eda_correlation_heatmap(data=clean_data_df)
eda_outputs.eda_mapping_industries_and_count(data=clean_data_df)
eda_outputs.eda_spread_by_gender(data=clean_data_df)
eda_outputs.eda_average_loan_amount_by_industry_and_gender(data=clean_data_df)
eda_outputs.eda_average_loan_amount_by_lmi_indicator_by_industry(data=clean_data_df)
eda_outputs.eda_average_loan_amount_by_hubzone_indicator_by_industry(data=clean_data_df)
eda_outputs.eda_state_loan_count(data=clean_data_df)
eda_outputs.eda_state_loan_avg_amount(data=clean_data_df)
# The two ZIP-level calls additionally receive the GeoJSON data loaded earlier, passed as `counties`.
eda_outputs.eda_zip_loan_count(data=clean_data_df, counties=geojson_data)
eda_outputs.eda_zip_loan_avg(data=clean_data_df, counties=geojson_data)
eda_outputs.eda_time_series_gender_loan_amount(data=clean_data_df)
eda_outputs.eda_time_series_loan_amount(data=clean_data_df)
eda_outputs.eda_spend_amount_per_category(data=clean_data_df)
# NOTE(review): "indsutry" is the spelling of the method as called here; correcting it
# requires the matching rename in EDA.eda_outputs.
eda_outputs.eda_daily_spend_per_indsutry(data=clean_data_df)

### Selecting Modeling Features

In [7]:
# Build the modeling dataset from the feature-engineered frame.
ml_dataset_df = model_data_transformations.select_and_impute_features(
    data=clean_data_df,
)

### Sklearn Pipeline For Modeling Features

In [8]:
# Pass the modeling dataset through the project's sklearn pipeline step.
ml_sparse_df = model_data_transformations.sklearn_data_pipelines(
    data=ml_dataset_df,
)

### Splitting Into X and Y Variables

In [9]:
# split_data_x_y returns an indexable result: position 0 is kept as X, position 1 as y.
x_y_variables = data_split.split_data_x_y(data=ml_sparse_df)
X, y = x_y_variables[0], x_y_variables[1]

### XGBoost Regression Model

In [10]:
# Fit the model on the full X / y produced by the split cell and keep the returned object.
best_xgboost_model = create_model.create_xgboost_model(
    X=X,
    y=y,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits



eval_metric is not saved in Scikit-Learn meta.



### Model Feature Importance

In [11]:
# NOTE(review): this call takes no arguments, so it does not receive best_xgboost_model
# from this notebook — presumably the fitted model is obtained elsewhere; confirm in
# Model_Feature_Importance.feature_importance.
model_feature_importance.xgboost_feature_importance()

### Standardized Residuals

In [12]:
# Residual analysis takes the same X / y used for model creation, plus the
# feature-engineered frame.
standardized_residuals_df = residual_analysis.analyze_residual_data(
    X=X,
    y=y,
    clean_data_df=clean_data_df,
)

### Calculating Fraud Based on Residuals

In [13]:
# Derive the fraud frame from the residual-analysis output.
fraud_df = fraud_detection.standardized_residuals_percentiles(
    data=standardized_residuals_df,
)

### Creating Final DataFrame

In [14]:
# Combine the feature-engineered frame with the fraud frame into the final output.
final_df_output = final_df.create_final_dataframe(
    clean_data_df=clean_data_df,
    fraud_df=fraud_df,
)

# Preview the first rows of the final frame.
final_df_output.head()

Unnamed: 0,LoanNumber,DateApproved,BorrowerName,BorrowerCity,BorrowerState,Term,InitialApprovalAmount,CurrentApprovalAmount,ServicingLenderName,ServicingLenderCity,...,Revised_Loan_Amount,Days_With_Loan,Borrower_ZIP5,Servicing_Lender_ZIP5,Predicted_Loan_Amount_Owed,Actual_Loan_Amount_Owed,Residuals,standardized_residual,percentile,Fraud_Detection
0,5502308207,2020-08-08,"KAKIVIK ASSET MANAGEMENT, LLC",ANCHORAGE,AK,60,9571397.0,9538531.0,Northrim Bank,ANCHORAGE,...,-32866.0,307,99503,99503,5118.443359,77353.57,72235.13,1.9e-05,0.993723,Fraud
1,6110847106,2020-04-14,"ARCTIC SLOPE NATIVE ASSOCIATION, LTD.",BARROW,AK,24,7666768.0,7666768.0,"National Cooperative Bank, National Association",HILLSBORO,...,0.0,455,99723,45133,-6678.593262,94556.81,101235.4,2.6e-05,0.996344,Fraud
2,4539098204,2020-08-06,CORVUS AIRLINES INC,Anchorage,AK,60,7398947.89,7398947.89,Idaho First Bank,MCCALL,...,0.0,685,99502,83638,-621164.375,-2507517.52,-1886353.0,-0.00049,0.000185,Fraud
3,5120868804,2021-04-17,HOPE COMMUNITY RESOURCES INC.,Anchorage,AK,60,7223025.0,7223025.0,First National Bank Alaska,ANCHORAGE,...,0.0,514,99518,99503,-244771.453125,98945.55,343717.0,8.9e-05,0.999734,Fraud
4,6650277102,2020-04-14,SOUTH PENINSULA HOSPITAL INC,HOMER,AK,24,6528631.4,6528631.4,First National Bank Alaska,ANCHORAGE,...,0.0,399,99603,99503,50304.535156,69757.98,19453.44,5e-06,0.951551,Not Fraud
