In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from pandas import DataFrame

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Read charity_data.csv (path is relative to the notebook's working directory).
# pandas is already imported as `pd` in the first cell, so it is not re-imported here.
application_df = pd.read_csv("charity_data.csv")
# Preview the first five rows to confirm the file parsed as expected.
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
# Compute the 1.5 * IQR outlier fences for 'ASK_AMT'
# 1st and 3rd quartiles, taken in a single percentile call
Q1, Q3 = np.percentile(application_df['ASK_AMT'], [25, 75])
# Interquartile range
IQR = Q3 - Q1
# Distance from each quartile to its fence
outlier_step = 1.5 * IQR
# Values below the low fence or above the high fence count as outliers
low_outlier_step = Q1 - outlier_step
high_outlier_step = Q3 + outlier_step

In [5]:
# Flag each row whose 'ASK_AMT' lies outside the fences: 1 = outlier, 0 = within range
application_df['Outlier'] = np.where(
    application_df['ASK_AMT'].lt(low_outlier_step)
    | application_df['ASK_AMT'].gt(high_outlier_step),
    1,
    0,
)
application_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,Outlier
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1,0
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1,0
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1,0


In [6]:
# Drop rows that are 'ASK_AMT' outliers AND belong to one of the classifications
# selected from the previously calculated feature importance.
#
# Both conditions are combined with `&` only after each has been turned into a boolean
# mask. `&` binds tighter than `==`, so writing `mask & df['Outlier'] == 1` evaluates as
# `(mask & df['Outlier']) == 1`, which gives the intended rows only while 'Outlier'
# contains nothing but 0 and 1.
index_names = application_df[
    application_df['CLASSIFICATION'].isin(
        ['C2000', 'C1000', 'C1200', 'C1700', 'C5000', 'C3000', 'C2100']
    )
    & (application_df['Outlier'] == 1)
].index
# In-place drop: application_df itself is what the following cells read.
application_df.drop(index_names, inplace=True)

In [7]:
# Select the columns to be used as features (based on previously calculated feature
# importance), together with the 'Outlier' flag and the 'IS_SUCCESSFUL' target.
newdf = application_df[['ASK_AMT', 'AFFILIATION', 'ORGANIZATION', "APPLICATION_TYPE", "CLASSIFICATION", "INCOME_AMT", "USE_CASE", "Outlier", "IS_SUCCESSFUL"]]
# Work on an independent copy with the same columns, so the column list is written once
# and later transformations cannot write through to application_df.
newdf_ = newdf.copy()
newdf_

Unnamed: 0,ASK_AMT,AFFILIATION,ORGANIZATION,APPLICATION_TYPE,CLASSIFICATION,INCOME_AMT,USE_CASE,Outlier,IS_SUCCESSFUL
0,5000,Independent,Association,T10,C1000,0,ProductDev,0,1
2,5000,CompanySponsored,Association,T5,C3000,0,ProductDev,0,0
3,6692,CompanySponsored,Trust,T3,C2000,10000-24999,Preservation,0,1
5,5000,Independent,Trust,T3,C1200,0,Preservation,0,1
9,5000,CompanySponsored,Association,T5,C3000,0,ProductDev,0,0
...,...,...,...,...,...,...,...,...,...
34293,5000,CompanySponsored,Association,T3,C1000,0,Preservation,0,1
34294,5000,Independent,Association,T4,C1000,0,ProductDev,0,0
34295,5000,CompanySponsored,Association,T4,C3000,0,ProductDev,0,0
34296,5000,CompanySponsored,Association,T3,C2000,0,Preservation,0,0


In [8]:
# One-column frame of per-column dtypes; the unnamed dtypes Series becomes column 0
newdf__dtypes = newdf_.dtypes.to_frame()

In [9]:
# Label the index 'Column_Name' and the single column 'Data_Type'.
# rename_axis builds a new index rather than renaming the existing Index object in place.
newdf__dtypes = (
    newdf__dtypes
    .rename_axis("Column_Name", axis="index")
    .rename(columns={0: "Data_Type"})
)
# Display only: the flattened view is not stored.
newdf__dtypes.reset_index()

Unnamed: 0,Column_Name,Data_Type
0,ASK_AMT,int64
1,AFFILIATION,object
2,ORGANIZATION,object
3,APPLICATION_TYPE,object
4,CLASSIFICATION,object
5,INCOME_AMT,object
6,USE_CASE,object
7,Outlier,int32
8,IS_SUCCESSFUL,int64


In [10]:
# Keep only the columns whose dtype is object (the string-valued columns)
df_string = newdf__dtypes.loc[newdf__dtypes['Data_Type'].eq('object')]
df_string

Unnamed: 0_level_0,Data_Type
Column_Name,Unnamed: 1_level_1
AFFILIATION,object
ORGANIZATION,object
APPLICATION_TYPE,object
CLASSIFICATION,object
INCOME_AMT,object
USE_CASE,object


In [11]:
# Create the training variables by one-hot encoding the string columns with get_dummies().
# The encoding reads from `newdf` (the un-encoded selection with the same rows and columns)
# instead of overwriting its own input, so this cell can be re-run without a KeyError on
# the already-encoded column names.
# dtype is pinned to uint8 so the indicator columns stay numeric 0/1 regardless of the
# pandas version's default dummy dtype.
newdf_ = pd.get_dummies(
    newdf,
    columns=["AFFILIATION", "ORGANIZATION", "APPLICATION_TYPE", "CLASSIFICATION", "INCOME_AMT", "USE_CASE"],
    dtype="uint8",
)
newdf_.head()

Unnamed: 0,ASK_AMT,Outlier,IS_SUCCESSFUL,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,ORGANIZATION_Association,...,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev
0,5000,0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,5000,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,6692,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,5000,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,5000,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


# Split the Data into Training and Testing

In [12]:
# Target: the IS_SUCCESSFUL label
y = newdf_.loc[:, "IS_SUCCESSFUL"]
# Features: every column except the target and the helper 'Outlier' flag
X = newdf_.drop(["IS_SUCCESSFUL", "Outlier"], axis="columns")

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column in X
X.describe()

Unnamed: 0,ASK_AMT,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,ORGANIZATION_Association,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,...,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev
count,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,...,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0,26594.0
mean,119414.6,0.518087,0.000827,0.480184,0.000526,0.000113,0.000263,0.344476,0.007746,0.00109,...,0.00015,0.001918,0.031962,0.000376,0.000226,0.012898,0.000639,7.5e-05,0.814657,0.17173
std,10936560.0,0.499682,0.028751,0.499617,0.022939,0.010621,0.016222,0.475206,0.087672,0.033005,...,0.012263,0.043751,0.175902,0.019388,0.015019,0.112835,0.025276,0.008672,0.388583,0.377153
min,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,5000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,5000.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1736232000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Check the balance of our target values: number of rows in each IS_SUCCESSFUL class
y.value_counts()

1    14126
0    12468
Name: IS_SUCCESSFUL, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
# Split features and target into train/test parts using scikit-learn's default split size;
# the fixed random_state keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)
# Class counts in the training target
Counter(y_train)

Counter({1: 10611, 0: 9334})

In [16]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, so no test-set statistics leak into it.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### Balanced Random Forest Classifier

In [17]:
# Build a BalancedRandomForestClassifier (100 trees, fixed seed) and fit it on the
# scaled training data in one step
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train_scaled, y_train)

In [18]:
# Predict on the scaled test split
predictions = rf_model.predict(X_test_scaled)
# Confusion matrix of true labels against predicted labels
rf_cm = confusion_matrix(y_true=y_test, y_pred=predictions)
# Wrap it in a labelled DataFrame for display
rf_cm_df = pd.DataFrame(
    data=rf_cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
rf_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2306,828
Actual 1,857,2658


In [19]:
# Compute the balanced accuracy and the plain accuracy on the test set, each rounded to
# two decimals. balanced_accuracy_score and accuracy_score are already imported by
# earlier cells, so they are not re-imported here.
rf_bal_acc_score = round(balanced_accuracy_score(y_test, predictions), 2)
accuracy_score_ = round(accuracy_score(y_test, predictions), 2)
# Note: the value displayed by this cell is the plain accuracy, not the balanced score.
accuracy_score_

0.75

In [20]:
# Build the imbalanced-learn classification report, keep the text in rf_class_rpt, and print it
from imblearn.metrics import classification_report_imbalanced
rf_class_rpt = classification_report_imbalanced(y_true=y_test, y_pred=predictions)
print(rf_class_rpt)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.73      0.74      0.76      0.73      0.75      0.56      3134
          1       0.76      0.76      0.74      0.76      0.75      0.56      3515

avg / total       0.75      0.75      0.75      0.75      0.75      0.56      6649



In [21]:
from sklearn.metrics import recall_score
# Recall with each class in turn treated as the positive label (0 first, then 1)
rf_recall_0, rf_recall_1 = (
    recall_score(y_test, predictions, pos_label=label, average="binary")
    for label in (0, 1)
)

In [22]:
from sklearn.metrics import precision_score
# Precision with each class in turn treated as the positive label (0 first, then 1)
rf_precision_0, rf_precision_1 = (
    precision_score(y_test, predictions, pos_label=label, average="binary")
    for label in (0, 1)
)

In [23]:
# Show all Balanced Random Forest results together
print("Confusion Matrix")
display(rf_cm_df)
# One print call with a newline separator emits the same three lines of text
print(
    f"Balanced Accuracy Score : {rf_bal_acc_score}",
    "Classification Report",
    rf_class_rpt,
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2306,828
Actual 1,857,2658


Balanced Accuracy Score : 0.75
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.73      0.74      0.76      0.73      0.75      0.56      3134
          1       0.76      0.76      0.74      0.76      0.75      0.56      3515

avg / total       0.75      0.75      0.75      0.75      0.75      0.56      6649



In [24]:
# Allow up to 200 rows to be rendered so the whole importance table is visible
pd.set_option("display.max_rows", 200)
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
# Pair each feature name with its importance; sorting the pairs orders them
# alphabetically by feature name, which is what the resulting row labels reflect
sort_orders = sorted(zip(X.columns, importances))
feature_importances = (
    pd.DataFrame(sort_orders, columns=['Features', 'importance'])
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,Features,importance
0,AFFILIATION_CompanySponsored,0.243551
2,AFFILIATION_Independent,0.17238
18,ASK_AMT,0.06423
99,ORGANIZATION_Association,0.046793
102,ORGANIZATION_Trust,0.041468
56,CLASSIFICATION_C2100,0.040526
13,APPLICATION_TYPE_T5,0.039813
6,APPLICATION_TYPE_T10,0.035312
9,APPLICATION_TYPE_T19,0.032265
12,APPLICATION_TYPE_T4,0.026519
