In [31]:
# Import our dependencies
#Add Dependencies
import pandas as pd
import numpy as np
import psycopg2 as pg
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
# Read in our ramen data
charity_df = pd.read_csv("charity_data.csv")
charity_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [32]:
#Dropping unnecessary columns
charity_df = charity_df.drop(columns=["CLASSIFICATION","APPLICATION_TYPE","EIN"],axis=1)
charity_df.head()

Unnamed: 0,NAME,AFFILIATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,BLUE KNIGHTS MOTORCYCLE CLUB,Independent,ProductDev,Association,1,0,N,5000,1
1,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,Independent,Preservation,Co-operative,1,1-9999,N,108590,1
2,ST CLOUD PROFESSIONAL FIREFIGHTERS,CompanySponsored,ProductDev,Association,1,0,N,5000,0
3,SOUTHSIDE ATHLETIC ASSOCIATION,CompanySponsored,Preservation,Trust,1,10000-24999,N,6692,1
4,GENETIC RESEARCH INSTITUTE OF THE DESERT,Independent,Heathcare,Trust,1,100000-499999,N,142590,1


In [33]:
charity_df = charity_df.dropna()

In [34]:
columns =['NAME','AFFILIATION','USE_CASE','ORGANIZATION','INCOME_AMT','STATUS','SPECIAL_CONSIDERATIONS','ASK_AMT','IS_SUCCESSFUL']

target = ['IS_SUCCESSFUL']

In [35]:
charity_df = charity_df.loc[:, columns].copy()
charity_df.shape

(34299, 9)

In [36]:
# Create a name Dataframe.
name_df=pd.DataFrame(data=charity_df['NAME'],copy=True)
name_df.head()

Unnamed: 0,NAME
0,BLUE KNIGHTS MOTORCYCLE CLUB
1,AMERICAN CHESAPEAKE CLUB CHARITABLE TR
2,ST CLOUD PROFESSIONAL FIREFIGHTERS
3,SOUTHSIDE ATHLETIC ASSOCIATION
4,GENETIC RESEARCH INSTITUTE OF THE DESERT


In [37]:
# Remove the name from the dataframe
charity_df = charity_df.drop('NAME',1)
charity_df.head()

Unnamed: 0,AFFILIATION,USE_CASE,ORGANIZATION,INCOME_AMT,STATUS,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,Independent,ProductDev,Association,0,1,N,5000,1
1,Independent,Preservation,Co-operative,1-9999,1,N,108590,1
2,CompanySponsored,ProductDev,Association,0,1,N,5000,0
3,CompanySponsored,Preservation,Trust,10000-24999,1,N,6692,1
4,Independent,Heathcare,Trust,100000-499999,1,N,142590,1


In [38]:
# Generate our categorical variable list 
charity_cat = charity_df.dtypes[charity_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
charity_df[charity_cat].nunique()

AFFILIATION               6
USE_CASE                  5
ORGANIZATION              4
INCOME_AMT                9
SPECIAL_CONSIDERATIONS    2
dtype: int64

In [39]:
# Check the affiliation unique value counts to see if binning is required
affiliation_counts= charity_df.AFFILIATION.value_counts()
affiliation_counts

Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64

In [40]:
# Determine which values to replace for bucketing
replace_affiliation = list(affiliation_counts[affiliation_counts < 1000].index)
replace_affiliation

['Family/Parent', 'National', 'Regional', 'Other']

In [41]:
# Replace in DataFrame
for AFFILIATION in replace_affiliation:
    charity_df.AFFILIATION = charity_df.AFFILIATION.replace(AFFILIATION,"Other")

# Check to make sure binning was successful
charity_df["AFFILIATION"].value_counts()

Independent         18480
CompanySponsored    15705
Other                 114
Name: AFFILIATION, dtype: int64

In [42]:
# Check the income unique value counts to see if binning is required
income_counts= charity_df.INCOME_AMT.value_counts()
income_counts

0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64

In [43]:
# Determine which values to replace for bucketing
replace_income = list(income_counts[income_counts < 800].index)
replace_income

['1-9999', '10000-24999', '10M-50M', '5M-10M', '50M+']

In [44]:
# Replace in DataFrame
for INCOME_AMT in replace_income:
    charity_df.INCOME_AMT = charity_df.INCOME_AMT.replace(INCOME_AMT,"Other")

# Check to make sure binning was successful
charity_df["INCOME_AMT"].value_counts()

0                24388
25000-99999       3747
100000-499999     3374
Other             1835
1M-5M              955
Name: INCOME_AMT, dtype: int64

In [95]:
# Filter 
charity_df = charity_df[charity_df['ASK_AMT'] <= 63000] 
charity_df.shape

(28829, 8)

In [96]:
# number of columns we will obtain after one hot encoding cat variables
encode_df=pd.get_dummies(charity_df[charity_cat], drop_first=True)
encode_df.shape

(28829, 14)

In [97]:
# Merge one-hot encoded features and drop the originals
charity2_df = charity_df.merge(encode_df,left_index=True, right_index=True)
charity2_df = charity2_df.drop(charity_cat,1)
charity2_df.head(2)

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,AFFILIATION_Independent,AFFILIATION_Other,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,INCOME_AMT_100000-499999,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_Other,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,5000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [98]:
charity2_df.dtypes

STATUS                       int64
ASK_AMT                      int64
IS_SUCCESSFUL                int64
AFFILIATION_Independent      uint8
AFFILIATION_Other            uint8
USE_CASE_Heathcare           uint8
USE_CASE_Other               uint8
USE_CASE_Preservation        uint8
USE_CASE_ProductDev          uint8
ORGANIZATION_Co-operative    uint8
ORGANIZATION_Corporation     uint8
ORGANIZATION_Trust           uint8
INCOME_AMT_100000-499999     uint8
INCOME_AMT_1M-5M             uint8
INCOME_AMT_25000-99999       uint8
INCOME_AMT_Other             uint8
SPECIAL_CONSIDERATIONS_Y     uint8
dtype: object

In [99]:
# Define the features set.
X = charity2_df.copy()
X = X.drop('IS_SUCCESSFUL', axis=1)
X.shape

(28829, 16)

In [100]:
# Define the target set(output label)
y = charity_df['IS_SUCCESSFUL']
y.head(2)

0    1
2    0
Name: IS_SUCCESSFUL, dtype: int64

In [101]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [102]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

In [103]:
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

In [104]:
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [105]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [106]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [107]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 1, 0, ..., 0, 0, 1])

In [108]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2399,991
Actual 1,1162,2656


In [109]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.7013041065482797

In [110]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2399,991
Actual 1,1162,2656


Accuracy Score : 0.7013041065482797
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.71      0.69      3390
           1       0.73      0.70      0.71      3818

    accuracy                           0.70      7208
   macro avg       0.70      0.70      0.70      7208
weighted avg       0.70      0.70      0.70      7208



In [111]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([5.88095219e-04, 3.14881865e-01, 5.13271258e-01, 6.29066262e-03,
       6.44797070e-04, 9.97548334e-05, 1.26433472e-02, 2.16293159e-02,
       8.72299264e-03, 2.73849832e-03, 9.38077798e-02, 3.75759051e-03,
       1.00514295e-03, 6.92603693e-03, 1.23724872e-02, 6.20375674e-04])

In [112]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5132712582932315, 'AFFILIATION_Independent'),
 (0.31488186477029595, 'ASK_AMT'),
 (0.0938077798126058, 'ORGANIZATION_Trust'),
 (0.021629315890197745, 'USE_CASE_ProductDev'),
 (0.012643347241624815, 'USE_CASE_Preservation'),
 (0.012372487220405473, 'INCOME_AMT_Other'),
 (0.008722992641167857, 'ORGANIZATION_Co-operative'),
 (0.006926036930492342, 'INCOME_AMT_25000-99999'),
 (0.006290662618371397, 'AFFILIATION_Other'),
 (0.003757590512410113, 'INCOME_AMT_100000-499999'),
 (0.002738498320495339, 'ORGANIZATION_Corporation'),
 (0.0010051429532385277, 'INCOME_AMT_1M-5M'),
 (0.0006447970698068841, 'USE_CASE_Heathcare'),
 (0.0006203756735295221, 'SPECIAL_CONSIDERATIONS_Y'),
 (0.0005880952187568521, 'STATUS'),
 (9.975483336995748e-05, 'USE_CASE_Other')]

In [114]:
# Create the Keras Sequential model
import tensorflow as tf
nn_model = tf.keras.models.Sequential()

In [115]:
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="relu", input_dim=2))

In [116]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

In [117]:
# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1)                 3         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2         
Total params: 5
Trainable params: 5
Non-trainable params: 0
_________________________________________________________________


In [118]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [119]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, <class 'pandas.core.series.Series'>