# Dependencies

In [1]:
# Import our dependencies
import pandas as pd
import matplotlib as plt
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Helper Functions

In [2]:
# Generate our categorical variable list
def cat_columns(df):
    """Return the names of the columns in ``df`` whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list
        Column labels, in their original order, for which the dtype
        compares equal to ``"object"``.
    """
    return [
        label
        for label, column_dtype in df.dtypes.items()
        if column_dtype == "object"
    ]

# Get unique values in each column with categorical values
def vcount_summary(df, cat_list):
    """Print a value-count report for each column named in ``cat_list``.

    For every column the report shows the column name, the number of
    distinct values, the full ``value_counts()`` table and a separator line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to summarise.
    cat_list : iterable
        Column labels to report on, processed in the order given.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    for column_name in cat_list:
        column = df[column_name]
        # One print call, one item per line: same text as four separate prints.
        print(
            f'Column: {column_name}',
            f'Unique Values: {column.nunique()}',
            column.value_counts(),
            '-------------',
            sep='\n',
        )
        
# Replace values that occur fewer than `cutoff` times in a column with the label 'Other'
def other_bucket(df, column, cutoff):
    """Relabel infrequent values of ``df[column]`` as ``'Other'``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the named column is overwritten on this object.
    column : hashable
        Label of the column to bucket.
    cutoff : int
        Values whose count is strictly below this threshold are relabelled.

    Returns
    -------
    None
        The caller's frame is updated directly.
    """
    frequency = df[column].value_counts()
    rare_values = frequency.index[frequency < cutoff]

    # Single vectorised pass: every entry that is one of the rare values
    # becomes 'Other', everything else is left untouched.
    is_rare = df[column].isin(rare_values)
    df[column] = df[column].mask(is_rare, 'Other')

# Encode categorical columns and merge with primary dataframe
def encode_merge(df, cat_list, encoder=None):
    """One-hot encode the listed columns and swap them in for the originals.

    Each column in ``cat_list`` is passed through ``encoder.fit_transform``;
    the resulting indicator columns are appended to the frame and the source
    column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode. It is not modified; a new
        frame is returned.
    cat_list : iterable
        Labels of the columns to encode, processed in the order given.
    encoder : object, optional
        Encoder exposing ``fit_transform`` and either
        ``get_feature_names_out`` or the older ``get_feature_names``.
        When omitted, the notebook-level ``enc`` object is used, which keeps
        existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the encoded columns, with
        the same index as ``df``.
    """
    if encoder is None:
        # Backward compatibility: earlier callers relied on a global `enc`.
        encoder = enc

    for column in cat_list:
        encoded = encoder.fit_transform(df[column].values.reshape(-1, 1))
        if hasattr(encoded, "toarray"):
            # Sparse output cannot be handed to the DataFrame constructor directly.
            encoded = encoded.toarray()

        # `get_feature_names` was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only for older releases.
        if hasattr(encoder, "get_feature_names_out"):
            feature_names = list(encoder.get_feature_names_out([column]))
        else:
            feature_names = list(encoder.get_feature_names([column]))

        # Reuse df's index so rows stay aligned even when the frame does not
        # carry a default 0..n-1 index (e.g. after filtering).
        encode_df = pd.DataFrame(encoded, columns=feature_names, index=df.index)

        # `drop(column, 1)` (positional axis) was removed in pandas 2.0.
        df = pd.concat([df.drop(columns=column), encode_df], axis=1)
    return df

# Step 1: Prep Data for Model

In [3]:
# Load the charity dataset, discard the NAME column, and preview the first rows
charity_df = pd.read_csv("charity_data.csv").drop(columns=['NAME'])
charity_df.head()

Unnamed: 0,EIN,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
# Report the distinct values (and their counts) of every object-dtype column
vcount_summary(charity_df, cat_columns(charity_df))

Column: APPLICATION_TYPE
Unique Values: 17
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T14        3
T25        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64
-------------
Column: AFFILIATION
Unique Values: 6
Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64
-------------
Column: CLASSIFICATION
Unique Values: 71
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C1570        1
C4500        1
C1245        1
C2170        1
C4200        1
Name: CLASSIFICATION, Length: 71, dtype: int64
-------------
Column: USE_CASE
Unique Values: 5
Preservation     28095
ProductDev        5671
CommunityServ      384
Heathcare          146
Other                3
Name: USE_CASE, dtype: int64
-

In [5]:
# Collapse low-count values into 'Other', one (column, cutoff) pair at a time.
# The pairs are applied in the order listed.
for bucket_column, bucket_cutoff in (
    ('APPLICATION_TYPE', 100),
    ('CLASSIFICATION', 100),
    ('ORGANIZATION', 1000),
    ('AFFILIATION', 100),
    ('USE_CASE', 1000),
):
    other_bucket(df=charity_df, column=bucket_column, cutoff=bucket_cutoff)

In [6]:
# Re-run the value-count report to confirm rare values were folded into 'Other'
vcount_summary(charity_df, cat_columns(charity_df))

Column: APPLICATION_TYPE
Unique Values: 10
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
Other      120
Name: APPLICATION_TYPE, dtype: int64
-------------
Column: AFFILIATION
Unique Values: 3
Independent         18480
CompanySponsored    15705
Other                 114
Name: AFFILIATION, dtype: int64
-------------
Column: CLASSIFICATION
Unique Values: 12
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
Other      669
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Name: CLASSIFICATION, dtype: int64
-------------
Column: USE_CASE
Unique Values: 3
Preservation    28095
ProductDev       5671
Other             533
Name: USE_CASE, dtype: int64
-------------
Column: ORGANIZATION
Unique Values: 3
Trust          23515
Association    10255
Other            529
Name: ORGANIZATION, dtype: int64
-------------
Column: INCOME_AMT
Unique Values: 

In [7]:
# Encode the bucketed columns and merge the encoded dataframes with primary dataframe.
# A dense (non-sparse) result is requested so it can be wrapped in a DataFrame.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# name, so try the new keyword first and fall back for older releases.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
charity_df = encode_merge(charity_df, cat_list=cat_columns(charity_df))

In [8]:
# Standardize every column of the encoded frame in one step.
# NOTE(review): the scaler is fitted on all rows before any train/test split
# takes place later in the notebook — confirm this is intended.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(charity_df)

# Rebuild a DataFrame from the scaled array, then replace the scaled target
# column with the original IS_SUCCESSFUL values (placed as the last column).
transformed_scaled_data = (
    pd.DataFrame(scaled_data, columns=charity_df.columns)
    .drop(columns='IS_SUCCESSFUL')
    .assign(IS_SUCCESSFUL=charity_df['IS_SUCCESSFUL'])
)
transformed_scaled_data.head()

Unnamed: 0,EIN,STATUS,ASK_AMT,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y,IS_SUCCESSFUL
0,-2.074966,0.012075,-0.031725,-0.059253,7.997514,-0.179013,-1.929528,-0.216965,-0.188176,-0.191719,...,-0.126831,-0.330307,-0.083944,-0.169236,-0.350205,-0.063789,-0.073641,0.028068,-0.028068,1
1,-2.074921,0.012075,-0.030536,-0.059253,-0.125039,-0.179013,0.518261,-0.216965,-0.188176,-0.191719,...,-0.126831,-0.330307,-0.083944,-0.169236,-0.350205,-0.063789,-0.073641,0.028068,-0.028068,1
2,-2.074854,0.012075,-0.031725,-0.059253,-0.125039,-0.179013,-1.929528,-0.216965,5.314171,-0.191719,...,-0.126831,-0.330307,-0.083944,-0.169236,-0.350205,-0.063789,-0.073641,0.028068,-0.028068,0
3,-2.074833,0.012075,-0.031706,-0.059253,-0.125039,-0.179013,0.518261,-0.216965,-0.188176,-0.191719,...,7.884526,-0.330307,-0.083944,-0.169236,-0.350205,-0.063789,-0.073641,0.028068,-0.028068,1
4,-2.074821,0.012075,-0.030146,-0.059253,-0.125039,-0.179013,0.518261,-0.216965,-0.188176,-0.191719,...,-0.126831,3.027487,-0.083944,-0.169236,-0.350205,-0.063789,-0.073641,0.028068,-0.028068,1


# Step 2: Run Models and Compare Accuracy

### Attempt 1.1: Deep Neural Network (6 hidden layers: 100, 75, 50, 25, 10 and 5 neurons)

In [16]:
# Split our preprocessed data into our features and target arrays
y = transformed_scaled_data["IS_SUCCESSFUL"].values
# Use the `columns=` keyword: the positional axis form `drop(labels, 1)`
# was removed in pandas 2.0 and raises a TypeError there.
X = transformed_scaled_data.drop(columns=["IS_SUCCESSFUL", "EIN"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Fit the StandardScaler on the training split only
X_scaler = scaler.fit(X_train)

# Scale the data
# NOTE: the model below is trained on X_train (and evaluated on X_test in the
# next cell); X already comes from the standardized frame, and these re-scaled
# arrays are not used by this cell.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  100
hidden_nodes_layer2 =  75
hidden_nodes_layer3 =  50
hidden_nodes_layer4 =  25
hidden_nodes_layer5 =  10
hidden_nodes_layer6 =  5

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
# Add hidden layers
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer6, activation="relu"))
# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train,y_train,epochs=100)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 100)               4500      
_________________________________________________________________
dense_9 (Dense)              (None, 75)                7575      
_________________________________________________________________
dense_10 (Dense)             (None, 50)                3800      
_________________________________________________________________
dense_11 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_12 (Dense)             (None, 10)                260       
_________________________________________________________________
dense_13 (Dense)             (None, 5)                 55        
_________________________________________________________________
dense_14 (Dense)             (None, 1)                

Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [17]:
# Score the trained network on the held-out split and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5624 - accuracy: 0.7255
Loss: 0.5623919367790222, Accuracy: 0.7254810333251953


### Attempt 1.2: Deep Neural Network (2 hidden layers: 8 and 5 neurons)

In [18]:
# Split our preprocessed data into our features and target arrays
y = transformed_scaled_data["IS_SUCCESSFUL"].values
# Use the `columns=` keyword: the positional axis form `drop(labels, 1)`
# was removed in pandas 2.0 and raises a TypeError there.
X = transformed_scaled_data.drop(columns=["IS_SUCCESSFUL", "EIN"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Fit the StandardScaler on the training split only
X_scaler = scaler.fit(X_train)

# Scale the data
# NOTE: the model below is trained on X_train (and evaluated on X_test in the
# next cell); X already comes from the standardized frame, and these re-scaled
# arrays are not used by this cell.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 =  5

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
# Add hidden layers
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train,y_train,epochs=100)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 8)                 360       
_________________________________________________________________
dense_16 (Dense)             (None, 5)                 45        
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 6         
Total params: 411
Trainable params: 411
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch

Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [19]:
# Score the trained network on the held-out split and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5519 - accuracy: 0.7257
Loss: 0.5518501996994019, Accuracy: 0.7257142663002014


### Attempt 2: SVM

In [11]:
# Split our preprocessed data into our features and target arrays
y = transformed_scaled_data["IS_SUCCESSFUL"].values
# Use the `columns=` keyword: the positional axis form `drop(labels, 1)`
# was removed in pandas 2.0 and raises a TypeError there.
X = transformed_scaled_data.drop(columns=["IS_SUCCESSFUL", "EIN"]).values

# Split the preprocessed data into a training and testing dataset
# (stratified on the target so both splits keep the same class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Fit the StandardScaler on the training split only
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create the SVM model
svm = SVC(kernel='linear')

# Train the model on the scaled training features so that training and
# prediction see identically transformed inputs (prediction below already
# uses X_test_scaled).
svm.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.720


### Attempt 3: Logistic Regression

In [12]:
# Split our preprocessed data into our features and target arrays
y = transformed_scaled_data["IS_SUCCESSFUL"].values
# Use the `columns=` keyword: the positional axis form `drop(labels, 1)`
# was removed in pandas 2.0 and raises a TypeError there.
X = transformed_scaled_data.drop(columns=["IS_SUCCESSFUL", "EIN"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Fit the StandardScaler on the training split only
X_scaler = scaler.fit(X_train)

# Scale the data
# NOTE: the classifier below is fitted on X_train and scored on X_test; X
# already comes from the standardized frame, and these re-scaled arrays are
# not used by this cell.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.719


### Attempt 4: Random Forest

In [13]:
# Split our preprocessed data into our features and target arrays
y = transformed_scaled_data["IS_SUCCESSFUL"].values
# Use the `columns=` keyword: the positional axis form `drop(labels, 1)`
# was removed in pandas 2.0 and raises a TypeError there.
X = transformed_scaled_data.drop(columns=["IS_SUCCESSFUL", "EIN"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Fit the StandardScaler on the training split only
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier (fixed random_state for repeatable trees).
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model on the scaled training features
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model on the identically scaled test features
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.709


## Highest accuracy rate reached: 72.6% (Attempt 1.2, deep neural network with 2 hidden layers)