In [98]:
# Importing dependencies
import pandas as pd

# Preprocessing

In [99]:
# Load the charity dataset from the Resources folder into a DataFrame
charity_df = pd.read_csv(filepath_or_buffer='Resources/charity_data.csv')

# Preview the first five rows
charity_df.head(n=5)


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


We are building a model that predicts whether a given applicant will be successful if funded by Alphabet Soup. The model is trained on features such as `APPLICATION_TYPE`, `AFFILIATION`, and `CLASSIFICATION` to predict the target feature `IS_SUCCESSFUL`.

In [100]:
# Drop the EIN and NAME columns so they are not carried into the encoded features.
# The result is reassigned (instead of using inplace=True) and errors='ignore' is
# passed so that re-running this cell is a no-op rather than a KeyError once the
# columns are already gone.
charity_df = charity_df.drop(columns=['EIN', 'NAME'], errors='ignore')
charity_df.head(5)

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [101]:
# Count the distinct (non-null) values in every column.
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.nunique.html
charity_df.nunique(axis=0, dropna=True)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [102]:
# Tally how many applications fall under each APPLICATION_TYPE value
type_count = charity_df.loc[:, 'APPLICATION_TYPE'].value_counts()
print(type_count)

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64


In [103]:
# Application types with fewer than this many applications are aggregated
# into the "Other" group.
APPLICATION_TYPE_CUTOFF = 500

# Filter type_count by its own values. Building the mask from a fresh
# value_counts() of the column is redundant, and once this cell has replaced
# the rare types that fresh mask no longer shares type_count's index, so a
# re-run of the cell would fail on the boolean indexing.
less_than_500 = type_count[type_count < APPLICATION_TYPE_CUTOFF].index

# Replace every rare application type with "Other".
# list() hands replace() a plain list of labels rather than an Index object.
charity_df['APPLICATION_TYPE'] = charity_df['APPLICATION_TYPE'].replace(list(less_than_500), 'Other')

# Confirm the binning was successful
charity_df['APPLICATION_TYPE'].value_counts()


T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [104]:
# Tally how many applications fall under each CLASSIFICATION value
class_count = charity_df.loc[:, 'CLASSIFICATION'].value_counts()
print(class_count)

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C1283        1
C1580        1
C2170        1
C3700        1
C5200        1
Name: CLASSIFICATION, Length: 71, dtype: int64


In [105]:
# Classifications with fewer than this many applications are aggregated
# into the "other" group.
CLASSIFICATION_CUTOFF = 1000

# Filter class_count by its own values. Building the mask from a fresh
# value_counts() of the column is redundant, and once this cell has replaced
# the rare classes that fresh mask no longer shares class_count's index, so a
# re-run of the cell would fail on the boolean indexing.
less_than_1000 = class_count[class_count < CLASSIFICATION_CUTOFF].index

# Replace every rare classification with "other".
# list() hands replace() a plain list of labels rather than an Index object.
charity_df['CLASSIFICATION'] = charity_df['CLASSIFICATION'].replace(list(less_than_1000), 'other')

# Confirm the replacement was successful
charity_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [106]:
# One-hot encode the categorical columns of charity_df; the already-numeric
# columns (STATUS, ASK_AMT, IS_SUCCESSFUL) are carried over unchanged.
numeric_df = pd.get_dummies(data=charity_df)
numeric_df.head(n=5)

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,108590,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,5000,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,6692,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,142590,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [107]:
# Separate the target from the features without mutating numeric_df.
# pop() removed the column in place, so running this cell a second time
# raised KeyError; selecting + drop() makes the cell safe to re-run.
# Note: numeric_df itself now keeps the IS_SUCCESSFUL column; use X for features.
TARGET_COLUMN = 'IS_SUCCESSFUL'
y = numeric_df[TARGET_COLUMN]               # target column
X = numeric_df.drop(columns=TARGET_COLUMN)  # every remaining column is a feature

In [110]:
from sklearn.model_selection import train_test_split

# Partition the features and target into training and testing subsets,
# using a fixed random_state so the split is reproducible.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts


In [111]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split, then to the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Compile , Train , Evaluate