In [64]:
# Importing dependencies
import pandas as pd

# Preprocessing

In [65]:
# Load the charity dataset from the Resources folder into a DataFrame
charity_df = pd.read_csv(filepath_or_buffer='Resources/charity_data.csv')

# Preview the first five rows to check the columns were parsed as expected
charity_df.head(n=5)


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


We are trying to build a model that can predict whether a given applicant will be successful if funded by Alphabet Soup. The model will be trained on features such as APPLICATION_TYPE, AFFILIATION, CLASSIFICATION, etc. to predict the target feature, IS_SUCCESSFUL.

In [66]:
# Drop the EIN and NAME identifier columns.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the
# columns are already gone no longer raises a KeyError.
charity_df = charity_df.drop(columns=['EIN', 'NAME'], errors='ignore')
charity_df.head(5)

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [67]:
# Count the distinct (non-null) values in each column; columns with many
# distinct categories are the ones examined for binning in the cells below.
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.nunique.html
charity_df.nunique(axis=0, dropna=True)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [68]:
# Tally how many applications fall under each APPLICATION_TYPE value
type_count = charity_df.loc[:, 'APPLICATION_TYPE'].value_counts()
print(type_count)

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64


In [69]:
# Cutoff for binning: application types with fewer than this many
# applications are aggregated into the "Other" group.
APPLICATION_TYPE_CUTOFF = 500

# Build the mask from type_count itself rather than from a second
# value_counts() call on the live column. The original mixed the two, so once
# the column had been binned the mask's index ("Other", ...) no longer matched
# type_count's index and re-running this cell failed on the unalignable
# boolean indexer.
less_than_500 = type_count[type_count < APPLICATION_TYPE_CUTOFF].index

# Replace every rare application type with "Other"
charity_df['APPLICATION_TYPE'] = charity_df['APPLICATION_TYPE'].replace(less_than_500, 'Other')

# Confirm the binning was successful
charity_df['APPLICATION_TYPE'].value_counts()


T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [72]:
# Tally how many applications fall under each CLASSIFICATION value
class_count = charity_df.loc[:, 'CLASSIFICATION'].value_counts()
print(class_count)

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C1283        1
C1580        1
C2170        1
C3700        1
C5200        1
Name: CLASSIFICATION, Length: 71, dtype: int64


In [77]:
# Cutoff is set at 1000: collect the classifications that have fewer than
# 1000 applications (presumably to be binned in a later cell - confirm).
# The mask is derived from class_count itself instead of a second
# value_counts() call, so the mask and the Series it filters always share
# the same index and the counts are computed only once.
less_than_1000 = class_count[class_count < 1000].index
print(less_than_1000)

Index(['C7000', 'C1700', 'C4000', 'C5000', 'C1270', 'C2700', 'C2800', 'C7100',
       'C1300', 'C1280', 'C1230', 'C1400', 'C2300', 'C7200', 'C1240', 'C8000',
       'C7120', 'C1500', 'C1800', 'C6000', 'C1250', 'C8200', 'C1278', 'C1238',
       'C1235', 'C1237', 'C7210', 'C1720', 'C2400', 'C4100', 'C1257', 'C1600',
       'C0', 'C2710', 'C1260', 'C1234', 'C1267', 'C1246', 'C3200', 'C1256',
       'C1245', 'C2500', 'C4200', 'C1570', 'C1820', 'C4500', 'C1236', 'C8210',
       'C6100', 'C2380', 'C2570', 'C2190', 'C2150', 'C1370', 'C1728', 'C2561',
       'C4120', 'C1732', 'C2600', 'C1248', 'C1900', 'C1283', 'C1580', 'C2170',
       'C3700', 'C5200'],
      dtype='object')
