# Feature Processing Step 1

### Imports

In [1]:
import numpy as np
import pandas as pd

### Import Source Data

In [2]:
# Read the raw charity records into the notebook-wide working frame.
df = pd.read_csv(filepath_or_buffer="charity_data.csv")

## Feature Analysis

### Number of rows in the data set

In [3]:
# Use len(df) for the row count: df.EIN.count() only counts non-null EIN
# values and would under-report if any EIN were missing.
print(f"The number of rows in the data set is: {len(df)}")

The number of rows in the data set is: 34299


### Drop rows with NaN field values

In [4]:
# DataFrame.dropna() returns a new frame and leaves `df` untouched, so the
# result must be assigned back for the NaN rows to actually be removed.
df = df.dropna()
df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [5]:
# Use len(df) for the row count: df.EIN.count() only counts non-null EIN
# values, which is not the same thing as the number of rows.
print(f"The number of rows in the data set after droping rows with Nan field values is: {len(df)}")

The number of rows in the data set after droping rows with Nan field values is: 34299


### Features Data Types

In [6]:
# Show the pandas dtype of every column; `object` columns hold strings here.
df.dtypes

EIN                        int64
NAME                      object
APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                object
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

### Features Unique Value Counts

In [7]:
# Distinct non-null values per column (defaults spelled out for clarity).
df.nunique(axis=0, dropna=True)

EIN                       34299
NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64

## Individual Feature Analysis

### Feature Describe Function

In [8]:
def describe_feature(feature_name, data=None):
    """Summarise the categories of one feature column.

    Prints how many distinct categories the column holds and returns the
    per-category row counts.

    Parameters
    ----------
    feature_name : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used (looked up at call time), so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.Series
        Row count per category, as produced by ``Series.value_counts()``.
    """
    frame = df if data is None else data

    # value_counts() yields one entry per distinct category, so its length
    # is the number of unique categories in the column.
    category_row_counts = frame[feature_name].value_counts()

    print(f"The number of unique catigories for feature: {feature_name} is: {len(category_row_counts)}")

    return category_row_counts

###  Feature APPLICATION_TYPE

In [9]:
# Row counts per APPLICATION_TYPE category (first 100 at most).
describe_feature(feature_name="APPLICATION_TYPE").head(n=100)

The number of unique catigories for feature: APPLICATION_TYPE is: 17


T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T15        2
T29        2
T17        1
Name: APPLICATION_TYPE, dtype: int64

### Feature AFFILIATION

In [10]:
# Row counts per AFFILIATION category (first 100 at most).
describe_feature(feature_name="AFFILIATION").head(n=100)

The number of unique catigories for feature: AFFILIATION is: 6


Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64

### Feature CLASSIFICATION

In [11]:
# Print every CLASSIFICATION category with its row count as plain text so the
# listing is not elided the way a long Series repr would be.
classification_ser = describe_feature(feature_name="CLASSIFICATION").head(n=100)
for classification, row_count in zip(classification_ser.index, classification_ser):
    print(f"{classification}      {row_count}")

The number of unique catigories for feature: CLASSIFICATION is: 71
C1000      17326
C2000      6074
C1200      4837
C3000      1918
C2100      1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800      95
C7100      75
C1300      58
C1280      50
C1230      36
C1400      34
C2300      32
C7200      32
C1240      30
C8000      20
C7120      18
C1500      16
C6000      15
C1800      15
C1250      14
C8200      11
C1238      10
C1278      10
C1237      9
C1235      9
C7210      7
C1720      6
C2400      6
C4100      6
C1600      5
C1257      5
C0      3
C2710      3
C1260      3
C1256      2
C1234      2
C1246      2
C1267      2
C3200      2
C2570      1
C1900      1
C3700      1
C8210      1
C6100      1
C2150      1
C4200      1
C2170      1
C1236      1
C4120      1
C2561      1
C1820      1
C1728      1
C2600      1
C4500      1
C1283      1
C1248      1
C5200      1
C2190      1
C2380      1
C1580      1
C1370      1
C1570      1
C1245 

### Feature USE_CASE

In [12]:
# Row counts per USE_CASE category (first 100 at most).
describe_feature(feature_name="USE_CASE").head(n=100)

The number of unique catigories for feature: USE_CASE is: 5


Preservation     28095
ProductDev        5671
CommunityServ      384
Heathcare          146
Other                3
Name: USE_CASE, dtype: int64

### Feature ORGANIZATION

In [13]:
# Row counts per ORGANIZATION category (first 100 at most).
describe_feature(feature_name="ORGANIZATION").head(n=100)

The number of unique catigories for feature: ORGANIZATION is: 4


Trust           23515
Association     10255
Co-operative      486
Corporation        43
Name: ORGANIZATION, dtype: int64

### Feature STATUS

In [14]:
# Row counts per STATUS value (first 100 at most).
describe_feature(feature_name="STATUS").head(n=100)

The number of unique catigories for feature: STATUS is: 2


1    34294
0        5
Name: STATUS, dtype: int64

### Feature INCOME_AMT

In [15]:
# Row counts per INCOME_AMT bracket (first 100 at most).
describe_feature(feature_name="INCOME_AMT").head(n=100)

The number of unique catigories for feature: INCOME_AMT is: 9


0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64

### Feature SPECIAL_CONSIDERATIONS

In [16]:
# Row counts per SPECIAL_CONSIDERATIONS value (first 100 at most).
describe_feature(feature_name="SPECIAL_CONSIDERATIONS").head(n=100)

The number of unique catigories for feature: SPECIAL_CONSIDERATIONS is: 2


N    34272
Y       27
Name: SPECIAL_CONSIDERATIONS, dtype: int64

### Feature ASK_AMT

In [17]:
# Summary statistics for ASK_AMT, rendered as fixed-width strings with
# thousands separators and two decimals so the large values stay readable.
(
    df["ASK_AMT"]
    .describe()
    .apply("{:15,.2f}".format)
    .head(100)
)

count           34,299.00
mean         2,769,198.68
std         87,130,452.44
min              5,000.00
25%              5,000.00
50%              5,000.00
75%              7,742.00
max      8,597,806,340.00
Name: ASK_AMT, dtype: object

# Dimensionality Reduction - Feature Elimination

In [18]:
# NAME and EIN are identity features: they identify individual
# organizations rather than describe a property shared across the
# entities in the data set, so they should not be used to construct
# the predictive model.
#
# STATUS and SPECIAL_CONSIDERATIONS are dominated by a single value
# (34,294 of 34,299 rows and 34,272 of 34,299 rows respectively), so
# they vary too little to add predictive value to the model.

# Rebind `df` instead of dropping in place, and ignore labels that are
# already gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=["NAME", "EIN", "STATUS", "SPECIAL_CONSIDERATIONS"], errors="ignore")

# Save Step 1 Preprocessed Features

In [19]:
# Persist the reduced feature set for the next preprocessing step; the row
# index is not written to the file.
df.to_csv(path_or_buf="charity_data.preprocess.1.feature_elimination.csv", index=False)