# Feature Preprocessing Step 2 - One Hot Encoding

### Imports

In [1]:
import numpy as np
import pandas as pd
from   sklearn.preprocessing import OneHotEncoder

### Import Preprocessed Step 1 Feature Data

In [2]:
# Load the step 1 (feature elimination) output into a DataFrame
step_1_csv = "charity_data.preprocess.1.feature_elimination.csv"
df = pd.read_csv(step_1_csv)

In [3]:
# len(df) counts every row; Series.count() only counts non-missing values
# in a single column, which is not the same thing as the row count.
print(f"The number of rows in the data set is: {len(df)}")

The number of rows in the data set is: 34299


In [4]:
# Show each column's dtype; the object-typed columns are the ones picked up
# as categorical fields further down.
df.dtypes

APPLICATION_TYPE    object
AFFILIATION         object
CLASSIFICATION      object
USE_CASE            object
ORGANIZATION        object
INCOME_AMT          object
ASK_AMT              int64
IS_SUCCESSFUL        int64
dtype: object

## Encoding Categorical Features
#### Except APPLICATION_TYPE and CLASSIFICATION

In [5]:
# Bucketing will be applied to the features APPLICATION_TYPE and CLASSIFICATION,
# using different row-count cutoff values for changing the classification value to "other"

In [6]:
# Build the list of categorical (object-typed) columns to one-hot encode
column_dtypes = df.dtypes
catigorical_fields = column_dtypes[column_dtypes == "object"].index.tolist()

# APPLICATION_TYPE and CLASSIFICATION are excluded from this encoding pass;
# list.remove() raises ValueError if either column is unexpectedly absent.
for bucketed_field in ('APPLICATION_TYPE', 'CLASSIFICATION'):
    catigorical_fields.remove(bucketed_field)

In [7]:
# Create a OneHotEncoder instance that returns a dense int64 array
# (its output is passed straight to the DataFrame constructor below).
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int64)
except TypeError:
    encoder = OneHotEncoder(sparse=False, dtype=np.int64)

In [8]:
# Learn the categories of each selected column and encode them in one pass,
# then wrap the resulting array in a DataFrame (default integer column labels)
categorical_values = df[catigorical_fields]
encoded_values = encoder.fit_transform(categorical_values)
encoded_df = pd.DataFrame(encoded_values)

In [9]:
# Label the encoded columns as "<source column>_<category>".
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back only on releases
# that predate it.
if hasattr(encoder, "get_feature_names_out"):
    encoded_df.columns = encoder.get_feature_names_out(catigorical_fields)
else:
    encoded_df.columns = encoder.get_feature_names(catigorical_fields)

In [10]:
# Swap the original categorical columns for their one-hot encoded versions.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts; dropping without `inplace` keeps the whole
# step in a single assignment. Rows are matched on the index of both frames.
df = (
    df.drop(columns=catigorical_fields)
      .merge(encoded_df, left_index=True, right_index=True)
)

In [11]:
# Re-check the dtypes after the merge: the encoded columns should be int64,
# while APPLICATION_TYPE and CLASSIFICATION remain object.
df.dtypes

APPLICATION_TYPE                object
CLASSIFICATION                  object
ASK_AMT                          int64
IS_SUCCESSFUL                    int64
AFFILIATION_CompanySponsored     int64
AFFILIATION_Family/Parent        int64
AFFILIATION_Independent          int64
AFFILIATION_National             int64
AFFILIATION_Other                int64
AFFILIATION_Regional             int64
USE_CASE_CommunityServ           int64
USE_CASE_Heathcare               int64
USE_CASE_Other                   int64
USE_CASE_Preservation            int64
USE_CASE_ProductDev              int64
ORGANIZATION_Association         int64
ORGANIZATION_Co-operative        int64
ORGANIZATION_Corporation         int64
ORGANIZATION_Trust               int64
INCOME_AMT_0                     int64
INCOME_AMT_1-9999                int64
INCOME_AMT_10000-24999           int64
INCOME_AMT_100000-499999         int64
INCOME_AMT_10M-50M               int64
INCOME_AMT_1M-5M                 int64
INCOME_AMT_25000-99999   

## Save Step 2 - One Hot Encoded Feature Data

In [12]:
# Write the one-hot encoded frame to CSV, omitting the row index
step_2_csv = "charity_data.preprocess.2.one_hot_encoded.csv"
df.to_csv(step_2_csv, index=False)