# Subsetting Data

In this notebook, we subset the full feature matrices to the features identified in the `Sampling` notebook. The `_sample.csv` files are the sampled versions (10% of the observations) with feature selection already applied. The `fm` dataframes hold the full data. After being subset to the selected columns, the full data is saved as `_selected.csv` (the Featuretools feature matrix is the exception: it is saved as `feature_matrix_select.csv`).

In [5]:
import pandas as pd
import numpy as np

# Manual Features

In [None]:
# Load the 10% sample; its columns are the features kept by feature selection
sample = pd.read_csv('../input/features_manual_sample.csv')

# Read the full manual feature matrix and one-hot encode it in one step
fm = pd.get_dummies(pd.read_csv('../input/features_manual.csv'))
fm.shape

In [None]:
# Keep only the columns present in the sample, in the sample's column order
fm = fm[sample.columns.tolist()]
fm.shape

In [None]:
# Write the subsetted full data to disk without the row index
fm.to_csv(path_or_buf='../input/features_manual_selected.csv', index=False)

# Featuretools Features

In [2]:
# Read in the sample (feature selection already applied) and the full feature matrix
sample = pd.read_csv('../input/feature_matrix_sample.csv')
fm = pd.read_csv('../input/feature_matrix.csv')

# Cast the loan-rate aggregates to float32.
# NOTE(review): presumably these are read in as a non-numeric dtype and would
# otherwise be picked up by the one-hot encoding below -- confirm.
fm = fm.astype({
    col: np.float32
    for col in [
        'SUM(bureau.PREVIOUS_OTHER_LOAN_RATE)',
        'SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Closed)',
        'SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Active)',
        'SUM(bureau_balance.bureau.PREVIOUS_OTHER_LOAN_RATE)',
    ]
})

print(fm.shape)

# One-hot encode the object (categorical) columns.
# dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default, which
# would reintroduce bool columns after the bool -> uint8 conversion below.
cat = pd.get_dummies(fm.select_dtypes('object'), dtype=np.uint8)

# Convert boolean columns to uint8 so they survive the numeric-only selection below
fm = fm.astype({col: np.uint8 for col in fm.select_dtypes('bool').columns})

# Keep the numeric columns and append the one-hot encoded columns
fm = fm.select_dtypes(['number'])
fm = pd.concat([fm, cat], axis=1)
fm.shape

In [7]:
# Keep only the columns present in the sample, in the sample's column order
fm = fm[sample.columns.tolist()]
fm.shape

(356255, 1289)

In [9]:
# Write the subsetted feature matrix to disk without the row index.
# NOTE(review): this file ends in `_select.csv`, whereas the other two outputs in
# this notebook end in `_selected.csv` -- confirm which name downstream code reads.
fm.to_csv(path_or_buf='../input/feature_matrix_select.csv', index=False)

# Semi-Automated Features

In [14]:
# Load the 10% sample; its columns are the features kept by feature selection
sample = pd.read_csv('../input/features_semi_sample.csv')

# Read the full semi-automated feature matrix and one-hot encode it in one step
fm = pd.get_dummies(pd.read_csv('../input/features_semi.csv'))
fm.shape

(356255, 1447)

In [15]:
# Keep only the columns present in the sample, in the sample's column order
fm = fm[sample.columns.tolist()]
fm.shape

(356255, 880)

In [16]:
# Write the subsetted full data to disk without the row index
fm.to_csv(path_or_buf='../input/features_semi_selected.csv', index=False)