# Sampling Data

The purpose of this notebook is to extract 10% of the data for feature selection and random search. This is necessary because the large size of the feature matrix makes many operations infeasible. 

## Roadmap

1. Sample 10% of the data
2. Convert numeric columns to `np.float32`
3. Convert boolean columns to `np.int8`
4. One-hot encode features
5. Remove columns with duplicated values
6. Remove columns with more than 90% missing values
7. Remove columns with a single unique value
8. Remove one of every pair of columns with abs(correlation) > 0.95

Then we will review the number of columns and apply the same operations to the full dataset when possible.

In [1]:
import pandas as pd
import numpy as np

In [None]:
# feature_matrix = pd.read_csv('../input/feature_matrix.csv', low_memory=False)

## Sampling 10% of the original data
# train = feature_matrix[feature_matrix['TARGET'].notnull()].sample(frac = 0.1, random_state = 50)

# import gc
# gc.enable()
# del feature_matrix
# gc.collect()

### Correct column types

In [20]:
for col in ['SUM(bureau.PREVIOUS_OTHER_LOAN_RATE)', 'SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Closed)',
            'SUM(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Active)', 'SUM(bureau_balance.bureau.PREVIOUS_OTHER_LOAN_RATE)']:
    train[col] = train[col].astype(np.float32)
    
for col in train:
    if train[col].dtype == 'bool':
        train[col] = train[col].astype(np.uint8)

In [21]:
train = pd.get_dummies(train)
n_features_start = train.shape[1] - 2
train.shape

(30751, 3078)

### Columns with duplicated values

In [56]:
x, idx, inv, counts = np.unique(train, axis = 1, return_index = True, return_inverse=True, return_counts=True)
train = train.iloc[:, idx]
n_non_unique_columns = n_features_start - train.shape[1] - 2
train.shape

(30751, 2797)

### Missing Values

In [57]:
missing_threshold = 90

# Find missing and percentage
missing = pd.DataFrame(train.isnull().sum())
missing['percent'] = 100 * (missing[0] / train.shape[0])
missing.sort_values('percent', ascending = False, inplace = True)

# Missing above threshold
missing_cols = list(missing[missing['percent'] > missing_threshold].index)
n_missing_cols = len(missing_cols)

train = train[[x for x in train if x not in missing_cols]]
train.shape

(30751, 2584)

### Zero variance columns

In [58]:
unique_counts = pd.DataFrame(train.nunique()).sort_values(0, ascending = True)
zero_variance_cols = list(unique_counts[unique_counts[0] == 1].index)
n_zero_variance_cols = len(zero_variance_cols)

train = train[[x for x in train if x not in zero_variance_cols]]
train.shape

(30751, 2429)

# Remove columns containing derivations of target

In [59]:
for col in train:
    if 'TARGET' in col:
        print(col)
        
train.drop(columns = 'PERCENTILE(TARGET)', inplace = True)

TARGET
PERCENTILE(TARGET)


### Find Correlations and Remove any above threshold

In [60]:
correlation_threshold = 0.95

corr_matrix = train.corr()

# Extract the upper triangle of the correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))

# Select the features with correlations above the threshold
# Need to use the absolute value
to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

In [61]:
train = train[[x for x in train if x not in to_drop]]
n_collinear = len(to_drop)
train.shape

(30751, 1289)

In [62]:
n_non_unique_columns

277

In [63]:
n_missing_cols

213

In [64]:
n_zero_variance_cols

155

In [65]:
n_collinear

1139

In [66]:
total_removed = n_non_unique_columns + n_missing_cols + n_zero_variance_cols + n_collinear + 1
print('Total columns removed: ', total_removed)

Total columns removed:  1785


In [70]:
train.head()

Unnamed: 0,DAYS_BIRTH,SUM(installments.DAYS_ENTRY_PAYMENT),MIN(previous.SUM(installments.DAYS_ENTRY_PAYMENT)),MEAN(previous.SUM(installments.DAYS_ENTRY_PAYMENT)),MAX(previous.SUM(installments.DAYS_ENTRY_PAYMENT)),DAYS_EMPLOYED,SUM(bureau.DAYS_CREDIT),DAYS_REGISTRATION,SUM(previous.DAYS_DECISION),SUM(previous.DAYS_DECISION WHERE NAME_CONTRACT_STATUS = Approved),...,MEAN(credit.AMT_PAYMENT_TOTAL_CURRENT WHERE NAME_CONTRACT_STATUS = Active),MEAN(credit.AMT_PAYMENT_CURRENT WHERE NAME_CONTRACT_STATUS = Active),MIN(previous.SUM(credit.AMT_DRAWINGS_POS_CURRENT)),SUM(credit.AMT_INST_MIN_REGULARITY WHERE NAME_CONTRACT_STATUS = Active),SUM(credit.AMT_DRAWINGS_ATM_CURRENT WHERE NAME_CONTRACT_STATUS = Active),SUM(credit.AMT_DRAWINGS_CURRENT WHERE NAME_CONTRACT_STATUS = Active),SUM(credit.AMT_PAYMENT_TOTAL_CURRENT WHERE NAME_CONTRACT_STATUS = Active),SUM(credit.AMT_CREDIT_LIMIT_ACTUAL WHERE NAME_CONTRACT_STATUS = Active),MEAN(bureau.PREVIOUS_OTHER_LOAN_RATE WHERE CREDIT_ACTIVE = Closed),MEAN(bureau.PERCENTILE(PREVIOUS_OTHER_LOAN_RATE) WHERE CREDIT_ACTIVE = Closed)
77158,-14017.0,-9252.0,-9252.0,-9252.0,-9252.0,-3747.0,-2907.0,-2384.0,-2375.0,-2375.0,...,,,,,,,,,,
306191,-16520.0,-171424.0,-39707.0,-17142.4,-638.0,-4275.0,-29411.0,-3198.0,-17739.0,-15354.0,...,,,,,,,,,,
64916,-20741.0,-20122.0,-13200.0,-10061.0,-6922.0,,-4398.0,-1882.0,-2900.0,-2900.0,...,,,,,,,,,,
81133,-9685.0,,,,,-318.0,-775.0,-378.0,,,...,,,,,,,,,,
231607,-20891.0,-71851.0,-22759.0,-8981.375,-1782.0,-413.0,-1294.0,-3154.0,-9801.0,-9801.0,...,,,,,,,,,,


In [75]:
train.to_csv('../input/feature_matrix_sample.csv', index = False)

In [67]:
def feature_selection(feature_matrix, missing_threshold=90, correlation_threshold=0.95):
    """Feature selection for a dataframe."""
    
    feature_matrix = pd.get_dummies(feature_matrix)
    n_features_start = feature_matrix.shape[1] - 2
    print('Original shape: ', feature_matrix.shape)

    _, idx = np.unique(feature_matrix, axis = 1, return_index = True)
    feature_matrix = feature_matrix.iloc[:, idx]
    n_non_unique_columns = n_features_start - feature_matrix.shape[1] - 2
    print('Shape after removing non-unique valued columns: ', feature_matrix.shape)

    # Find missing and percentage
    missing = pd.DataFrame(feature_matrix.isnull().sum())
    missing['percent'] = 100 * (missing[0] / feature_matrix.shape[0])
    missing.sort_values('percent', ascending = False, inplace = True)

    # Missing above threshold
    missing_cols = list(missing[missing['percent'] > missing_threshold].index)
    n_missing_cols = len(missing_cols)

    # Remove missing columns
    feature_matrix = feature_matrix[[x for x in feature_matrix if x not in missing_cols]]
    print('Shape after removing missing: {} with threshold: {}.'.format(feature_matrix.shape,
                                                                        missing_threshold))
    
    # Zero variance
    unique_counts = pd.DataFrame(feature_matrix.nunique()).sort_values(0, ascending = True)
    zero_variance_cols = list(unique_counts[unique_counts[0] == 1].index)
    n_zero_variance_cols = len(zero_variance_cols)

    # Remove zero variance columns
    feature_matrix = feature_matrix[[x for x in feature_matrix if x not in zero_variance_cols]]
    print('Shape after removing zero variance: ', feature_matrix.shape)
    
    # Correlations
    corr_matrix = feature_matrix.corr()

    # Extract the upper triangle of the correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))

    # Select the features with correlations above the threshold
    # Need to use the absolute value
    to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]

    n_collinear = len(to_drop)
    
    feature_matrix = feature_matrix[[x for x in feature_matrix if x not in to_drop]]
    print('Shape after removing collinear: {} with threshold: {}.'.format(feature_matrix.shape,
                                                                          correlation_threshold))
    
    total_removed = n_non_unique_columns + n_missing_cols + n_zero_variance_cols + n_collinear
    
    print('Total columns removed: ', total_removed)
    return feature_matrix

## Repeat Process with Manual Features

In [69]:
manual_features = pd.read_csv('../input/features_manual.csv')
manual_features = manual_features[manual_features['TARGET'].notnull()].sample(frac = 0.1, random_state = 10)

manual_features = feature_selection(manual_features, 90, 0.95)

Original shape:  (30751, 273)
Shape after removing non-unique valued columns:  (30751, 272)
Shape after removing missing: (30751, 270) with threshold: 90.
Shape after removing zero variance:  (30751, 269)
Shape after removing collinear: (30751, 230) with threshold: 0.95.
Total columns removed:  39


In [71]:
manual_features.head()

Unnamed: 0,DAYS_BIRTH,DAYS_ID_PUBLISH,DAYS_EMPLOYED,DAYS_LAST_PHONE_CHANGE,DAYS_REGISTRATION,PREVIOUS_OTHER_LOAN_LENGTH_SUM,CASH_AVERAGE_LOAN_LENGTH,PREVIOUS_OTHER_LOAN_LENGTH_MEAN,FLAG_DOCUMENT_2,FLAG_DOCUMENT_10,...,APARTMENTS_AVG,FLOORSMAX_MODE,TOTALAREA_MODE,FLOORSMIN_AVG,YEARS_BUILD_AVG,YEARS_BEGINEXPLUATATION_AVG,OWN_CAR_AGE,CREDIT_CARD_AVERAGE_LOAN_LENGTH,CREDIT_CARD_AMT_BALANCE_MEAN_MEAN,EXT_SOURCE_1
179491,-14921.0,-4632.0,-3844.0,-2000.0,-602.0,-136.0,-87.473684,-34.0,0.0,0.0,...,,,,,,,,,,
261005,-15458.0,-3761.0,-694.0,-640.0,-9544.0,,,,0.0,0.0,...,0.3309,0.3333,0.3619,0.375,0.7756,0.9836,1.0,,,
165031,-15603.0,-5051.0,-334.0,-2200.0,-3873.0,,-59.765957,,0.0,0.0,...,0.3691,0.3333,0.2899,0.375,0.694,0.9776,2.0,-96.0,171718.959789,
288742,-16392.0,-4407.0,-3015.0,-2152.0,-8902.0,-155.0,-49.116667,-31.0,0.0,0.0,...,0.2474,0.4583,0.3115,0.0417,0.7212,0.9801,,,,
162776,-10941.0,-3215.0,-4026.0,-425.0,-910.0,-113.0,-14.0,-28.25,0.0,0.0,...,0.0696,0.0417,0.0892,,,0.9752,3.0,,,0.279198


In [73]:
manual_features.to_csv('../input/features_manual_sample.csv', index = False)

## Repeat Process with Semi-Automated Features

In [77]:
semi_features = pd.read_csv('../input/features_semi.csv')
semi_features = semi_features[semi_features['TARGET'].notnull()].sample(frac = 0.1, random_state = 50)

semi_features = feature_selection(semi_features, 90, 0.95)

Original shape:  (30751, 1447)
Shape after removing non-unique valued columns:  (30751, 1447)
Shape after removing missing: (30751, 1439) with threshold: 90.
Shape after removing zero variance:  (30751, 1396)
Shape after removing collinear: (30751, 880) with threshold: 0.95.
Total columns removed:  563


In [78]:
semi_features.head()

Unnamed: 0,DAYS_BIRTH,IN_CLIENT_IN_LOAN_DAYS_ENTRY_PAYMENT_sum_sum,IN_CLIENT_IN_LOAN_DAYS_ENTRY_PAYMENT_sum_min,IN_CLIENT_IN_LOAN_DAYS_ENTRY_PAYMENT_sum_mean,IN_CLIENT_IN_LOAN_DAYS_ENTRY_PAYMENT_sum_max,DAYS_EMPLOYED,BUREAU_DAYS_CREDIT_sum,PREVIOUS_LOAN_DIFFERENCE_MEAN,DAYS_REGISTRATION,PREVIOUS_DAYS_DECISION_sum,...,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_ATM_CURRENT_max_sum,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_CURRENT_max_min,CC_CLIENT_CC_LOAN_AMT_CREDIT_LIMIT_ACTUAL_mean_min,CC_CLIENT_CC_LOAN_AMT_PAYMENT_TOTAL_CURRENT_max_min,CC_CLIENT_CC_LOAN_AMT_CREDIT_LIMIT_ACTUAL_max_min,CC_CLIENT_CC_LOAN_AMT_INST_MIN_REGULARITY_sum_min,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_ATM_CURRENT_sum_min,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_CURRENT_sum_min,CC_CLIENT_CC_LOAN_AMT_PAYMENT_TOTAL_CURRENT_sum_min,CC_CLIENT_CC_LOAN_AMT_CREDIT_LIMIT_ACTUAL_sum_min
77158,-14017.0,-9252.0,-9252.0,-9252.0,-9252.0,-3747.0,-2907.0,-2475.0,-2384.0,-2375.0,...,,,,,,,,,,
306191,-16520.0,-171424.0,-39707.0,-17142.4,-638.0,-4275.0,-29411.0,3907.5,-3198.0,-17739.0,...,,,,,,,,,,
64916,-20741.0,-20122.0,-13200.0,-10061.0,-6922.0,,-4398.0,-7821.0,-1882.0,-2900.0,...,,,,,,,,,,
81133,-9685.0,,,,,-318.0,-775.0,,-378.0,,...,,,,,,,,,,
231607,-20891.0,-71851.0,-22759.0,-8981.375,-1782.0,-413.0,-1294.0,20321.4375,-3154.0,-9801.0,...,,,,,,,,,,


In [79]:
semi_features.to_csv('../input/features_semi_sample.csv', index = False)