# Introduction

This notebook reports the Pearson correlation between each candidate variable and the target outcome (`bad`). Because these correlations capture only the linear association of one variable at a time, they cannot tell us exactly which variables will perform best inside a full production model, but they are a good starting point for feature selection and dimensionality reduction.

In [9]:
import numpy as np
import pandas as pd
import scipy.stats

from utils import bad_rate_by_category

# Raise pandas' display limits: show up to 200 rows and 50 columns before truncating
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50

## Data

In [10]:
# Load the dataset and its accompanying data dictionary from pickle files.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
data_path = 'output_data/01_data.pkl'
data_dict_path = 'output_data/01_data_dict.pkl'

data = pd.read_pickle(data_path)
data_dict = pd.read_pickle(data_dict_path)

## Potential Features

In [11]:
# Designate eda categories as containing features
# (a list, despite the name; the name is kept in case other cells reference it)
prelim_feature_dict = ['credit_score', 'personal_finance', 'other_info']

# Flag every data-dictionary row whose eda_category is one of those categories.
# Series.isin does the membership test in one vectorized call instead of
# running a Python lambda once per row via DataFrame.apply(axis=1).
data_dict['potential_feature'] = data_dict['eda_category'].isin(prelim_feature_dict)

## Pearson Correlations

In [12]:
def correlate_pearson(var1, var2, data):
    """Compute the Pearson correlation between two columns of a DataFrame.

    Rows in which either column is missing are excluded before the
    correlation is computed.

    Parameters
    ----------
    var1, var2 : str
        Names of the columns in ``data`` to correlate.
    data : pandas.DataFrame
        Frame containing both columns.

    Returns
    -------
    pearson : float
        Pearson correlation coefficient, or NaN when fewer than two
        complete observations are available.
    pearson_p : float
        p-value reported by ``scipy.stats.pearsonr``, or NaN in the same case.
    """
    # Keep only rows where both variables are observed
    data_notna = data.dropna(axis=0, subset=[var1, var2])

    # pearsonr rejects inputs with fewer than two observations; report the
    # correlation as undefined instead, so that one sparse column cannot
    # abort a loop over many variables
    if len(data_notna) < 2:
        return np.nan, np.nan

    # Run Pearson correlation
    pearson, pearson_p = scipy.stats.pearsonr(data_notna[var1], data_notna[var2])

    return pearson, pearson_p

In [13]:
# Collect the names of all variables flagged as potential model features
is_candidate = data_dict['potential_feature'].eq(True)
potential_features = data_dict.loc[is_candidate, 'variable'].values

# Correlate every candidate with the 'bad' target and record the coefficient
# and its p-value on that variable's row of the data dictionary
for var in potential_features:
    row_mask = data_dict['variable'] == var
    pearson, pearson_p = correlate_pearson(var, 'bad', data)
    data_dict.loc[row_mask, 'pearson'] = pearson
    data_dict.loc[row_mask, 'pearson_p'] = pearson_p

In [15]:
# Show the data dictionary ordered by ascending p-value (smallest p-value first)
ranked_by_p = data_dict.sort_values(by='pearson_p')
ranked_by_p.reset_index(drop=True)

Unnamed: 0,variable,var_dtype,eda_category,categorical,coverage,potential_feature,pearson,pearson_p
0,raw_FICO_retail,int64,credit_score,0,1.0,True,-0.180558,4e-06
1,payment_frequency_bi_weekly,uint8,personal_finance,-1,1.0,True,0.177818,5e-06
2,payment_frequency_per_month,int64,personal_finance,0,1.0,True,0.160181,4.1e-05
3,raw_FICO_telecom,int64,credit_score,0,1.0,True,-0.156673,6e-05
4,raw_FICO_bank_card,int64,credit_score,0,1.0,True,-0.152722,9.3e-05
5,raw_FICO_money,int64,credit_score,0,1.0,True,-0.152437,9.6e-05
6,payment_frequency_monthly,uint8,personal_finance,-1,1.0,True,-0.136097,0.000503
7,payment_frequency_semi_monthly,uint8,personal_finance,-1,1.0,True,-0.129028,0.000977
8,payment_frequency_weekly,uint8,personal_finance,-1,1.0,True,0.103345,0.00837
9,bank_account_duration_months,float64,other_info,0,0.998462,True,-0.102491,0.008926


## Export Data

In [8]:
# Write the data and the annotated data dictionary out as pickle files
export_targets = {
    'output_data/02_data.pkl': data,
    'output_data/02_data_dict.pkl': data_dict,
}
for pickle_path, frame in export_targets.items():
    frame.to_pickle(pickle_path)