<a href="https://colab.research.google.com/github/alphas7/Santander-Customer-Satisfaction/blob/master/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM so the archived pickle files can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Unpack the pickled datasets from Drive into /content/ (-u: extract only new/updated files, -q: quiet).
!unzip -uq "/content/drive/My Drive/Santander_Customer_Satisfaction/pickle_files.zip" -d "/content/"

In [3]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette='muted',style='whitegrid')
from sklearn.manifold import TSNE
import sklearn
import scipy
import random

In [50]:
# Read the pickled train/test frames, then keep untouched copies
# (train_a / test_a) next to the working frames.
train, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('pickle_files/train_1.pkl', 'pickle_files/test_1.pkl')
)
train_a, test_a = train.copy(), test.copy()

In [51]:
# Import the splitter explicitly: a bare `import sklearn` does not guarantee that the
# `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split

# Work on a copy of the test frame so later feature engineering leaves `test` untouched.
X_test = test.copy()

# Stratified 90/10 train/validation split on TARGET; fixed seed for reproducibility.
X_train, X_val = train_test_split(
    train,
    stratify=train['TARGET'],
    test_size=0.1,
    random_state=100,
)
# Give both parts a fresh 0..n-1 index (rebinding instead of mutating in place).
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_train.shape,X_val.shape,X_test.shape

((68418, 144), (7602, 144), (75818, 143))

# No. of zeros

In [52]:
def add_feature_no_zeros(train=X_train,test=X_test,val=X_val):
    """
    Append a 'no_zeros' column to train, test and val (in place).

    For every row the new column holds the number of selected columns whose value
    equals 0. The selected columns are all columns of `train` except 'TARGET';
    the same selection is applied to `test` and `val`.
    """
    feature_cols = [name for name in train.columns if name!='TARGET']
    for frame in (train,test,val):
        zero_mask = frame.loc[:,feature_cols].eq(0)
        frame['no_zeros'] = zero_mask.sum(axis=1).values

In [53]:
def add_feature_no_zeros_keyword(keyword,train=X_train,test=X_test,val=X_val):
    """
    Append a 'no_zeros_<keyword>' column to train, test and val (in place).

    For every row the new column holds the number of zeros among the columns
    whose name contains `keyword`.

    Parameters
    ----------
    keyword : str
        Substring used to select columns by name; also the suffix of the new column.
    train, test, val : DataFrame
        Frames to modify. The column selection is taken from `train` and applied
        to all three frames.
    """
    # Name the feature after the `keyword` argument. The comprehension variable below
    # is local to the comprehension, so it cannot be used for this.
    feature_name = 'no_zeros_'+keyword
    # Skip TARGET (absent from the test frame) and the feature itself, so re-running
    # the cell does not count the previously generated column.
    col = [k for k in train.columns
           if keyword in k and k!='TARGET' and k!=feature_name]
    for df in [train,test,val]:
        df[feature_name] = (df.loc[:,col]==0).sum(axis=1).values

In [54]:
# Add the row-wise zero count to all three frames, then display their shapes.
add_feature_no_zeros()
tuple(frame.shape for frame in (X_train,X_val,X_test))

((68418, 145), (7602, 145), (75818, 144))

In [56]:
# Column-name substrings; one zero-count feature is added for each of them.
keywords = ['saldo' , 'ind', 'num', 'imp']
for k in keywords:
  add_feature_no_zeros_keyword(k)
# Display the shapes after the additions.
X_train.shape,X_val.shape,X_test.shape

((68418, 149), (7602, 149), (75818, 148))

In [57]:
# Treat as categorical every non-TARGET column of X_train that has more than 2
# and at most 10 distinct values.
cat_col = [
    name
    for name, n_unique in X_train.nunique().items()
    if name!='TARGET' and 2<n_unique<=10
]
print("There are %i columns which have less than or equal to 10 and greater than 2 number of unique values. \nWe will create \
new datasets which use one hot encoding,\
response encoding and also leaving the columns as it is."%(len(cat_col)))

There are 31 columns which have less than or equal to 10 and greater than 2 number of unique values. 
We will create new datasets which use one hot encoding,response encoding and also leaving the columns as it is.


# One Hot Encoding

In [58]:
def one_hot_encoding(col,train=X_train,test=X_test,val=X_val):
  """
  One-hot encode the columns listed in `col` for the train, test and val frames.

  The encoder is fitted on `train` only; categories that appear only in `test` or
  `val` are encoded as all-zero rows (handle_unknown='ignore').

  Parameters
  ----------
  col : list of column names to encode (must exist in all three frames).
  train, test, val : DataFrame
      Input frames; they are not modified.

  Returns
  -------
  (train, test, val) : tuple of new DataFrames in which the `col` columns are
  replaced by the one-hot columns appended at the end. All values come back as
  a single dense numeric array, and each frame gets a fresh 0..n-1 index.
  """
  # Explicit import: a bare `import sklearn` does not guarantee this submodule is loaded.
  from sklearn.preprocessing import OneHotEncoder

  # Sparse output is the encoder's default; the `sparse=` keyword is not passed
  # because newer scikit-learn releases renamed it.
  ohe = OneHotEncoder(handle_unknown='ignore')
  ohe.fit(train.loc[:,col])
  # `get_feature_names` was replaced by `get_feature_names_out`; support both.
  if hasattr(ohe,'get_feature_names_out'):
    feature_names = list(ohe.get_feature_names_out(col))
  else:
    feature_names = list(ohe.get_feature_names(input_features=col))

  def _encode(frame):
    # Encode one frame: untouched columns first, one-hot columns appended.
    # Uses the `col` argument (not a notebook global) and labels the result with
    # the frame's own remaining columns, so a frame without TARGET needs no special case.
    encoded = ohe.transform(frame.loc[:,col])
    remaining = frame.drop(col,axis=1)
    columns = list(remaining.columns)+feature_names
    dense = scipy.sparse.hstack([remaining.values,encoded]).toarray()
    return pd.DataFrame(dense,columns=columns)

  return _encode(train),_encode(test),_encode(val)

In [59]:
X_train_ohe,X_test_ohe,X_val_ohe = one_hot_encoding(cat_col)
X_train_ohe.shape,X_val_ohe.shape,X_test_ohe.shape

((68418, 291), (7602, 291), (75818, 290))

# Response Encoding

In [60]:
def response_encoding_return(df,column,target,alpha=5000):
    """
    Compute the smoothed response encoding of one categorical column.

    For every distinct value v of df[column] the score is

        round((n_pos(v) + alpha) / (n(v) + alpha * k), 2)

    where n(v) is the number of rows with that value, n_pos(v) the number of
    those rows whose `target` equals 1, and k the number of distinct values in
    the column. This is equivalent to repeating every category `alpha` times.

    Parameters
    ----------
    df : DataFrame containing `column` and `target`; it is not modified.
    column : name of the categorical column.
    target : name of the target column (rows with value 1 count as positives).
    alpha : smoothing strength.

    Returns
    -------
    dict mapping each distinct value of df[column] to its score.
    """
    unique_values = set(df[column].values)
    n_categories = len(unique_values)
    # Build the target mask on its own: `&` binds tighter than `==`, so writing
    # `mask & df[target]==1` would compare (mask & target) with 1 instead.
    is_positive = (df[target]==1)
    dict_values = {}
    for value in unique_values:
        in_category = (df[column]==value)
        total = int(in_category.sum())
        sum_promoted = int((in_category & is_positive).sum())
        dict_values[value] = np.round((sum_promoted+alpha)/(total+alpha*n_categories),2)
    return dict_values

In [61]:
def response_encoding(df,cv_df,test_df,column,target='TARGET',alpha=5000):
    """
    Response-encode one categorical column in the train, val and test frames (in place).

    The encoding is learned on `df` only. For every distinct value v of df[column]
    the score is

        round((n_pos(v) + alpha) / (n(v) + alpha * k), 2)

    where n(v) is the number of rows with that value, n_pos(v) the number of
    those rows whose `target` equals 1, and k the number of distinct values in
    the column (equivalent to repeating every category `alpha` times).
    Categories of `cv_df` / `test_df` that do not occur in `df` get a score of 0.5.

    Parameters
    ----------
    df : training DataFrame; must contain `column` and `target`.
    cv_df, test_df : DataFrames containing `column`; `target` is not required.
    column : name of the categorical column to replace by its scores.
    target : name of the target column in `df`.
    alpha : smoothing strength.

    Returns
    -------
    None. `column` is overwritten in all three frames.
    """
    unknown_score = 0.5 #score for categories that are not seen in train
    unique_values = set(df[column].values) #all unique values in that categorical column
    n_categories = len(unique_values)
    # Build the target mask on its own: `&` binds tighter than `==`, so writing
    # `mask & df[target]==1` would compare (mask & target) with 1 instead.
    is_positive = (df[target]==1)
    dict_values = {}
    for value in unique_values:
        in_category = (df[column]==value)
        total = int(in_category.sum()) #no. of datapoints with 'value' category
        sum_promoted = int((in_category & is_positive).sum()) #of those, datapoints with target==1
        dict_values[value] = np.round((sum_promoted+alpha)/(total+alpha*n_categories),2)

    df[column] = df[column].map(dict_values).values
    for frame in (cv_df,test_df):
        # Flag the seen categories once per frame, with no sentinel string that
        # could collide with a real category value.
        seen = frame[column].isin(unique_values)
        frame[column] = frame[column].map(dict_values).where(seen,unknown_score).values

In [62]:
# finding the best alpha
random.seed(100)
ran_in = random.randint(0,10)
col = [col for col in cat_col if X_train[col].nunique()>3][ran_in]
col
print('Feature: "%s"'%(col))
for alpha in [100,500,1000,2500,5000,10000]:
  print('for alpha %i:%s'%(alpha,response_encoding_return(X_train,col,"TARGET",alpha=alpha)))

Feature: "num_var5_0"
for alpha 100:{0: 0.09, 3: 0.04, 6: 0.13, 9: 0.2, 15: 0.2}
for alpha 500:{0: 0.13, 3: 0.04, 6: 0.18, 9: 0.2, 15: 0.2}
for alpha 1000:{0: 0.15, 3: 0.05, 6: 0.19, 9: 0.2, 15: 0.2}
for alpha 2500:{0: 0.18, 3: 0.06, 6: 0.2, 9: 0.2, 15: 0.2}
for alpha 5000:{0: 0.19, 3: 0.08, 6: 0.2, 9: 0.2, 15: 0.2}
for alpha 10000:{0: 0.19, 3: 0.11, 6: 0.2, 9: 0.2, 15: 0.2}


In [63]:
# finding the best alpha: same comparison for a second column, drawn with another seed
random.seed(1000)
ran_in = random.randint(0,10)
multi_level_cols = [name for name in cat_col if X_train[name].nunique()>3]
col = multi_level_cols[ran_in]
print('Feature: "%s"'%(col))
for alpha in [100,500,1000,2500,5000,10000]:
  encoding = response_encoding_return(X_train,col,"TARGET",alpha=alpha)
  print('for alpha %i:%s'%(alpha,encoding))

Feature: "num_var13_largo_0"
for alpha 100:{0: 0.04, 3: 0.08, 6: 0.13, 9: 0.14, 12: 0.14, 15: 0.14, 18: 0.14}
for alpha 500:{0: 0.04, 3: 0.12, 6: 0.14, 9: 0.14, 12: 0.14, 15: 0.14, 18: 0.14}
for alpha 1000:{0: 0.05, 3: 0.13, 6: 0.14, 9: 0.14, 12: 0.14, 15: 0.14, 18: 0.14}
for alpha 2500:{0: 0.06, 3: 0.14, 6: 0.14, 9: 0.14, 12: 0.14, 15: 0.14, 18: 0.14}
for alpha 5000:{0: 0.07, 3: 0.14, 6: 0.14, 9: 0.14, 12: 0.14, 15: 0.14, 18: 0.14}
for alpha 10000:{0: 0.09, 3: 0.14, 6: 0.14, 9: 0.14, 12: 0.14, 15: 0.14, 18: 0.14}


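Comparing the encodings of the two features above, alpha=100 is chosen: it keeps the encodings of the frequent categories clearly distinct, whereas larger values of alpha pull every category towards the same value (0.2 and 0.14 above, i.e. 1 divided by the number of categories in the feature).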

In [64]:
#response encoding features
alpha=100
X_train_re = X_train.copy()
X_val_re = X_val.copy()
X_test_re = X_test.copy()
for col in cat_col:
  response_encoding(X_train_re,X_val_re,X_test_re,col,alpha=alpha)

X_train_re.shape,X_val_re.shape,X_test_re.shape

((68418, 149), (7602, 149), (75818, 148))