# Featurization - `Cabin`

In [1]:
import pandas as pd

In [2]:
# Read the raw training set, then split off the `Survived` column (dfy)
# from the remaining feature columns (dfX, which also drops the passenger id).
df = pd.read_csv("../../data/raw/train.csv")
dfy = df['Survived']
dfX = df.drop(columns=['Survived', 'PassengerId'])

In [3]:
# Boolean mask: True where the Cabin value is missing.
df['Cabin'].isna()

0       True
1      False
2       True
3      False
4       True
       ...  
886     True
887    False
888     True
889    False
890     True
Name: Cabin, Length: 891, dtype: bool

In [4]:
def extract_first_letter(x):
    '''
    Reduce every cabin string to its leading character.

    Entries with no cabin (NaN) are labelled 'NC' (No cabin).
    The result is returned as a single-column DataFrame.
    '''
    initials = x.str.get(0)
    labelled = initials.fillna('NC')
    return pd.DataFrame(labelled)

In [5]:
# Show the feature rows whose Cabin value is exactly 'T'.
dfX.loc[df['Cabin'].eq('T')]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
339,1,"Blackwell, Mr. Stephen Weart",male,45.0,0,0,113784,35.5,T,S


We need a function that merges low-count categories so that every resulting group has a count of at least `thresh`.

These grouping steps can be used to identify the categorical groups while building the pipeline, 
for example as a helper pipeline that prepares the preprocessing pipeline.

In [13]:
def get_low_count_groups(x, thresh, return_new_grps=False, setval=None, sort=False):
    '''
    Split the categories of `x` into rare and retained groups.

    Categories are ordered from least to most frequent and their counts are
    accumulated. The "boundary" category is the first one at which the
    running total reaches `thresh`.

    Parameters:
    -----------
    x: pd.Series
        A pandas series with discrete entries
    thresh: int
        Running total at which the boundary category is found
    return_new_grps: bool, default: False
        If False, return the categories that come strictly before the
        boundary category.
        If True, return the categories that come after the boundary
        category, followed by the label of the merged group.
    setval: object, default: None
        Label of the merged group (only used when `return_new_grps` is True).
        None falls back to 'other'.
    sort: bool, default: False
        Sort the returned categories; the merged label always stays last.

    Returns:
    --------
    list
    '''
    counts = x.value_counts().sort_values(ascending=True)
    gt_thresh = counts.cumsum() >= thresh
    # Position of the boundary category; equals len(counts) when the total
    # never reaches `thresh`.
    first_occur = gt_thresh.searchsorted(True)

    if not return_new_grps:
        # NOTE(review): the boundary category is NOT part of this list, while
        # the other mode treats it as merged. Confirm which is intended.
        res = counts.index[:first_occur].tolist()
        if sort:
            res.sort()
        return res

    merged_label = 'other' if setval is None else setval
    res = counts.index[(first_occur + 1):].tolist()
    if sort:
        res.sort()

    # The merged label is part of the new groups whether or not they are sorted.
    return res + [merged_label]
    

In [22]:
# First letter of each cabin ('NC' when missing), then how often each occurs.
cabin_type = extract_first_letter(df['Cabin'])['Cabin']
cabin_type.value_counts()

NC    687
C      59
B      47
D      33
E      32
A      15
F      13
G       4
T       1
Name: Cabin, dtype: int64

In [23]:
# Categories that come before the running total of counts reaches 10.
get_low_count_groups(cabin_type, thresh=10)

['T', 'G']

In [24]:
# Same threshold, but ask for the retained categories plus the merged label.
get_low_count_groups(
    cabin_type,
    10,
    return_new_grps=True,
    setval='other',
    sort=True,
)

['A', 'B', 'C', 'D', 'E', 'NC', 'other']

In [19]:
from src.utils import group_low_count_cat

In [27]:
# Category labels left after merging the low-count cabin types into 'other'.
grouped_counts = group_low_count_cat(cabin_type, 10, 'other')
sorted(grouped_counts.index.tolist())

['A', 'B', 'C', 'D', 'E', 'NC', 'other']

Now we can either rewrite the original `extract_first_letter` to apply this grouping, or build the `FE_Cabin` transformer by chaining the steps in a `Pipeline`.

In [17]:
def extract_first_letter_Cabin(x, grp_cat=False):
    '''
    Reduce every cabin string to its leading character, labelling missing
    cabins as 'NC' (No cabin), and return the result as a one-column DataFrame.

    Parameters:
    -----------
    grp_cat: bool, default: False
        When True, every letter outside ['A', 'B', 'C', 'D', 'E', 'NC'] is
        replaced by 'other', so the output only contains the categories
        ['A', 'B', 'C', 'D', 'E', 'NC', 'other']

    '''
    kept_letters = ['A', 'B', 'C', 'D', 'E', 'NC']

    # `.str.get(0)` builds a new Series, so the caller's data is never modified.
    deck = x.str.get(0).fillna('NC')

    if not grp_cat:
        return pd.DataFrame(deck)

    is_rare = ~deck.isin(kept_letters)
    deck.loc[is_rare] = 'other'
    return pd.DataFrame(deck)
  

In [73]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [82]:
class CabinTypeExtractor(BaseEstimator, TransformerMixin):
    '''
    One-hot encode the first letter of a cabin column.

    Missing cabins are labelled 'NC' (No cabin) before encoding.

    Parameters:
    -----------
    grp_cat: bool, default: True
        If True, every letter outside ['A', 'B', 'C', 'D', 'E', 'NC'] is
        replaced by 'other' and the 'other' column is dropped from the
        encoding. If False, the categories are
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'NC'] and the 'T' column
        is dropped.
    sparse: bool, default: False
        Whether the underlying OneHotEncoder returns a sparse matrix.

    The attributes `cat`, `ohe` (and `exclude_cat` when `grp_cat` is True)
    are created by `fit`.
    '''

    def __init__(self, grp_cat=True, sparse=False):
        # Only store the parameters here. The encoder is built in `fit`, so
        # a later `set_params(grp_cat=...)` is honoured instead of leaving a
        # stale encoder behind.
        self.grp_cat = grp_cat
        self.sparse = sparse

    def _build_encoder(self):
        '''Create the OneHotEncoder that matches the current parameters.'''
        if self.grp_cat:
            # Despite its name, `exclude_cat` lists the letters that are
            # kept as they are; everything else becomes 'other'.
            self.exclude_cat = ['A', 'B', 'C', 'D', 'E', 'NC']
            self.cat = self.exclude_cat + ['other']
            dropped = 'other'
        else:
            self.cat = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'NC']
            dropped = 'T'

        # `categories` must be passed by keyword on scikit-learn releases
        # that made the constructor arguments keyword-only.
        options = {'categories': [self.cat], 'drop': [dropped]}
        try:
            # Newer scikit-learn releases name this option `sparse_output`.
            return OneHotEncoder(sparse_output=self.sparse, **options)
        except TypeError:
            # Older releases only know the original `sparse` keyword.
            return OneHotEncoder(sparse=self.sparse, **options)

    def fit(self, X, y=None):
        '''
        Build and fit the encoder. The categories are fixed, so only the
        name of `X` is used; its values are ignored.
        '''
        name = getattr(X, 'name', None)
        if name is None:
            # Inputs without a usable name fall back to the default column name.
            name = 'Cabin'

        # The categories are given explicitly, so a one-row dummy frame is
        # enough to put the encoder into a fitted state.
        dummy_df = pd.DataFrame({name: ['A']})

        self.ohe = self._build_encoder()
        self.ohe.fit(dummy_df)

        return self

    def transform(self, X):
        '''One-hot encode the first letter of every entry of the Series `X`.'''
        # `.str[0]` returns a new Series, so the assignment below does not
        # modify the caller's data.
        X = X.str[0].fillna('NC')
        if self.grp_cat:
            X.loc[~X.isin(self.exclude_cat)] = 'other'

        return self.ohe.transform(pd.DataFrame(X))

    def get_feature_names(self, input_features=None):
        '''Return the names of the encoded columns, e.g. 'Cabin_A'.'''
        # Prefer the newer encoder method when it exists; older releases
        # only provide `get_feature_names`.
        if hasattr(self.ohe, 'get_feature_names_out'):
            return self.ohe.get_feature_names_out(input_features)
        return self.ohe.get_feature_names(input_features)

    def get_feature_names_out(self, input_features=None):
        '''Same as `get_feature_names`, under the newer scikit-learn method name.'''
        return self.get_feature_names(input_features)



In [83]:
# Encode the Cabin column with the rare letters grouped together.
CabinTypeExtractor(grp_cat=True).fit_transform(df['Cabin'])

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [84]:
# Names of the encoded columns produced by the fitted transformer.
CabinTypeExtractor().fit(df['Cabin']).get_feature_names(['Cabin'])

array(['Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_NC'],
      dtype=object)

In [85]:
# Apply the extractor to the 'Cabin' column through a ColumnTransformer.
cabin_ct = make_column_transformer((CabinTypeExtractor(), 'Cabin'))
cabin_ct.fit_transform(df)

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [18]:
# First cabin letter per passenger, with the rare letters grouped as 'other'.
extract_first_letter_Cabin(df['Cabin'], grp_cat=True)

Unnamed: 0,Cabin
0,NC
1,C
2,NC
3,C
4,NC
...,...
886,NC
887,B
888,NC
889,C


In [None]:
def group_low_count_cat(x, thresh, setval):

    '''Merge the low-count categories of `x` into one category named
    {setval} and return the resulting category counts.

    A category is low-count when it occurs at most {thresh} times. If the
    low-count categories together occur fewer than {thresh} times, the next
    least frequent category is merged as well, so that the merged group
    reaches {thresh} whenever the data allows it. When no category is
    low-count, nothing is renamed.

    Parameters
    ----------

    x: pd.Series
        A pandas series with discrete entries

    thresh: int
        Largest count for which a category is still treated as low-count

    setval: object
        New value to set for the merged categories

    Returns
    -------
    pd.DataFrame
        Counts per category after merging, indexed by category
    '''
    counts = x.value_counts()

    cat_grp = counts[counts <= thresh].index.tolist()

    if cat_grp:
        # Walk from the rarest category upwards to find the first one at
        # which the running total reaches `thresh`.
        ascending = counts[::-1]
        n_below = int((ascending.cumsum() < thresh).sum())

        # n_below == len(ascending) means the total never reaches `thresh`,
        # so there is no further category to add.
        if n_below < len(ascending):
            cat_grp.append(ascending.index[n_below])

    merged = set(cat_grp)

    renamed = x.apply(lambda v: setval if v in merged else v)

    return pd.DataFrame(renamed.value_counts())

# NOTE(review): `get_title` is not defined or imported anywhere in the visible
# notebook, so this cell fails on a fresh kernel; confirm where it comes from.
x = get_title(df['Name'])['Name']
group_low_count_cat(x, 10, 'other')