# Featurization - `Ticket`

1. Remove special characters but not space
2. replace numeric strings by 'numeric'
3. split on space and keep the first item

In [25]:
import pandas as pd

In [26]:
df = pd.read_csv("../../data/raw/train.csv")
dfX = df.drop(['Survived', 'PassengerId'], axis=1)
dfy = df.Survived

In [27]:
df.Ticket

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

## Function to extract the ticket type

In [33]:
def extract_ticket_type(x):
    '''
    1. Remove special characters but not space
    2. replace numeric strings by 'numeric'
    3. split on space and keep the first item

    '''
    x = x.str.replace(r'[^A-Za-z0-9\s]+', '')
    
    def replace(v):
        if not v.isdigit():
            return v.split(' ')[0]
        else:
            return 'numeric'
        
    x = x.apply(replace)
    
    return pd.DataFrame(x)

In [34]:
ticket_type = extract_ticket_type(df.Ticket).Ticket
ticket_type

0           A5
1           PC
2       STONO2
3      numeric
4      numeric
        ...   
886    numeric
887    numeric
888         WC
889    numeric
890    numeric
Name: Ticket, Length: 891, dtype: object

In [35]:
ticket_type.value_counts()

numeric    661
PC          60
CA          41
A5          21
SOTONOQ     15
STONO       12
WC          10
SCPARIS      7
A4           7
SOC          6
STONO2       6
FCC          5
C            5
SCParis      4
LINE         4
SCAH         3
SOPP         3
WEP          3
PP           3
SWPP         2
SOTONO2      2
PPP          2
AS           1
FC           1
Fa           1
SOP          1
CASOTON      1
SP           1
SCOW         1
SC           1
SCA4         1
Name: Ticket, dtype: int64

In [36]:
from src.utils import group_low_count_cat

In [37]:
group_low_count_cat(ticket_type, 15, 'other').index

Index(['numeric', 'other', 'PC', 'CA', 'A5'], dtype='object')

## Modified function
### So now we can modify our function to group these categories

In [41]:
def extract_ticket_type(x, only_numeric=False, exclude_thresh=None):
    '''
    1. Remove special characters but not space
    2. replace numeric strings by 'numeric'
    3. split on space and keep the first item
    4. Group all categories into 'other' excluding above {exclude_thresh}
    5. Put rest into 'other'
    '''
    x = x.str.replace(r'[^A-Za-z0-9\s]+', '')
    
    def replace(v):
        if not v.isdigit():
            return v.split(' ')[0]
        else:
            return 'numeric'
     
    x = x.apply(replace)
        
    if only_numeric:
        x.loc[~x.isin(['numeric'])] = 'other'
        return pd.DataFrame(x)
    
    elif exclude_thresh is not None:
        if exclude_thresh <= 10:
            cat = ['PC', 'CA', 'A5', 'SOTONOQ', 'STONO']
        elif exclude_thresh <= 12:
            cat = ['PC', 'CA', 'A5', 'SOTONOQ']
        elif exclude_thresh <= 15:
            cat = ['PC', 'CA', 'A5']
            
        x.loc[~x.isin(['numeric'] + cat)] = 'other'    
    
    return pd.DataFrame(x)

### Default behaviour

In [42]:
extract_ticket_type(df.Ticket)

Unnamed: 0,Ticket
0,A5
1,PC
2,STONO2
3,numeric
4,numeric
...,...
886,numeric
887,numeric
888,WC
889,numeric


### only_numeric

In [43]:
extract_ticket_type(df.Ticket, only_numeric=True).Ticket.value_counts()

numeric    661
other      230
Name: Ticket, dtype: int64

### select a threshold

In [44]:
extract_ticket_type(df.Ticket, exclude_thresh=10).Ticket.value_counts()

numeric    661
other       81
PC          60
CA          41
A5          21
SOTONOQ     15
STONO       12
Name: Ticket, dtype: int64

## Custom `Transformer ` for Extracting the Ticket type

In [74]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [75]:
class TicketTypeExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self,  only_numeric=False, exclude_thresh=10, drop='auto', 
                 handle_unknown='prespecify', sparse=False):
        self.exclude_thresh = exclude_thresh
        self.only_numeric = only_numeric
        self.handle_unknown = handle_unknown
        self.drop = drop
        self.sparse = sparse
        
    @staticmethod
    def replace(v):
        if not v.isdigit():
            return v.split(' ')[0]
        else:
            return 'numeric'
        
        
    def fit(self, X, y=None):
            
        if self.only_numeric:
            self.cat = ['numeric', 'other']
        else:
            if self.exclude_thresh <= 10:
                self.exclude_cat = ['PC', 'CA', 'A5', 'SOTONOQ', 'STONO']
            elif self.exclude_thresh <= 12:
                self.exclude_cat = ['PC', 'CA', 'A5', 'SOTONOQ']
            elif self.exclude_thresh <= 15:
                self.exclude_cat = ['PC', 'CA', 'A5']
                                
            self.cat = self.exclude_cat + ['numeric', 'other']
            
        try:    
            name = X.name 
        except AttributeError:
            name = 'Ticket'
            
            
        dummy_df = pd.DataFrame({name: ['numeric']})
        
        self.ohe = OneHotEncoder(categories=[self.cat], drop=['other']).fit(dummy_df)
        
        return self
    
    def transform(self, X):
        X = X.apply(self.replace)
        
        if self.only_numeric:
            X.loc[~X.isin(['numeric'])] = 'other'

        X.loc[~X.isin(['numeric'] + self.cat)] = 'other'
    
        
        return self.ohe.fit_transform(pd.DataFrame(X))
    
    def get_feature_names(self, input_features=None):
        return self.ohe.get_feature_names(input_features)
    
    

In [76]:
from sklearn_pandas import DataFrameMapper

In [77]:
dfm = DataFrameMapper([('Ticket', TicketTypeExtractor())], input_df=True, df_out=True)
dfm.fit(df).transform(pd.DataFrame({'Ticket': ['numeric']}))

Unnamed: 0,Ticket_x0_PC,Ticket_x0_CA,Ticket_x0_A5,Ticket_x0_SOTONOQ,Ticket_x0_STONO,Ticket_x0_numeric
0,0.0,0.0,0.0,0.0,0.0,1.0


In [78]:
TicketTypeExtractor(only_numeric=False).fit(df['Ticket']).transform(pd.Series(['numeric']))

<1x6 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [79]:
TicketTypeExtractor().fit_transform(df.Ticket)

<891x6 sparse matrix of type '<class 'numpy.float64'>'
	with 727 stored elements in Compressed Sparse Row format>

In [81]:
make_column_transformer((TicketTypeExtractor(), 'Ticket')).fit_transform(df)

<891x6 sparse matrix of type '<class 'numpy.float64'>'
	with 727 stored elements in Compressed Sparse Row format>