In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.pipeline import make_pipeline, Pipeline, make_union, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

%pip install eli5
import eli5

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

RANDOM_SEED = 989

Note: you may need to restart the kernel to use updated packages.
/kaggle/input/nlpword2vecembeddingspretrained/glove.6B.200d.txt
/kaggle/input/nlpword2vecembeddingspretrained/glove.6B.50d.txt
/kaggle/input/nlpword2vecembeddingspretrained/glove.6B.300d.txt
/kaggle/input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin
/kaggle/input/nlpword2vecembeddingspretrained/glove.6B.100d.txt
/kaggle/input/emailsendtime/email_best_send_time_sample_submission.csv
/kaggle/input/emailsendtime/email_best_send_time_test.csv
/kaggle/input/emailsendtime/email_best_send_time_train.csv


## Utils

In [14]:
def get_column_names_from_ColumnTransformer(column_transformer):  
    col_name = []

    for transformer_in_columns in column_transformer.transformers_[:-1]: #the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = list(transformer_in_columns[2])

        if isinstance(transformer_in_columns[1], Pipeline): 
            # if pipeline, get the last transformer
            transformer = transformer_in_columns[1].steps[-1][1]
            if isinstance(transformer, StandardScaler):
                transformer = transformer_in_columns[1].steps[-2][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            if isinstance(transformer, OneHotEncoder):
                names = list(transformer.get_feature_names())
            elif isinstance(transformer, SimpleImputer) and transformer.add_indicator:
                missing_indicator_indices = transformer.indicator_.features_
                missing_indicators = [raw_col_name[idx] + '_missing_flag' for idx in missing_indicator_indices]
                names = raw_col_name + missing_indicators
            elif transformer == 'drop':
                names = []
            else:
                names = list(transformer.get_feature_names())
        except AttributeError as error:
            names = raw_col_name

        col_name.extend(names)
            
    return col_name

In [15]:
def get_f1_valid(X, y, model=None, seed=RANDOM_SEED, **kwargs):
    '''
    Cross-validates given model (LogitRegression by default), and returns cv-scores, mean and std
    '''
    kfold = StratifiedKFold(10, shuffle=True, random_state=seed)
    
    if model is None:
        if 'C' not in kwargs:
            kwargs['C'] = 1
        model = LogisticRegression(random_state=seed, solver="liblinear",  **kwargs)

    scores = cross_val_score(model, X, y, cv=kfold, scoring='f1', n_jobs=-1)

    return {
        'scores': scores,
        'mean': scores.mean(),
        'std': scores.std()
    }


def get_weights(X, y, C=1.0, feature_names=None, seed=RANDOM_SEED, **kwargs):
    '''
    trainlogistic regression on the given dataset, and vizualize weights with eli5
    '''
    lr = LogisticRegression(C=C, random_state=seed, solver="liblinear",  **kwargs)

    lr.fit(X, y)
    return eli5.show_weights(lr, feature_names=feature_names)

In [16]:
def gridsearch_lr(X, y, Cs, seed=RANDOM_SEED, **kwargs):
    kfold = StratifiedKFold(10, shuffle=True, random_state=seed)
    lr = LogisticRegressionCV(Cs, random_state=seed, scoring='f1',
                              cv=kfold, solver="liblinear", n_jobs=-1, **kwargs)
    lr.fit(X, y)
    scores = lr.scores_[1][:, lr.Cs_==lr.C_[0]].flatten()

    return {
        'scores': scores,
        'C_': lr.C_[0],
        'Cs': lr.Cs_,
        'mean': scores.mean(),
        'std': scores.std()
    }

In [17]:
TRAIN_PATH = '../input/emailsendtime/email_best_send_time_train.csv'
TEST_PATH = '../input/emailsendtime/email_best_send_time_test.csv'
train_df = pd.read_csv(TRAIN_PATH, index_col='MailID', parse_dates=['SentOn'])
test_df = pd.read_csv(TEST_PATH, index_col='MailID', parse_dates=['SentOn'])
train_df.head()

Unnamed: 0_level_0,Subject,MailBoxID,ContactID,TimeZone,SentOn,Opened
MailID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C278C72C-63D6-4D42-B5A6-455FA2C80D7C,"""Cold emails not converting?""",9F1BAD1D-0370-4200-0045-08D70B884EBA,8977EFF2-DCAD-4E29-B29C-BE55449D24A2,"""(UTC+00:00) Dublin, Edinburgh, Lisbon, London""",2020-09-09 11:34:00,0
F5CA7090-9A73-4197-9923-C094B140C4D1,"""Webnar: Cold emails not converting?""",7FCE4E75-9A26-4EE6-7806-08D6CEFA3BFA,22A565BC-DC52-4CF5-8852-5C343F477F54,"""(UTC-06:00) Central Time (US & Canada)""",2020-09-09 11:35:00,0
B4837C83-9425-4C77-907E-E3B318A5F0F0,"""Cold emails not converting?""",9F1BAD1D-0370-4200-0045-08D70B884EBA,00E3A137-053F-4838-AB6F-4A8F6F52969A,"""(UTC+00:00) Dublin, Edinburgh, Lisbon, London""",2020-09-09 11:38:00,0
204546C3-073D-418E-98BD-E5553B339686,"""Webnar: Cold emails not converting?""",7FCE4E75-9A26-4EE6-7806-08D6CEFA3BFA,9A13D123-10AA-46BF-B5C9-7199074DDD71,"""(UTC-05:00) Eastern Time (US & Canada)""",2020-09-09 11:40:00,0
F28E3C17-0000-4549-9153-5896688E7581,"""Webnar: Cold emails not converting?""",7FCE4E75-9A26-4EE6-7806-08D6CEFA3BFA,FA770B61-2BB3-44EB-A064-0F80CAD8E753,"""(UTC-06:00) Central Time (US & Canada)""",2020-09-09 11:43:00,0


## Transformers

In [18]:
class ResponseExtractor(BaseEstimator, TransformerMixin):
    '''
    Extracts IsResponce featurefrom the email subject (checks if contains 're:')
    '''
    def fit(self, X=None, y=None):
        return self
    
    def transform(self, X, y=None):
        subj = X['Subject'].str.slice(1, -1)  # remove the quotes
        return pd.DataFrame({'IsResponse': subj.apply(lambda x: x.lower()[:3] == 're:').astype('int')})
    
    def get_feature_names(self):
        return ['IsResponse']
    
ResponseExtractor().fit_transform(train_df).tail()

Unnamed: 0_level_0,IsResponse
MailID,Unnamed: 1_level_1
2EA3A30E-D8DB-44E3-A8B3-CA7D9BA2633C,1
114E23DD-5386-40FA-B7C8-AFD80628FE4D,1
6EE9027F-E9E5-476C-B1F4-F9A6433B5DDF,0
DE170985-95AB-47B6-A077-B7B4BD6C6FF3,1
980A56CD-3649-4369-9497-A1D44AA73550,0


In [19]:
class TimeZoneExtractor(BaseEstimator, TransformerMixin):
    '''
    Splits original TimeZone column to TimeZone and Region
    '''
    def __init__(self, columns=['TZ', 'Region']):
        self.columns = columns
        super().__init__()
    
    def fit(self, X=None, y=None):
        return self
    
    def transform(self, X, y=None):
        df = X['TimeZone'].str.slice(1, -1).str.split(n=1, expand=True)
        df.columns = ['TZ', 'Region']
        return df[self.columns]
    

TimeZoneExtractor().fit_transform(train_df).head()

Unnamed: 0_level_0,TZ,Region
MailID,Unnamed: 1_level_1,Unnamed: 2_level_1
C278C72C-63D6-4D42-B5A6-455FA2C80D7C,(UTC+00:00),"Dublin, Edinburgh, Lisbon, London"
F5CA7090-9A73-4197-9923-C094B140C4D1,(UTC-06:00),Central Time (US & Canada)
B4837C83-9425-4C77-907E-E3B318A5F0F0,(UTC+00:00),"Dublin, Edinburgh, Lisbon, London"
204546C3-073D-418E-98BD-E5553B339686,(UTC-05:00),Eastern Time (US & Canada)
F28E3C17-0000-4549-9153-5896688E7581,(UTC-06:00),Central Time (US & Canada)


In [20]:
class RecipientTimeExtractor(BaseEstimator, TransformerMixin):
    '''
    Returns the recipient time (Send time with respect to time zone): hour and/or daytime
    '''
    
    def __init__(self, hour=True, daytime=False):
        self.hour = hour
        self.daytime = daytime
        self.columns = []
        if hour:
            self.columns.append('LocalHour')
        if daytime:
            self.columns += ['Morning', 'Day', 'Evening','Night']
        super().__init__()

    def fit(self, X, y=None):
        self.imp_value = X['TimeZone'].mode()[0]
        return self
    
    def transform(self, X, y=None):
        time = X['TimeZone'].fillna(self.imp_value).str.slice(5, 11) + ':00'
        time = pd.to_timedelta(time)
        
        time = pd.DataFrame({'LocalHour':(X['SentOn'] + time).dt.hour})
        hour = time['LocalHour']
        time['Morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
        time['Day'] = ((hour >= 12) & (hour <= 18)).astype('int')
        time['Evening'] = ((hour >= 19) & (hour <= 23)).astype('int')
        time['Night'] = ((hour >= 0) & (hour <= 6)).astype('int')
        return time[self.columns]
    
    def get_feature_names(self):
        return self.columns
    

RecipientTimeExtractor().fit_transform(train_df).head()

Unnamed: 0_level_0,LocalHour
MailID,Unnamed: 1_level_1
C278C72C-63D6-4D42-B5A6-455FA2C80D7C,11
F5CA7090-9A73-4197-9923-C094B140C4D1,5
B4837C83-9425-4C77-907E-E3B318A5F0F0,11
204546C3-073D-418E-98BD-E5553B339686,6
F28E3C17-0000-4549-9153-5896688E7581,5


In [21]:
class WeekdayExtractor(BaseEstimator, TransformerMixin):
    '''
    Extracts day names from the SentOn column
    '''

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        day = X['SentOn'].dt.day_name()
        return pd.DataFrame({'Weekday':day})


WeekdayExtractor().fit_transform(train_df).head()

Unnamed: 0_level_0,Weekday
MailID,Unnamed: 1_level_1
C278C72C-63D6-4D42-B5A6-455FA2C80D7C,Wednesday
F5CA7090-9A73-4197-9923-C094B140C4D1,Wednesday
B4837C83-9425-4C77-907E-E3B318A5F0F0,Wednesday
204546C3-073D-418E-98BD-E5553B339686,Wednesday
F28E3C17-0000-4549-9153-5896688E7581,Wednesday


In [22]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    '''
    Prepares texts(email subjects) to vectorizing
    Removes non-letters, stop words, converts to lowercase and stems if choosen
    '''

    def __init__(self, stem=True):
        self.stops = set(stopwords.words("english"))
        self.stem = stem
        if stem:
            self.stemmer = PorterStemmer()
        super().__init__()

    def fit(self, X=None, y=None):
        return self

    def transform(self, X, y=None):
        '''
        Notice that output is 1d, because it is required by vectorizers
        '''
        return X['Subject'].apply(self.preprocess_text)

    def preprocess_text(self, raw_text):
        # Remove non-letters
        letters_only = re.sub(r"[^a-zA-Z]", " ", raw_text)
        # Convert to lower case, split into individual words
        words = letters_only.lower().split()
        # Remove stop words (and stem others if needed)
        if self.stem:
            meaningful_words = [self.stemmer.stem(w) for w in words if w not in self.stops]
        else:
            meaningful_words = [w for w in words if w not in self.stops]
        return(" ".join(meaningful_words))
    
TextPreprocessor().fit_transform(train_df).head()

MailID
C278C72C-63D6-4D42-B5A6-455FA2C80D7C           cold email convert
F5CA7090-9A73-4197-9923-C094B140C4D1    webnar cold email convert
B4837C83-9425-4C77-907E-E3B318A5F0F0           cold email convert
204546C3-073D-418E-98BD-E5553B339686    webnar cold email convert
F28E3C17-0000-4549-9153-5896688E7581    webnar cold email convert
Name: Subject, dtype: object

In [23]:
class TextLenExtractor(BaseEstimator, TransformerMixin):
    '''
    Extracts the length of the subject
    '''
    def fit(self, X=None, y=None):
        return self

    def transform(self, X, y=None):
        return pd.DataFrame({'Length': X['Subject'].str.len()})
    
    def get_feature_names(self):
        return ['Length']
    
TextLenExtractor().fit_transform(train_df).head()

Unnamed: 0_level_0,Length
MailID,Unnamed: 1_level_1
C278C72C-63D6-4D42-B5A6-455FA2C80D7C,29
F5CA7090-9A73-4197-9923-C094B140C4D1,37
B4837C83-9425-4C77-907E-E3B318A5F0F0,29
204546C3-073D-418E-98BD-E5553B339686,37
F28E3C17-0000-4549-9153-5896688E7581,37


In [24]:
class YearMonthExtractor(BaseEstimator, TransformerMixin):
    '''
    Extracts the number of months (current year*12 + current month)
    '''
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        y_m = pd.DataFrame(index=X.index)
        y_m['year'] = X["SentOn"].dt.year.astype("int")
        y_m['month'] = X["SentOn"].dt.month.astype("int")
        y_m['yyyymm'] = 12*y_m['year'] + y_m['month']
        return y_m[['yyyymm']]


YearMonthExtractor().fit_transform(train_df).head()

Unnamed: 0_level_0,yyyymm
MailID,Unnamed: 1_level_1
C278C72C-63D6-4D42-B5A6-455FA2C80D7C,24249
F5CA7090-9A73-4197-9923-C094B140C4D1,24249
B4837C83-9425-4C77-907E-E3B318A5F0F0,24249
204546C3-073D-418E-98BD-E5553B339686,24249
F28E3C17-0000-4549-9153-5896688E7581,24249


## Pipeline and model fitting

In [27]:
%%time
local_hour_extractor = make_pipeline(
    RecipientTimeExtractor(hour=True, daytime=False),
    StandardScaler()
)

local_daytime_extractor = make_pipeline(
    RecipientTimeExtractor(hour=False, daytime=True)
)

subject_len_extractor = make_pipeline(
    TextLenExtractor(),
    StandardScaler()
)

subject_vectorizer = make_pipeline(
    TextPreprocessor(),
    CountVectorizer(ngram_range=(1, 2), max_features=15000)
)

region_encoder = make_pipeline(
    TimeZoneExtractor([
#         'TZ',
        'Region'
    ]),
    OneHotEncoder(drop='first')
)

weekday_encoder = make_pipeline(
    WeekdayExtractor(),
    OneHotEncoder(drop='first')
)

year_month_extractor = make_pipeline(
    YearMonthExtractor(),
    StandardScaler()
)

transformer = make_column_transformer(
    [OneHotEncoder(handle_unknown='ignore'), ['MailBoxID']],
    ['drop', ['ContactID']],
    [region_encoder, ['TimeZone']],
    [weekday_encoder, ['SentOn']],
    [local_hour_extractor, ['TimeZone', 'SentOn']],
    [local_daytime_extractor, ['TimeZone', 'SentOn']],
    [year_month_extractor,  ['SentOn']],
    [subject_len_extractor, ['Subject']],
    [ResponseExtractor(), ['Subject']],
    [subject_vectorizer, ['Subject']]
)

X_train = transformer.fit_transform(train_df)
y_train = train_df['Opened']

get_f1_valid(X_train, y_train, class_weight='balanced', C=35.93813663804626)

CPU times: user 8.97 s, sys: 407 ms, total: 9.37 s
Wall time: 57 s


{'scores': array([0.54595261, 0.53099805, 0.53832491, 0.54640451, 0.54247556,
        0.54200764, 0.54941095, 0.54515327, 0.53979144, 0.54862435]),
 'mean': 0.5429143292591001,
 'std': 0.00524464757529268}

LogReg class_weight='balanced', C=35.93813663804626 (without dropping first in OHE)

| Features   |      Mean CV f1      | std CV f1 |
|----------|:-------------:|----------:|
| Submission1: OHE MailBox, Reg, Weekday, local hour, local daytime, response, CountVect ngram(1,2) | 0.5347132956436609 | 0.004051454233162246 |
| +subj len |  0.5369672088484312 | 0.004912771414957262
| +subj len & BoW 15k features| 0.5411631028056966 | 0.004628249991601052
|w2v instead of BoW |  0.47833332191961225| 0.005458159684642388

In [None]:
# %%time
# rez = gridsearch_lr(X_train, y_train, np.logspace(-0.5, 1.6, 6), class_weight='balanced')
# rez

In [28]:
from sklearn.feature_selection import SelectPercentile, f_regression
selector = SelectPercentile(percentile=75)
X_reduced = selector.fit_transform(X_train, y_train)
X_reduced.shape

(100144, 11337)

In [29]:
get_f1_valid(X_reduced, y_train, class_weight='balanced', C=5.754399373371572) #0.5630948155250408

{'scores': array([0.56448957, 0.55370061, 0.56102901, 0.563147  , 0.56475117,
        0.55900451, 0.57162668, 0.56634483, 0.55982316, 0.56696795]),
 'mean': 0.5630884490920425,
 'std': 0.004722294304236503}

In [30]:
# Submission
# ohe mailbox, region, weekday, local hour, local daytime, year*12 + month, subj len, re, subj countvect (1,2) 15k
# + select best f 0.75
logit = LogisticRegression(C=5.754399373371572, random_state=RANDOM_SEED, solver="liblinear", class_weight='balanced')
logit.fit(X_train, y_train)
test_df['Opened'] = np.nan
X_test = transformer.transform(test_df)
test_df['Opened'] = logit.predict(X_test)
test_df[['Opened']].to_csv('submission.csv') # LB: 0.54618 

In [31]:
eli5.show_weights(logit, feature_names=get_column_names_from_ColumnTransformer(transformer))

Weight?,Feature
+4.368,bill time
+4.151,sandra begin
+4.054,hi ebenez
+3.844,greg lightn
+3.774,tom trust
+3.641,juli salesforc
+3.583,melodi lightn
+3.566,jame time
+3.547,see lori
+3.515,inbox maureen
