### Winning solution of the Data Science Olympics 2019
### Running this notebook will give you a leaderboard score of 0.44057
### Tested on Python 2.7, but should also work on Python 3.x
### If you have any questions, don't hesitate to contact me at 
### romain.ayres@gmail.com

In [2]:
import pandas as pd
import numpy as np
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

# Seed NumPy's global generator so that the random fold assignment drawn
# with np.random.randint during target encoding is repeatable.
SEED = 1337
np.random.seed(SEED)

# Load data

In [3]:
def load_requests(path):
    """Read one requests CSV and index it by its first column.

    Rows with the wrong number of fields are skipped and reported instead
    of aborting the load.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
    """
    read_options = dict(sep=',', low_memory=False, index_col=0)
    try:
        # Current pandas spelling: 'warn' skips the malformed row and
        # emits a warning for it.
        return pd.read_csv(path, on_bad_lines='warn', **read_options)
    except TypeError:
        # Older pandas does not accept `on_bad_lines`; fall back to the
        # keyword it understands (removed in pandas 2.0).
        return pd.read_csv(path, error_bad_lines=False, **read_options)


# Load train
train = load_requests('data/train_requests.csv')
# Load test
test = load_requests('data/test_requests.csv')

# Rename target
train = train.rename(columns={'granted_number_of_nights': 'target'})

# Create target in test df before train+test concatenation
test['target'] = np.nan

# Create train/test variable before train+test concatenation
train['train'] = 1
test['train'] = 0

Skipping line 31303: expected 24 fields, saw 49
Skipping line 75954: expected 24 fields, saw 49



# Feature engineering

In [4]:
# Stack train on top of test (test columns selected in train's column order)
# so that every feature below is computed once over both sets.
shared_columns = train.columns
df = pd.concat([train, test.loc[:, shared_columns]], axis=0)

In [5]:
# Columns handled as categorical. The order of this list fixes the order in
# which the derived feature columns are appended to the frame.
categorical_features = [
    'animal_presence',
    'child_to_come',
    'group_composition_label',
    'group_id',
    'group_main_requester_id',
    'group_type',
    'housing_situation_label',
    'long_term_housing_request',
    'request_backoffice_creator_id',
    'requester_type',
    'social_situation_id',
    'town',
    'victim_of_violence',
    'victim_of_violence_type',
    'district',
]

In [6]:
# label encoding
# The encoder is fitted on the non-missing values only, and missing values
# get the sentinel -1. Keeping NaN out of LabelEncoder avoids depending on
# how a given Python / scikit-learn version orders NaN against strings
# (some combinations raise a TypeError).
for var in categorical_features:
    le_col = 'le_{}'.format(var)
    known = df[var].notnull()
    df[le_col] = -1
    df.loc[known, le_col] = LabelEncoder().fit_transform(df.loc[known, var])

In [7]:
# value counts
# Frequency encoding: each row receives the number of rows (train and test
# together) that share its value for the column.
for var in categorical_features:
    occurrences = df[var].value_counts()
    df['vc_' + var] = df[var].map(occurrences)

In [8]:
# date features
def create_date_features(df, column):
    """Expand a date column of ``df`` into numeric calendar features.

    ``df[column]`` is parsed with ``pd.to_datetime`` and overwritten, then
    the columns ``day_``, ``week_``, ``month_``, ``year_``, ``hour_``,
    ``weekday_`` and ``numeric_`` (each suffixed with ``column``) are added.
    The frame is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date column; mutated by this call.
    column : str
        Name of the column to parse and expand.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    def as_feature(values):
        # Normalise to int64 (no missing dates) or float64 (NaN for missing
        # dates). Recent pandas returns int32 / nullable UInt32 here, which
        # a selection on 'int64'/'float64' dtypes would silently discard.
        has_missing = values.isnull().any()
        return values.astype('float64' if has_missing else 'int64')

    df[column] = pd.to_datetime(df[column])
    dates = df[column].dt

    # `Series.dt.week` was removed in pandas 2.0; `isocalendar().week` is its
    # replacement. Fall back to the old accessor on versions without it.
    if hasattr(dates, 'isocalendar'):
        week = dates.isocalendar().week
    else:
        week = dates.week

    df['day_{}'.format(column)] = as_feature(dates.day)
    df['week_{}'.format(column)] = as_feature(week)
    df['month_{}'.format(column)] = as_feature(dates.month)
    df['year_{}'.format(column)] = as_feature(dates.year)
    df['hour_{}'.format(column)] = as_feature(dates.hour)
    df['weekday_{}'.format(column)] = as_feature(dates.weekday)
    # Integer timestamp scaled by 1e-9 (seconds when the column has
    # nanosecond resolution).
    # NOTE(review): a missing date becomes a very large negative number
    # here rather than NaN - confirm this is intended.
    df['numeric_{}'.format(column)] = df[column].astype(np.int64) * 1e-9
    return df

# create_date_features adds its columns to the frame it receives and returns
# that frame, so rebind `df` to the result. Assigning the returned frame to
# the single date column (as was done before) is rejected by current pandas.
# The raw datetime column is dropped once it has been expanded.
for date_col in ['answer_creation_date',
                 'group_creation_date',
                 'request_creation_date']:
    df = create_date_features(df, date_col)
    df = df.drop(date_col, axis=1)

# Elapsed time between the three events, in the unit of the numeric_* columns.
df['diff_date_1'] = (df['numeric_request_creation_date']
                     - df['numeric_group_creation_date'])
df['diff_date_2'] = (df['numeric_answer_creation_date']
                     - df['numeric_group_creation_date'])
df['diff_date_3'] = (df['numeric_answer_creation_date']
                     - df['numeric_request_creation_date'])

In [9]:
# target encoding
def target_encoding(df, grp_col, target_col, n_folds=10):
    """Add out-of-fold mean/count encodings of ``target_col`` per group.

    Every row is assigned a random fold. For each fold, the mean and the
    count of ``target_col`` are computed per ``grp_col`` group using only
    the rows of the *other* folds, and attached to the rows of that fold.
    Rows whose target is missing do not contribute to the statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    grp_col : list of str
        Column(s) to group by.
    target_col : str
        Column to aggregate.
    n_folds : int, default 10
        Number of random folds (drawn from NumPy's global RNG).

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` in the same order, a fresh
        default integer index, and two extra columns named
        ``te_mean_<cols>`` and ``te_count_<cols>`` where ``<cols>`` is
        ``'_'.join(grp_col)``.
    """
    keys = [grp_col] if isinstance(grp_col, str) else list(grp_col)
    suffix = '_'.join(grp_col)

    # Work on a new frame so the caller's frame is not left with a stray
    # 'fold' column.
    folds = np.random.randint(n_folds, size=len(df))
    df = df.assign(fold=folds)

    # Collect the per-fold statistics and concatenate once. This replaces
    # repeated DataFrame.append, which is quadratic and was removed in
    # pandas 2.0.
    per_fold = []
    for fold in df['fold'].unique():
        stats = (df[df['fold'] != fold]
                 .groupby(grp_col)[target_col]
                 .agg(['mean', 'count'])
                 .reset_index())
        stats['fold'] = fold
        per_fold.append(stats)
    df_te = pd.concat(per_fold, ignore_index=True)
    df_te = df_te.rename(columns={'mean': 'te_mean_{}'.format(suffix),
                                  'count': 'te_count_{}'.format(suffix)})

    # Join on explicit keys rather than on whatever columns happen to be
    # shared between the two frames.
    df = pd.merge(df, df_te, how='left', on=keys + ['fold'])
    return df.drop('fold', axis=1)

# Groupings to target-encode: first single columns, then pairs of columns.
# The order is kept as is because each grouping triggers its own random
# fold draw, so reordering would change which draw each grouping receives.
# NOTE(review): 'housing_situation_id' is used here while the categorical
# feature list uses 'housing_situation_label' - confirm both columns exist.
single_key_groups = [
    ['group_id'],
    ['group_main_requester_id'],
    ['request_backoffice_creator_id'],
    ['social_situation_id'],
    ['housing_situation_id'],
    ['district'],
    ['town'],
]
paired_key_groups = [
    ['social_situation_id', 'group_main_requester_id'],
    ['social_situation_id', 'group_id'],
    ['social_situation_id', 'request_backoffice_creator_id'],
    ['group_main_requester_id', 'group_id'],
    ['group_main_requester_id', 'request_backoffice_creator_id'],
    ['group_id', 'request_backoffice_creator_id'],
    ['social_situation_id', 'housing_situation_id'],
    ['group_main_requester_id', 'housing_situation_id'],
    ['group_id', 'housing_situation_id'],
    ['request_backoffice_creator_id', 'housing_situation_id'],
]
te_grp_cols = single_key_groups + paired_key_groups

# Apply the target encoding once per grouping; each call returns a frame
# with two extra te_* columns, so `df` is rebound on every pass.
for grouping in te_grp_cols:
    df = target_encoding(df, grouping, 'target', n_folds=10)

# Create X & X_test

In [10]:
# Keep every numeric column as a candidate model input. Selecting on the
# numeric kind, rather than on the exact dtype names 'int64'/'float64',
# avoids silently discarding numeric columns stored with another width
# (for example int32).
X_full = df.select_dtypes(include=[np.number]).copy()
numerical_features = list(X_full.columns)

# The 'train' flag separates labelled rows (1) from rows to predict (0).
is_train = X_full['train'] == 1
is_test = X_full['train'] == 0

y = X_full.loc[is_train, 'target']

# 'target' and 'train' are bookkeeping columns, not features.
X = X_full[is_train].drop(['target', 'train'], axis=1)
X_test = X_full[is_test].drop(['target', 'train'], axis=1)

# Fit

In [11]:
# Hyper-parameters of the gradient-boosting classifier.
# NOTE(review): 'logloss' does not look like a LightGBM objective name;
# presumably the scikit-learn wrapper substitutes its own multiclass
# objective - confirm against the installed LightGBM version.
lgb_params = dict(
    nthread=8,
    objective='logloss',
    n_estimators=400,
    num_leaves=45,
    learning_rate=0.03,
    subsample=1.0,
    subsample_freq=1,
    colsample_bytree=0.3,
    min_child_samples=10,
)
lgb = LGBMClassifier(**lgb_params)

# Each training row is weighted by 10 raised to its target value, so rows
# with a larger target weigh exponentially more in the fit.
row_weights = 10**y
lgb.fit(X, y, sample_weight=row_weights)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
        importance_type='split', learning_rate=0.03, max_depth=-1,
        min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=400, n_jobs=-1, nthread=8, num_leaves=45,
        objective='logloss', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

# Predict

In [12]:
# Class probabilities for every test row, one column per class.
preds = lgb.predict_proba(X_test)

# Put the test identifiers next to the probabilities. Both frames carry a
# default integer index, so they line up row by row.
test_ids = pd.DataFrame(test.index)
class_probabilities = pd.DataFrame(preds)
df_pred = pd.concat([test_ids, class_probabilities], axis=1)

# Submit

In [13]:
# Write the submission file, without the frame's integer index.
submission_path = 'data/submit.csv'
df_pred.to_csv(submission_path, index=False)