In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('ggplot')
plt.rcParams["font.family"] = "Times New Roman"

In [2]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
from catboost.utils import get_roc_curve
import xgboost as xgb
from typing import List, Tuple
import datetime
from datetime import datetime, date
from tqdm import tqdm_notebook

***********************

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
train = reduce_mem_usage(pd.read_csv('assignment_2_train.csv'))

Memory usage of dataframe is 541.08 MB
Memory usage after optimization is: 262.48 MB
Decreased by 51.5%


In [5]:
train.head(2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


In [6]:
train.set_index('TransactionID', inplace=True)

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(train.drop(columns='isFraud'),
                                                      train['isFraud'],
                                                      shuffle=False,
                                                      train_size=0.8
                                                     )

print("X_train:  {} rows, {} cols".format(*X_train.shape))
print("X_valid:  {} rows, {} cols".format(*X_valid.shape))

X_train:  144000 rows, 392 cols
X_valid:  36000 rows, 392 cols


*********************

**Задание 0:** выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.

In [8]:
train_len = len(X_train)
dataset = pd.concat(objs=[X_train, X_valid], axis=0)
dataset = pd.get_dummies(dataset)
dum_train = dataset[:train_len].copy() # dummies
dum_valid = dataset[train_len:].copy() # dummies

In [9]:
train_xgb = xgb.DMatrix(data=dum_train,
                            label=y_train
                            )

valid_xgb = xgb.DMatrix(data=dum_valid,
                            label=y_valid,
                            )

In [10]:
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.05,
    "reg_lambda": 100,
    "max_depth": 4,
    "gamma": 10,
    "nthread": -1,
    "seed": 13
}

In [11]:
model_xgb_1 = xgb.train(
    params=params,
    dtrain=train_xgb,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(train_xgb, "train"), (valid_xgb, "valid")],
    verbose_eval=50,
    maximize=True,
)

[0]	train-auc:0.61546	valid-auc:0.62643
[50]	train-auc:0.85591	valid-auc:0.84138
[100]	train-auc:0.88780	valid-auc:0.86141
[150]	train-auc:0.90207	valid-auc:0.87917
[200]	train-auc:0.90865	valid-auc:0.88687
[250]	train-auc:0.91292	valid-auc:0.88940
[300]	train-auc:0.91581	valid-auc:0.89119
[350]	train-auc:0.91804	valid-auc:0.89186
[376]	train-auc:0.91807	valid-auc:0.89181


*********************

**Задание 1:** признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [12]:
start_date = int(datetime(2017, 12, 1).timestamp())
print(start_date)

1512075600


In [13]:
def get_date_item(df, date):
    
    df[date] = pd.to_datetime(df[date] + start_date)
    df['Year'] = df[date].dt.year
    df['Month'] = df[date].dt.month
    df['Day_of_week'] = df[date].dt.weekday
    df['Hour'] = df[date].dt.hour
    df['Day'] = df[date].dt.day
    df.drop([date], axis=1, inplace=True)
    
    return df

In [14]:
df_train = get_date_item(dum_train.copy(), "TransactionDT")
df_valid = get_date_item(dum_valid.copy(), "TransactionDT")

In [15]:
train_xgb = xgb.DMatrix(data=df_train,
                            label=y_train
                            )

valid_xgb = xgb.DMatrix(data=df_valid,
                            label=y_valid,
                            )

In [16]:
model_xgb_2 = xgb.train(
    params=params,
    dtrain=train_xgb,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(train_xgb, "train"), (valid_xgb, "valid")],
    verbose_eval=50,
    maximize=True,
)

[0]	train-auc:0.61546	valid-auc:0.62643
[50]	train-auc:0.85591	valid-auc:0.84138
[100]	train-auc:0.88718	valid-auc:0.85897
[150]	train-auc:0.90087	valid-auc:0.87171
[200]	train-auc:0.90736	valid-auc:0.87870
[250]	train-auc:0.91130	valid-auc:0.88137
[300]	train-auc:0.91343	valid-auc:0.88181
[337]	train-auc:0.91356	valid-auc:0.88178


При тех же гиперпараметерах модель показала более низкий результат

****************

**Задание 2:** сделать конкатенацию признаков
* card1 + card2;
* card1 + card2 + card_3 + card_5;
* card1 + card2 + card_3 + card_5 + addr1 + addr2

Рассматривать их как категориальных признаки.

In [17]:
def feature_concatenation(df, features):
    new_name = f'{features[0]}'
    data = df[new_name].astype('str')
    for i in range(1, len(features)):
        new_name += f'_{features[i]}'
        df[features[i]] = df[features[i]].astype('str')
        data += df[features[i]]
    df[new_name] = data
    return df

In [18]:
df_train = feature_concatenation(X_train.copy(), ['card1', 'card2'])
df_valid = feature_concatenation(X_valid.copy(), ['card1', 'card2'])
df_train = feature_concatenation(df_train, ['card1', 'card2', 'card3', 'card5'])
df_valid = feature_concatenation(df_valid, ['card1', 'card2', 'card3', 'card5'])
df_train = feature_concatenation(df_train, ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2'])
df_valid = feature_concatenation(df_valid, ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2'])

In [19]:
df_train['target'] = y_train
df_valid['target'] = y_valid

In [20]:
# кодирование средним - иначе MemoryError 
agg_cols = ['card1_card2',
            'card1_card2_card3_card5',
            'card1_card2_card3_card5_addr1_addr2'
           ]
train_len = len(df_train)
dataset = pd.concat(objs=[df_train, df_valid], axis=0)

for col in tqdm_notebook(agg_cols):
    gp = dataset.groupby(col)['target']
    mean = gp.mean()
    dataset[col + '_trg_avg'] = dataset[col].map(mean)


dataset = dataset.drop(agg_cols+['target'], axis=1)

  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
dataset = pd.get_dummies(dataset)
dum_train = dataset[:train_len].copy() # dummies
dum_valid = dataset[train_len:].copy() # dummies

In [22]:
train_xgb = xgb.DMatrix(data=dum_train,
                            label=y_train
                            )

valid_xgb = xgb.DMatrix(data=dum_valid,
                            label=y_valid,
                            )

In [23]:
model_xgb_3 = xgb.train(
    params=params,
    dtrain=train_xgb,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(train_xgb, "train"), (valid_xgb, "valid")],
    verbose_eval=50,
    maximize=True,
)

[0]	train-auc:0.81324	valid-auc:0.81190
[50]	train-auc:0.95498	valid-auc:0.95008
[100]	train-auc:0.97239	valid-auc:0.96790
[150]	train-auc:0.97494	valid-auc:0.97019
[200]	train-auc:0.97622	valid-auc:0.97104
[250]	train-auc:0.97682	valid-auc:0.97188
[300]	train-auc:0.97703	valid-auc:0.97205
[350]	train-auc:0.97707	valid-auc:0.97208
[400]	train-auc:0.97713	valid-auc:0.97209
[412]	train-auc:0.97713	valid-auc:0.97209


качество модели улучшилось, однако скорее всего модель излишне переобучилась из-за target encoding на большом колчистве nunique категорий.  на XGBoost другие подходы сложновыполнимы. а дамми кодирование выдает  MemoryError

**************************

**Задание 3:** Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [24]:
def fe_features(df, features):
    for col in features:
        freq_encoder = df[col].value_counts(normalize=True)
        df[f'{col}_freq_enc'] = df[col].map(freq_encoder)
    return df

In [25]:
features = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']
df_train = fe_features(X_train.copy(), features)
df_valid = fe_features(X_valid.copy(), features)

In [26]:
train_len = len(df_train)
dataset = pd.concat(objs=[df_train, df_valid], axis=0)
dataset = pd.get_dummies(dataset)
dum_train = dataset[:train_len].copy() # dummies
dum_valid = dataset[train_len:].copy() # dummies

In [27]:
train_xgb = xgb.DMatrix(data=dum_train,
                            label=y_train
                            )

valid_xgb = xgb.DMatrix(data=dum_valid,
                            label=y_valid,
                            )

In [28]:
model_xgb_3 = xgb.train(
    params=params,
    dtrain=train_xgb,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=[(train_xgb, "train"), (valid_xgb, "valid")],
    verbose_eval=50,
    maximize=True,
)

[0]	train-auc:0.61546	valid-auc:0.62643
[50]	train-auc:0.85620	valid-auc:0.84048
[100]	train-auc:0.88819	valid-auc:0.86083
[150]	train-auc:0.90307	valid-auc:0.87930
[200]	train-auc:0.90935	valid-auc:0.88612
[250]	train-auc:0.91377	valid-auc:0.88872
[300]	train-auc:0.91628	valid-auc:0.88964
[344]	train-auc:0.91628	valid-auc:0.88964


чуть хуже базовой модели

***********************

**Задание 4:** Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [29]:
df_train = feature_concatenation(X_train.copy(), ['card1', 'card2'])
df_valid = feature_concatenation(X_valid.copy(), ['card1', 'card2'])
df_train = feature_concatenation(df_train, ['card1', 'card2', 'card3', 'card5'])
df_valid = feature_concatenation(df_valid, ['card1', 'card2', 'card3', 'card5'])
df_train = feature_concatenation(df_train, ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2'])
df_valid = feature_concatenation(df_valid, ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2'])

In [30]:
features_agg = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', \
                'card1_card2', 'card1_card2_card3_card5', 'card1_card2_card3_card5_addr1_addr2']
target = 'TransactionAmt'
# ЕСЛИ правильно понята фраза отношение к статистике
for feature in features_agg:
    df_train[f'{feature}_TrAmt_stat'] = df_train[target]/(df_train.groupby(feature)[target].transform('mean') / \
                                        df_train.groupby(feature)[target].transform('std'))

df_train.head(2)

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,card2_TrAmt_stat,card3_TrAmt_stat,card4_TrAmt_stat,card5_TrAmt_stat,card6_TrAmt_stat,addr1_TrAmt_stat,addr2_TrAmt_stat,card1_card2_TrAmt_stat,card1_card2_card3_card5_TrAmt_stat,card1_card2_card3_card5_addr1_addr2_TrAmt_stat
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,117.377016,104.090544,110.702963,53.972697,103.499965,116.430281,103.99344,25.328146,25.328146,
2987001,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,43.191368,44.06753,46.904221,44.445038,43.817504,43.826863,44.02642,54.066649,54.066649,35.204208


*****************

**Задание 5:** Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [31]:
target = 'D15'

# ЕСЛИ правильно понята фраза отношение к статистике
for feature in features_agg:
    df_train[f'{feature}_TrD15_stat'] = df_train[target]/(df_train.groupby(feature)[target].transform('mean') / \
                                        df_train.groupby(feature)[target].transform('std'))
df_train.head(2)

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,card2_TrAmt_stat,card3_TrAmt_stat,card4_TrAmt_stat,card5_TrAmt_stat,card6_TrAmt_stat,addr1_TrAmt_stat,addr2_TrAmt_stat,card1_card2_TrAmt_stat,card1_card2_card3_card5_TrAmt_stat,card1_card2_card3_card5_addr1_addr2_TrAmt_stat
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2987001,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


******************

**Задание 6:** выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака. После создать отдельных признак - логарифм от TransactionAmt

In [32]:
def new_feature(df, target):
    df[f'{target}_int'] = df[target].astype(int)
    df[f'{target}_frac'] = df[target] - df[f'{target}_int']
    df[f'{target}_log'] = np.log(df[target])
    return df

In [33]:
df_train = new_feature(X_train, 'TransactionAmt')
df_train.head(2)

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V333,V334,V335,V336,V337,V338,V339,TransactionAmt_int,TransactionAmt_frac,TransactionAmt_log
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,,,,,,,,68,0.5,4.226834
2987001,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,,,,,,,,29,0.0,3.367296
