# AMEX

## Libraries and functions

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from scipy.stats import zscore
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)
import warnings 
warnings.filterwarnings("ignore")
#Generic libraries
import random
import math
import itertools

#Time and monitoring libraries
import time
from tqdm import tqdm
import pyprind

#Visual libraries
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
import matplotlib.dates as mdates
import seaborn as sns

#Processing data
from sklearn.preprocessing import RobustScaler , MinMaxScaler , power_transform
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import MissingIndicator

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import sys
import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-parquet/test_data.parquet
/kaggle/input/amex-parquet/train_data.parquet


In [2]:
def scaler(df, features):
    """Power-transform the selected columns of ``df`` and return ``df``.

    The columns named in ``features`` are passed as a plain array to
    scikit-learn's ``power_transform`` (called with its default arguments)
    and the result is written back over the same columns, so the input
    frame itself is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to transform.
    features : list-like of column labels
        Columns to transform.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``features`` overwritten.
    """
    raw_values = df[features].values
    df[features] = power_transform(raw_values)
    return df
    
def missingSW(df, features, drop = False):
    """Add a 0/1 missing-value indicator column for each requested feature.

    For every column ``c`` in ``features`` a new column ``f"{c}M"`` is added
    to ``df`` holding 1 where ``c`` is missing and 0 otherwise. The
    indicators are written into ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; it is modified by adding the indicator columns.
    features : list-like of column labels
        Columns to build indicators for. May be empty, in which case no
        indicator columns are added.
    drop : bool, default False
        When True, the source ``features`` columns are removed from the
        returned frame, leaving only their indicators.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``drop`` is False; otherwise a new frame without
        the ``features`` columns.
    """
    # Materialize once: callers pass NumPy arrays of labels, whose truth
    # value is ambiguous and which we iterate more than once.
    features = list(features)
    names = [f"{j}M" for j in features]
    # Build the indicators with vectorized pandas calls. This always yields
    # exactly one indicator per requested feature, even for a column with no
    # missing values, and avoids a per-element Python callback.
    for source, indicator in zip(features, names):
        df[indicator] = df[source].isna().astype("int64")
    if drop:
        df = df.drop(features, axis = 1)
    return df

def imputerLR(df, features, seed = 95):
    """Impute missing values in ``features`` with scikit-learn's IterativeImputer.

    The imputer is fitted on, and applied to, the ``features`` columns only.
    The imputed values are stored in new columns named ``f"{c}MLr"`` and the
    original ``features`` columns are dropped from the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to impute; the new columns are written
        into it before the originals are dropped.
    features : list-like of column labels
        Columns to impute. If empty, ``df`` is returned unchanged.
    seed : int, default 95
        Passed to ``IterativeImputer(random_state=...)``.

    Returns
    -------
    pandas.DataFrame
        A frame without ``features`` and with the imputed ``*MLr`` columns.
    """
    features = list(features)
    if not features:
        # Nothing to impute; fitting on zero columns would fail.
        return df
    names = [f"{x}MLr" for x in features]
    imp_mean = IterativeImputer(random_state=seed)
    # Fit and transform on the SAME column subset. Fitting on the whole frame
    # and transforming only ``features`` makes scikit-learn reject the call
    # because the number of features differs between fit and transform.
    # NOTE(review): with default settings the imputer may drop columns that
    # are entirely missing at fit time, which would break the column count
    # here; confirm that callers never pass all-NaN columns.
    df[names] = imp_mean.fit_transform(df[features])
    df = df.drop(features, axis = 1)
    return df

## Load data and group features

In [3]:
# Column names flagged as categorical features.
categorical = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# Name of the label column.
target = "target"
# Name of the column holding the statement dates.
dates = "S_2"
# Random seed; same value as the default of imputerLR.
seed = 95

# We use the parquet version of the dataset created by @odins0n for data exploration.
# Parquet is faster to read and write, and it preserves the dtype of each column.
# `%time` is an IPython line magic: it reports how long the load takes.
%time train = pd.read_parquet("/kaggle/input/amex-parquet/train_data.parquet")
# Blank line to separate the timing report from the frame summary below.
print()
# Summary of the loaded frame: row count, column count, dtypes and memory use.
train.info()

CPU times: user 10.4 s, sys: 13.5 s, total: 23.9 s
Wall time: 38.3 s

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Columns: 191 entries, customer_ID to target
dtypes: float32(185), int64(2), object(4)
memory usage: 4.1+ GB


In [4]:
# Share of missing values per column, highest first.
nan_share = train.isna().mean().sort_values(ascending = False)

# Split the column names into three buckets by missing share:
#   hugeMissing: more than 95% missing
#   alotMissing: more than 25% and at most 95% missing
#   lessMissing: at most 25% missing
is_huge = nan_share > 0.95
is_less = nan_share <= 0.25
hugeMissing = nan_share[is_huge].index.values
alotMissing = nan_share[~is_huge & ~is_less].index.values
lessMissing = nan_share[is_less].index.values

# The helper objects are no longer needed; release them before the heavy steps.
del nan_share, is_huge, is_less
gc.collect()

0

## Feature engineering

* Huge missing values -> More than 95% missing: drop the column.
* A lot of missing values -> More than 25% and at most 95% missing: add a 0/1 missing-indicator (SW) column, then drop the original column.
* Scaler -> Power transformer for the numerical features.
* Missing values -> At most 25% missing: impute with LR, because LR is more robust against overfitting.
* Group customers and extract as much information as possible
    1. Take the duration: max - min dates
    2. Min, max, mean, std, most frequent, etc. of the features
    3. Rolling values over the last 7 days, 1 month, 3 months and 6 months
    4. 

In [5]:
# 1. Drop every column in the "huge missing" bucket (more than 95% NaN).
train.drop(columns = hugeMissing, inplace = True)
gc.collect()

21

In [6]:
# 2. Replace each "a lot missing" column with its 0/1 missing indicator.
train = missingSW(train, features = alotMissing, drop = True)
gc.collect()

21

In [7]:
# Preview the first five rows after dropping columns and adding the indicators.
train.head(n = 5)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,B_6,B_7,B_8,D_51,B_9,R_3,D_52,P_3,B_10,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,B_13,R_5,D_58,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_18,B_19,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,P_4,D_74,D_75,B_24,R_7,B_25,B_26,D_78,D_79,R_8,S_16,D_80,R_10,R_11,B_27,D_81,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_30,S_18,D_86,R_17,R_18,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_107,B_36,B_37,R_27,B_38,D_109,D_112,B_40,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,D_130,D_131,D_133,R_28,D_139,D_140,D_141,D_143,D_144,D_145,target,R_9M,B_29M,D_106M,D_132M,D_49M,R_26M,D_76M,D_66M,D_42M,D_142M,D_53M,D_82M,D_50M,B_17M,D_105M,D_56M,S_9M,D_77M,D_43M,S_27M
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,0.00063,0.080986,0.708906,0.1706,0.006204,0.358587,0.525351,0.255736,0.063902,0.059416,0.006466,1.335856,0.008207,0.001423,0.207334,0.736463,0.096219,0.023381,0.002768,0.008322,1.001519,0.008298,0.161345,0.148266,0.922998,0.354596,0.118075,0.001882,0.158612,0.018385,0.063646,0.199617,0.308233,0.016361,0.401619,0.091071,CR,O,0.007126,0.007665,0.652984,0.00852,0.00473,6.0,0.272008,0.008363,0.515222,0.002644,0.009013,0.004808,0.008342,0.119403,0.004802,0.108271,0.050882,0.007554,0.080422,0.069067,0.004327,0.007562,0.007729,0.000272,0.001576,0.004239,0.001434,0.002271,0.004061,0.007121,0.002456,0.00231,0.003532,0.008033,1.009825,0.084683,0.00382,0.007043,0.000438,0.006452,0.00083,0.005055,0.0,0.00572,0.007084,0.000198,0.008907,1,0.002537,0.005177,0.006626,0.009705,0.007782,0.00245,1.001101,0.002665,0.007479,0.006893,1.503673,1.006133,0.003569,0.008871,0.00395,0.003647,0.00495,0.89409,0.135561,0.911191,0.974539,0.001243,0.766688,1.008691,1.004587,0.670041,0.009968,0.004572,1.008949,2.0,0.004326,1.007336,0.21006,0.007871,1.0,0.23825,0.0,4.0,0.23212,0.236266,0.0,0.70228,0.434345,0.003057,0.686516,0.00874,1.0,1.003319,1.007819,1.00008,0.006805,0.002052,0.005972,0.004345,0.001535,0.002427,0.003706,0.003818,0.000569,0.00061,0.002674,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,1,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,0.002526,0.069419,0.712795,0.113239,0.006206,0.35363,0.521311,0.223329,0.065261,0.057744,0.001614,1.339794,0.008373,0.001984,0.202778,0.720886,0.099804,0.030599,0.002749,0.002482,1.009033,0.005136,0.140951,0.14353,0.919414,0.326757,0.118737,0.00161,0.148459,0.013035,0.065501,0.151387,0.265026,0.017688,0.406326,0.086805,CR,O,0.002413,0.007148,0.647093,0.002238,0.003879,6.0,0.18897,0.00403,0.509048,0.004193,0.007842,0.001283,0.006524,0.140611,9.4e-05,0.101018,0.040469,0.004832,0.081413,0.074166,0.004203,0.005304,0.001864,0.000979,0.009896,0.007597,0.000509,0.00981,0.000127,0.005966,0.000395,0.001327,0.007773,0.00076,1.009461,0.081843,0.000347,0.007789,0.004311,0.002332,0.009469,0.003753,0.0,0.007584,0.006677,0.001142,0.005907,1,0.008427,0.008979,0.001854,0.009924,0.005987,0.002247,1.006779,0.002508,0.006827,0.002837,1.503577,1.005791,0.000571,0.000391,0.008351,0.00885,0.00318,0.902135,0.136333,0.919876,0.975625,0.004561,0.786007,1.000084,1.004118,0.668647,0.003921,0.004654,1.003205,2.0,0.008707,1.007653,0.184093,0.003444,1.0,0.247217,0.0,4.0,0.243532,0.241885,0.0,0.707017,0.430501,0.001306,0.686414,0.000755,1.0,1.008394,1.004333,1.008344,0.004407,0.001034,0.004838,0.007495,0.004931,0.003954,0.003167,0.005032,0.009576,0.005492,0.009217,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,1,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,0.007605,0.068839,0.720884,0.060492,0.003259,0.33465,0.524568,0.189424,0.066982,0.056647,0.005126,1.337179,0.009355,0.007426,0.206629,0.738044,0.134073,0.048367,0.010077,0.00053,1.009184,0.006961,0.112229,0.137014,1.001977,0.304124,0.114534,0.006328,0.139504,0.056653,0.070607,0.305883,0.212165,0.063955,0.406768,0.094001,CR,O,0.001878,0.003636,0.645819,0.000408,0.004578,6.0,0.495308,0.006838,0.679257,0.001337,0.006025,0.009393,0.002615,0.075868,0.007152,0.103239,0.047454,0.006561,0.078891,0.07651,0.001782,0.001422,0.005419,0.006149,0.009629,0.003094,0.008295,0.009362,0.000954,0.005447,0.007345,0.007624,0.008811,0.004056,1.004291,0.081954,0.002709,0.004093,0.007139,0.008358,0.002325,0.007381,0.0,0.005901,0.001185,0.008013,0.008882,1,0.007327,0.002016,0.008686,0.008446,0.007291,0.007794,1.001014,0.009634,0.00982,0.00508,1.503359,1.005801,0.007425,0.009234,0.002471,0.009769,0.005433,0.939654,0.134938,0.958699,0.974067,0.011736,0.80684,1.003014,1.009285,0.670901,0.001264,0.019176,1.000754,2.0,0.004092,1.004312,0.154837,0.003269,1.0,0.239867,0.0,4.0,0.240768,0.23971,0.0,0.704843,0.434409,0.003954,0.690101,0.009617,1.0,1.009307,1.007831,1.006878,0.003221,0.005681,0.005497,0.009227,0.009123,0.003269,0.007329,0.000427,0.003429,0.006986,0.002603,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,1,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,0.006406,0.05563,0.723997,0.166782,0.009918,0.323271,0.530929,0.135586,0.08372,0.049253,0.001418,1.339909,0.006782,0.003515,0.208214,0.741813,0.134437,0.030063,0.009667,0.000783,1.007455,0.008706,0.102838,0.129017,0.704016,0.275055,0.12074,0.00498,0.1381,0.012498,0.065926,0.273553,0.2043,0.022732,0.405175,0.094854,CR,O,0.005899,0.005896,0.654358,0.005897,0.005207,6.0,0.50867,0.008183,0.515282,0.008716,0.005271,0.004554,0.002052,0.150209,0.005364,0.206394,0.031705,0.009559,0.07749,0.071547,0.005595,0.006363,0.000646,0.009193,0.008568,0.003895,0.005153,0.004876,0.005665,0.001888,0.004961,3.4e-05,0.004652,0.006969,1.004728,0.060634,0.009982,0.008817,0.00869,0.007364,0.005924,0.008802,0.0,0.00252,0.003324,0.009455,0.008348,1,0.007053,0.003909,0.002478,0.006614,0.009977,0.007686,1.002775,0.007791,0.000458,0.00732,1.503701,1.007036,0.000664,0.0032,0.008507,0.004858,6.3e-05,0.913205,0.140058,0.926341,0.975499,0.007571,0.808214,1.001517,1.004514,0.67262,0.002729,0.01172,1.005338,2.0,0.009703,1.002538,0.153939,5.3e-05,1.0,0.24091,0.0,4.0,0.2394,0.240727,0.0,0.711546,0.436903,0.005135,0.687779,0.004649,1.0,1.001671,1.00346,1.007573,0.007703,0.007108,0.008261,0.007206,0.002409,0.006117,0.004516,0.0032,0.008419,0.006527,0.0096,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,1,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,0.007731,0.038862,0.720619,0.14363,0.006667,0.231009,0.529305,,0.0759,0.048918,0.001199,1.341735,0.000519,0.001362,0.205468,0.691986,0.121518,0.054221,0.009484,0.006698,1.003738,0.003846,0.094311,0.129539,0.917133,0.23111,0.095178,0.001653,0.126443,0.027897,0.063697,0.233103,0.175655,0.031171,0.48746,0.093915,CR,O,0.009479,0.001714,0.650112,0.007773,0.005851,6.0,0.216507,0.008605,0.507712,0.006821,0.000152,0.000104,0.001419,0.096441,0.007972,0.10602,0.032733,0.008156,0.076561,0.074432,0.004933,0.004831,0.001833,0.005738,0.003289,0.002608,0.007338,0.007447,0.004465,0.006111,0.002246,0.002109,0.001141,0.00177,1.000904,0.062492,0.00586,0.001845,0.007816,0.00247,0.005516,0.007166,0.0,0.000155,0.001504,0.002019,0.002678,1,0.007728,0.003432,0.002199,0.005511,0.004105,0.009656,1.006536,0.005158,0.003341,0.000264,1.509905,1.002915,0.003079,0.003845,0.00719,0.002983,0.000535,0.921026,0.13162,0.933479,0.978027,0.0182,0.822281,1.006125,1.005735,0.673869,0.009998,0.017598,1.003175,2.0,0.00912,1.00013,0.120717,0.008724,1.0,0.247939,0.0,4.0,0.244199,0.242325,0.0,0.705343,0.437433,0.002849,0.688774,9.7e-05,1.0,1.009886,1.005053,1.008132,0.009823,0.00968,0.004848,0.006312,0.004462,0.003671,0.004946,0.008889,0.00167,0.008126,0.009827,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,1,0
