In [1]:
import pandas as pd
import numpy as np

In [2]:
numeric = "../input_orig/train_numeric.csv"
date = "../input_orig/train_date.csv"
categorical = "../input_orig/train_categorical.csv"

numeric_te = "../input_orig/test_numeric.csv"
date_te = "../input_orig/test_date.csv"
categorical_te = "../input_orig/test_categorical.csv"

In [3]:
def orgainize(features):
    line_features = {}
    station_features = {}
    lines = set([f.split('_')[0] for f in features])
    stations = set([f.split('_')[1] for f in features])
    
    for l in lines:
        line_features[l] = [f for f in features if l+'_' in f]
        
    for s in stations:
        station_features[s] = [f for f in features if s+'_' in f]
            
    return line_features, station_features

In [4]:
features_num = pd.read_csv(numeric, nrows=1).drop(['Response', 'Id'], axis=1).columns.values
line_features_num, station_features_num = orgainize(features_num)
print("Features in Station 32: {}".format( station_features_num['S32'] ))

Features in Station 32: ['L3_S32_F3850']


In [5]:
features_date = pd.read_csv(date, nrows=1).drop(['Id'], axis=1).columns.values
line_features_date, station_features_date = orgainize(features_date)
print("Features in Station 32: {}".format( station_features_date['S32'] ))

Features in Station 32: ['L3_S32_D3852']


In [6]:
features_cat = pd.read_csv(categorical, nrows=1).drop(['Id'], axis=1).columns.values
line_features_cat, station_features_cat = orgainize(features_cat)
print("Features in Station 32: {}".format( station_features_cat['S32'] ))

Features in Station 32: ['L3_S32_F3851', 'L3_S32_F3853', 'L3_S32_F3854']


In [7]:
CHUNKSIZE = 50000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_num(df):
    new_features = pd.DataFrame({})
    for s in station_features_num.keys():
        station_data = df[station_features_num[s]]
        col = s+'_pass_num'
        new_features[col] = ((station_data.count(axis=1))>0).astype(np.int8)
    return new_features

tr_station_pass_num = pd.DataFrame()
for tr in pd.read_csv(numeric, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(tr[feats])
    tr_station_pass_num = tr_station_pass_num.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_station_pass_num = pd.DataFrame()
for te in pd.read_csv(numeric_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(te[feats])
    te_station_pass_num = te_station_pass_num.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [8]:
tr_station_pass_num.head()

Unnamed: 0,S36_pass_num,S12_pass_num,S3_pass_num,S43_pass_num,S14_pass_num,S44_pass_num,S7_pass_num,S29_pass_num,S41_pass_num,S6_pass_num,...,S21_pass_num,S33_pass_num,S17_pass_num,S18_pass_num,S16_pass_num,S27_pass_num,S34_pass_num,S10_pass_num,S38_pass_num,S40_pass_num
0,0,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0
1,0,1,0,0,1,0,0,1,0,0,...,1,1,0,1,1,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,1,0,0
3,1,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,1,1,0,0
4,1,0,1,0,0,0,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [9]:
te_station_pass_num.head()

Unnamed: 0,S36_pass_num,S12_pass_num,S3_pass_num,S43_pass_num,S14_pass_num,S44_pass_num,S7_pass_num,S29_pass_num,S41_pass_num,S6_pass_num,...,S21_pass_num,S33_pass_num,S17_pass_num,S18_pass_num,S16_pass_num,S27_pass_num,S34_pass_num,S10_pass_num,S38_pass_num,S40_pass_num
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,1,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [10]:
tr_station_pass_num.shape, te_station_pass_num.shape

((1183747, 50), (1183748, 50))

In [11]:
tr_station_pass_num.to_hdf('feats/tr_station_pass_num.hdf5','table')
te_station_pass_num.to_hdf('feats/te_station_pass_num.hdf5','table')

In [12]:
CHUNKSIZE = 50000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_date(df):
    new_features = pd.DataFrame({})
    for s in station_features_date.keys():
        station_data = df[station_features_date[s]]
        col = s+'_pass_date'
        new_features[col] = ((station_data.count(axis=1))>0).astype(np.int8)
    return new_features

tr_station_pass_date = pd.DataFrame()
for tr in pd.read_csv(date, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_date(tr[feats])
    tr_station_pass_date = tr_station_pass_date.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_station_pass_date = pd.DataFrame()
for te in pd.read_csv(date_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_date(te[feats])
    te_station_pass_date = te_station_pass_date.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [13]:
tr_station_pass_date.head()

Unnamed: 0,S36_pass_date,S12_pass_date,S3_pass_date,S43_pass_date,S14_pass_date,S44_pass_date,S7_pass_date,S29_pass_date,S41_pass_date,S6_pass_date,...,S33_pass_date,S17_pass_date,S18_pass_date,S16_pass_date,S42_pass_date,S27_pass_date,S34_pass_date,S10_pass_date,S38_pass_date,S40_pass_date
0,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,1,0,0,0
1,0,1,0,0,1,0,0,1,0,0,...,1,0,1,1,0,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,1,1,0,0
3,1,0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,1,1,0,0
4,1,0,1,0,0,0,1,1,0,0,...,1,0,0,0,0,0,1,0,0,0


In [14]:
te_station_pass_date.head()

Unnamed: 0,S36_pass_date,S12_pass_date,S3_pass_date,S43_pass_date,S14_pass_date,S44_pass_date,S7_pass_date,S29_pass_date,S41_pass_date,S6_pass_date,...,S33_pass_date,S17_pass_date,S18_pass_date,S16_pass_date,S42_pass_date,S27_pass_date,S34_pass_date,S10_pass_date,S38_pass_date,S40_pass_date
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,1,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0


In [15]:
tr_station_pass_date.shape, te_station_pass_date.shape

((1183747, 52), (1183748, 52))

In [16]:
tr_station_pass_date.to_hdf('feats/tr_station_pass_date.hdf5','table')
te_station_pass_date.to_hdf('feats/te_station_pass_date.hdf5','table')

In [26]:
import warnings
warnings.filterwarnings('ignore')

CHUNKSIZE = 50000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_cat(df):
    new_features = pd.DataFrame({})
    for s in station_features_cat.keys():
        station_data = df[station_features_cat[s]]
        col = s+'_pass_cat'
        new_features[col] = ((station_data.count(axis=1))>0).astype(np.int8)
    return new_features

tr_station_pass_cat = pd.DataFrame()
for tr in pd.read_csv(categorical, chunksize=CHUNKSIZE):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_cat(tr[feats])
    tr_station_pass_cat = tr_station_pass_cat.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_station_pass_cat = pd.DataFrame()
for te in pd.read_csv(categorical_te, chunksize=CHUNKSIZE):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_cat(te[feats])
    te_station_pass_cat = te_station_pass_cat.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [27]:
tr_station_pass_cat.head()

Unnamed: 0,S36_pass_cat,S9_pass_cat,S46_pass_cat,S15_pass_cat,S26_pass_cat,S3_pass_cat,S43_pass_cat,S2_pass_cat,S23_pass_cat,S14_pass_cat,...,S18_pass_cat,S30_pass_cat,S16_pass_cat,S42_pass_cat,S27_pass_cat,S25_pass_cat,S39_pass_cat,S32_pass_cat,S38_pass_cat,S11_pass_cat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
te_station_pass_cat.head()

Unnamed: 0,S36_pass_cat,S9_pass_cat,S46_pass_cat,S15_pass_cat,S26_pass_cat,S3_pass_cat,S43_pass_cat,S2_pass_cat,S23_pass_cat,S14_pass_cat,...,S18_pass_cat,S30_pass_cat,S16_pass_cat,S42_pass_cat,S27_pass_cat,S25_pass_cat,S39_pass_cat,S32_pass_cat,S38_pass_cat,S11_pass_cat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
tr_station_pass_cat.shape, te_station_pass_cat.shape

((1183747, 34), (1183748, 34))

In [30]:
tr_station_pass_cat.to_hdf('feats/tr_station_pass_cat.hdf5','table')
te_station_pass_cat.to_hdf('feats/te_station_pass_cat.hdf5','table')

In [None]:
import warnings
#warnings.filterwarnings('ignore')
warnings.filterwarnings('default')

CHUNKSIZE = 100000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_num(df):
    new_features = pd.DataFrame({})
    for s in line_features_num.keys():
        line_data = df[line_features_num[s]]
        col = s+'_pass_num'
        new_features[col] = ((line_data.count(axis=1))>0).astype(np.int8)
    return new_features

tr_line_pass_num = pd.DataFrame()
for tr in pd.read_csv(numeric, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN,TARGET_COLUMN])
    chunk_features = make_features_num(tr[feats])
    tr_line_pass_num = tr_line_pass_num.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_line_pass_num = pd.DataFrame()
for te in pd.read_csv(numeric_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN,TARGET_COLUMN])
    chunk_features = make_features_num(te[feats])
    te_line_pass_num = te_line_pass_num.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [None]:
tr_line_pass_num.shape, te_line_pass_num.shape

tr_line_pass_num.head()

te_line_pass_num.head()

tr_line_pass_num.to_hdf('feats/tr_line_pass_num.hdf5','table')
te_line_pass_num.to_hdf('feats/te_line_pass_num.hdf5','table')

((1183747, 4), (1183748, 4))

In [13]:
import warnings
#warnings.filterwarnings('ignore')
warnings.filterwarnings('default')

CHUNKSIZE = 100000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_date(df):
    new_features = pd.DataFrame({})
    for s in line_features_date.keys():
        line_data = df[line_features_date[s]]
        col = s+'_pass_date'
        new_features[col] = ((line_data.count(axis=1))>0).astype(np.int8)
    return new_features

tr_line_pass_date = pd.DataFrame()
for tr in pd.read_csv(date, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_date(tr[feats])
    tr_line_pass_date = tr_line_pass_date.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_line_pass_date = pd.DataFrame()
for te in pd.read_csv(date_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_date(te[feats])
    te_line_pass_date = te_line_pass_date.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [14]:
tr_line_pass_date.shape, te_line_pass_date.shape

((1183747, 4), (1183748, 4))

In [15]:
tr_line_pass_date.head()

Unnamed: 0,L0_pass_date,L3_pass_date,L2_pass_date,L1_pass_date
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,1,1,0,0
4,1,1,0,0


In [16]:
te_line_pass_date.head()

Unnamed: 0,L0_pass_date,L3_pass_date,L2_pass_date,L1_pass_date
0,0,1,1,1
1,0,1,1,1
2,0,1,1,1
3,1,1,1,0
4,0,1,1,1


In [17]:
tr_line_pass_date.to_hdf('feats/tr_line_pass_date.hdf5','table')
te_line_pass_date.to_hdf('feats/te_line_pass_date.hdf5','table')

In [3]:
tr_station_pass_num = pd.read_hdf('feats/tr_station_pass_num1.hdf5','table')
tr_station_pass_date = pd.read_hdf('feats/tr_station_pass_date1.hdf5','table')
tr_station_pass_cat = pd.read_hdf('feats/tr_station_pass_cat1.hdf5','table')

te_station_pass_num = pd.read_hdf('feats/te_station_pass_num1.hdf5','table')
te_station_pass_date = pd.read_hdf('feats/te_station_pass_date1.hdf5','table')
te_station_pass_cat = pd.read_hdf('feats/te_station_pass_cat1.hdf5','table')

In [4]:
tr_station_pass = pd.concat([tr_station_pass_num,tr_station_pass_date,tr_station_pass_cat],axis=1)
te_station_pass = pd.concat([te_station_pass_num,te_station_pass_date,te_station_pass_cat],axis=1)

In [10]:
tr_station_pass.to_hdf('feats/tr_station_pass.hdf5','table')
te_station_pass.to_hdf('feats/te_station_pass.hdf5','table')

In [11]:
tr_line_pass_num = pd.read_hdf('feats/tr_line_pass_num2.hdf5','table')
tr_line_pass_date = pd.read_hdf('feats/tr_line_pass_date2.hdf5','table')

te_line_pass_num = pd.read_hdf('feats/te_line_pass_num2.hdf5','table')
te_line_pass_date = pd.read_hdf('feats/te_line_pass_date2.hdf5','table')

In [16]:
tr_line_pass = pd.concat([tr_line_pass_num,tr_line_pass_date],axis=1)
te_line_pass = pd.concat([te_line_pass_num,te_line_pass_date],axis=1)

In [20]:
tr_line_pass.to_hdf('feats/tr_line_pass2.hdf5','table')
te_line_pass.to_hdf('feats/te_line_pass2.hdf5','table')