In [1]:
import pandas as pd
import numpy as np

In [2]:
numeric = "../input_orig/train_numeric.csv"
date = "../input_orig/train_date.csv"
categorical = "../input_orig/train_categorical.csv"

numeric_te = "../input_orig/test_numeric.csv"
date_te = "../input_orig/test_date.csv"
categorical_te = "../input_orig/test_categorical.csv"

In [3]:
def orgainize(features):
    line_features = {}
    station_features = {}
    lines = set([f.split('_')[0] for f in features])
    stations = set([f.split('_')[1] for f in features])
    
    for l in lines:
        line_features[l] = [f for f in features if l+'_' in f]
        
    for s in stations:
        station_features[s] = [f for f in features if s+'_' in f]
            
    return line_features, station_features

In [4]:
features_num = pd.read_csv(numeric, nrows=1).drop(['Response', 'Id'], axis=1).columns.values
line_features_num, station_features_num = orgainize(features_num)
print("Features in Station 32: {}".format( station_features_num['S32'] ))

Features in Station 32: ['L3_S32_F3850']


In [5]:
features_date = pd.read_csv(date, nrows=1).drop(['Id'], axis=1).columns.values
line_features_date, station_features_date = orgainize(features_date)
print("Features in Station 32: {}".format( station_features_date['S32'] ))

Features in Station 32: ['L3_S32_D3852']


In [6]:
features_cat = pd.read_csv(categorical, nrows=1).drop(['Id'], axis=1).columns.values
line_features_cat, station_features_cat = orgainize(features_cat)
print("Features in Station 32: {}".format( station_features_cat['S32'] ))

Features in Station 32: ['L3_S32_F3851', 'L3_S32_F3853', 'L3_S32_F3854']


In [7]:
CHUNKSIZE = 50000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_num(df):
    new_features = pd.DataFrame({})
    for s in station_features_num.keys():
        station_data = df[station_features_num[s]]
        col = s+'_nonNaN_num'
        new_features[col] = station_data.count(axis=1).astype(np.int16)
    return new_features

tr_station_nonNaN_num = pd.DataFrame()
for tr in pd.read_csv(numeric, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(tr[feats])
    tr_station_nonNaN_num = tr_station_nonNaN_num.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_station_nonNaN_num = pd.DataFrame()
for te in pd.read_csv(numeric_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(te[feats])
    te_station_nonNaN_num = te_station_nonNaN_num.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [8]:
tr_station_nonNaN_num.head()

Unnamed: 0,S36_nonNaN_num,S16_nonNaN_num,S19_nonNaN_num,S23_nonNaN_num,S33_nonNaN_num,S7_nonNaN_num,S2_nonNaN_num,S48_nonNaN_num,S1_nonNaN_num,S49_nonNaN_num,...,S25_nonNaN_num,S40_nonNaN_num,S31_nonNaN_num,S8_nonNaN_num,S0_nonNaN_num,S45_nonNaN_num,S38_nonNaN_num,S32_nonNaN_num,S6_nonNaN_num,S35_nonNaN_num
0,0,0,0,0,10,3,9,0,2,0,...,0,0,4,3,12,0,0,0,0,8
1,0,2,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
2,0,0,0,0,10,0,9,0,2,0,...,0,0,0,3,12,0,0,0,3,8
3,8,0,0,0,10,3,9,0,2,0,...,0,0,0,3,12,0,0,0,0,0
4,8,0,0,0,10,3,0,0,2,0,...,0,0,0,3,12,0,0,0,0,0


In [9]:
te_station_nonNaN_num.head()

Unnamed: 0,S36_nonNaN_num,S16_nonNaN_num,S19_nonNaN_num,S23_nonNaN_num,S33_nonNaN_num,S7_nonNaN_num,S2_nonNaN_num,S48_nonNaN_num,S1_nonNaN_num,S49_nonNaN_num,...,S25_nonNaN_num,S40_nonNaN_num,S31_nonNaN_num,S8_nonNaN_num,S0_nonNaN_num,S45_nonNaN_num,S38_nonNaN_num,S32_nonNaN_num,S6_nonNaN_num,S35_nonNaN_num
0,0,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
1,8,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
3,8,0,0,0,10,3,9,0,2,0,...,0,0,0,3,12,0,0,0,0,0
4,0,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8


In [10]:
te_station_nonNaN_num.shape, tr_station_nonNaN_num.shape

((1183748, 50), (1183747, 50))

In [11]:
CHUNKSIZE = 50000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_date(df):
    new_features = pd.DataFrame({})
    for s in station_features_date.keys():
        station_data = df[station_features_date[s]]
        col = s+'_nonNaN_date'
        new_features[col] = station_data.count(axis=1).astype(np.int16)
    return new_features

tr_station_nonNaN_date = pd.DataFrame()
for tr in pd.read_csv(date, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_date(tr[feats])
    tr_station_nonNaN_date = tr_station_nonNaN_date.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_station_nonNaN_date = pd.DataFrame()
for te in pd.read_csv(date_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_date(te[feats])
    te_station_nonNaN_date = te_station_nonNaN_date.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [12]:
tr_station_nonNaN_date.head()

Unnamed: 0,S36_nonNaN_date,S16_nonNaN_date,S19_nonNaN_date,S23_nonNaN_date,S33_nonNaN_date,S7_nonNaN_date,S2_nonNaN_date,S48_nonNaN_date,S1_nonNaN_date,S49_nonNaN_date,...,S40_nonNaN_date,S31_nonNaN_date,S8_nonNaN_date,S0_nonNaN_date,S45_nonNaN_date,S38_nonNaN_date,S32_nonNaN_date,S42_nonNaN_date,S6_nonNaN_date,S35_nonNaN_date
0,0,0,0,0,10,5,9,0,2,0,...,0,4,4,12,0,0,0,0,0,8
1,0,2,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
2,0,0,0,0,10,0,9,0,2,0,...,0,0,4,12,0,0,0,0,5,8
3,8,0,0,0,10,5,9,0,2,0,...,0,0,4,12,0,0,0,0,0,0
4,8,0,0,0,10,5,0,0,2,0,...,0,0,4,12,0,0,0,0,0,0


In [13]:
te_station_nonNaN_date.head()

Unnamed: 0,S36_nonNaN_date,S16_nonNaN_date,S19_nonNaN_date,S23_nonNaN_date,S33_nonNaN_date,S7_nonNaN_date,S2_nonNaN_date,S48_nonNaN_date,S1_nonNaN_date,S49_nonNaN_date,...,S40_nonNaN_date,S31_nonNaN_date,S8_nonNaN_date,S0_nonNaN_date,S45_nonNaN_date,S38_nonNaN_date,S32_nonNaN_date,S42_nonNaN_date,S6_nonNaN_date,S35_nonNaN_date
0,0,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
1,8,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
3,8,0,0,0,10,5,9,0,2,0,...,0,0,4,12,0,0,0,0,0,0
4,0,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8


In [14]:
tr_station_nonNaN_date.shape, te_station_nonNaN_date.shape

((1183747, 52), (1183748, 52))

In [15]:
import warnings
warnings.filterwarnings('ignore')

CHUNKSIZE = 10000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_cat(df):
    new_features = pd.DataFrame({})
    for s in station_features_cat.keys():
        station_data = df[station_features_cat[s]]
        col = s+'_nonNaN_cat'
        new_features[col] = station_data.count(axis=1).astype(np.int16)
    return new_features

tr_station_nonNaN_cat = pd.DataFrame()
for tr in pd.read_csv(categorical, chunksize=CHUNKSIZE):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_cat(tr[feats])
    tr_station_nonNaN_cat = tr_station_nonNaN_cat.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_station_nonNaN_cat = pd.DataFrame()
for te in pd.read_csv(categorical_te, chunksize=CHUNKSIZE):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_cat(te[feats])
    te_station_nonNaN_cat = te_station_nonNaN_cat.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [16]:
tr_station_nonNaN_cat.head()

Unnamed: 0,S36_nonNaN_cat,S16_nonNaN_cat,S32_nonNaN_cat,S10_nonNaN_cat,S23_nonNaN_cat,S26_nonNaN_cat,S2_nonNaN_cat,S44_nonNaN_cat,S22_nonNaN_cat,S47_nonNaN_cat,...,S46_nonNaN_cat,S14_nonNaN_cat,S21_nonNaN_cat,S29_nonNaN_cat,S3_nonNaN_cat,S15_nonNaN_cat,S31_nonNaN_cat,S42_nonNaN_cat,S6_nonNaN_cat,S35_nonNaN_cat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,63,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,63,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,63,0,0,0,0,0,0


In [17]:
te_station_nonNaN_cat.head()

Unnamed: 0,S36_nonNaN_cat,S16_nonNaN_cat,S32_nonNaN_cat,S10_nonNaN_cat,S23_nonNaN_cat,S26_nonNaN_cat,S2_nonNaN_cat,S44_nonNaN_cat,S22_nonNaN_cat,S47_nonNaN_cat,...,S46_nonNaN_cat,S14_nonNaN_cat,S21_nonNaN_cat,S29_nonNaN_cat,S3_nonNaN_cat,S15_nonNaN_cat,S31_nonNaN_cat,S42_nonNaN_cat,S6_nonNaN_cat,S35_nonNaN_cat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,63,0,0,0,0,0,0
2,0,0,0,0,0,27,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,27,0,0,0,0,...,0,0,0,63,0,0,0,0,0,0
4,0,0,0,0,0,27,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
tr_station_nonNaN_cat.shape, te_station_nonNaN_cat.shape

((1183747, 34), (1183748, 34))

In [19]:
tr_station_nonNaN = pd.concat([tr_station_nonNaN_num,tr_station_nonNaN_date,tr_station_nonNaN_cat],axis=1)
te_station_nonNaN = pd.concat([te_station_nonNaN_num,te_station_nonNaN_date,te_station_nonNaN_cat],axis=1)

In [20]:
#tr_station_nonNaN.to_hdf('tr_station_nonNaN6.hdf5','table')
#te_station_nonNaN.to_hdf('te_station_nonNaN6.hdf5','table')

In [22]:
CHUNKSIZE = 50000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_num(df):
    new_features = pd.DataFrame({})
    for s in line_features_num.keys():
        line_data = df[line_features_num[s]]
        col = s+'_nonNaN_num'
        new_features[col] = line_data.count(axis=1).astype(np.int16)
    return new_features

tr_line_nonNaN_num = pd.DataFrame()
for tr in pd.read_csv(numeric, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(tr[feats])
    tr_line_nonNaN_num = tr_line_nonNaN_num.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_line_nonNaN_num = pd.DataFrame()
for te in pd.read_csv(numeric_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(te[feats])
    te_line_nonNaN_num = te_line_nonNaN_num.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [23]:
CHUNKSIZE = 50000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_date(df):
    new_features = pd.DataFrame({})
    for s in line_features_date.keys():
        line_data = df[line_features_date[s]]
        col = s+'_nonNaN_date'
        new_features[col] = line_data.count(axis=1).astype(np.int16)
    return new_features

tr_line_nonNaN_date = pd.DataFrame()
for tr in pd.read_csv(date, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_date(tr[feats])
    tr_line_nonNaN_date = tr_line_nonNaN_date.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_line_nonNaN_date = pd.DataFrame()
for te in pd.read_csv(date_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_date(te[feats])
    te_line_nonNaN_date = te_line_nonNaN_date.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [24]:
import warnings
warnings.filterwarnings('ignore')

CHUNKSIZE = 10000
NROWS_TR = 1183747
nrows_tr = 0
NROWS_TE = 1183748
nrows_te = 0
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0

def make_features_cat(df):
    new_features = pd.DataFrame({})
    for s in line_features_cat.keys():
        line_data = df[line_features_cat[s]]
        col = s+'_nonNaN_cat'
        new_features[col] = line_data.count(axis=1).astype(np.int16)
    return new_features

tr_line_nonNaN_cat = pd.DataFrame()
for tr in pd.read_csv(categorical, chunksize=CHUNKSIZE):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_cat(tr[feats])
    tr_line_nonNaN_cat = tr_line_nonNaN_cat.append(chunk_features)
    
    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break
        
te_line_nonNaN_cat = pd.DataFrame()
for te in pd.read_csv(categorical_te, chunksize=CHUNKSIZE):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_cat(te[feats])
    te_line_nonNaN_cat = te_line_nonNaN_cat.append(chunk_features)
    
    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [25]:
tr_line_nonNaN = pd.concat([tr_line_nonNaN_num,tr_line_nonNaN_date,tr_line_nonNaN_cat],axis=1)
te_line_nonNaN = pd.concat([te_line_nonNaN_num,te_line_nonNaN_date,te_line_nonNaN_cat],axis=1)

In [None]:
#tr_line_nonNaN.to_hdf('tr_line_nonNaN6.hdf5','table')
#te_line_nonNaN.to_hdf('te_line_nonNaN6.hdf5','table')

In [26]:
tr_line_station_nonNaN = pd.concat([tr_station_nonNaN,tr_line_nonNaN],axis=1)
te_line_station_nonNaN = pd.concat([te_station_nonNaN,te_line_nonNaN],axis=1)

In [27]:
tr_line_station_nonNaN.to_hdf('tr_line_station_nonNaN6.hdf5','table')
te_line_station_nonNaN.to_hdf('te_line_station_nonNaN6.hdf5','table')

In [28]:
tr_line_station_nonNaN.head()

Unnamed: 0,S36_nonNaN_num,S16_nonNaN_num,S19_nonNaN_num,S23_nonNaN_num,S33_nonNaN_num,S7_nonNaN_num,S2_nonNaN_num,S48_nonNaN_num,S1_nonNaN_num,S49_nonNaN_num,...,L2_nonNaN_num,L3_nonNaN_num,L1_nonNaN_date,L0_nonNaN_date,L2_nonNaN_date,L3_nonNaN_date,L1_nonNaN_cat,L0_nonNaN_cat,L2_nonNaN_cat,L3_nonNaN_cat
0,0,0,0,0,10,3,9,0,2,0,...,0,119,0,47,0,132,0,0,0,0
1,0,2,0,0,10,0,0,0,0,0,...,0,147,0,49,0,160,0,0,0,0
2,0,0,0,0,10,0,9,0,2,0,...,0,147,0,47,0,160,0,0,0,63
3,8,0,0,0,10,3,9,0,2,0,...,0,147,0,47,0,160,0,0,0,63
4,8,0,0,0,10,3,0,0,2,0,...,0,147,0,47,0,160,0,0,0,63
