In [1]:
import pandas as pd
import numpy as np

In [2]:
# All raw competition CSVs live in one directory; change it here only.
INPUT_DIR = "../input_orig"

# Training files (the numeric file also carries the 'Response' column).
numeric = INPUT_DIR + "/train_numeric.csv"
date = INPUT_DIR + "/train_date.csv"
categorical = INPUT_DIR + "/train_categorical.csv"

# Test files, same layout as the training files.
numeric_te = INPUT_DIR + "/test_numeric.csv"
date_te = INPUT_DIR + "/test_date.csv"
categorical_te = INPUT_DIR + "/test_categorical.csv"

In [3]:
def orgainize(features):
    """Group feature names by their line token and by their station token.

    Each name is split on ``'_'``; the first token is used as the line key
    and the second token as the station key (e.g. ``'L3_S32_F3850'`` is
    filed under line ``'L3'`` and station ``'S32'``).

    Parameters
    ----------
    features : iterable of str
        Feature (column) names.

    Returns
    -------
    tuple of (dict, dict)
        ``(line_features, station_features)``; each maps a token to the list
        of feature names carrying that token, in input order.

    Raises
    ------
    IndexError
        If a name contains no ``'_'`` and therefore has no station token.
    """
    line_features = {}
    station_features = {}

    # Single pass with exact token comparison. Matching by substring
    # (``token + '_' in name``) would also pick up names where the token is
    # merely the tail of a longer token, e.g. 'L1_' inside 'XL1_S2_F2'.
    for f in features:
        tokens = f.split('_')
        line, station = tokens[0], tokens[1]
        line_features.setdefault(line, []).append(f)
        station_features.setdefault(station, []).append(f)

    return line_features, station_features

In [4]:
# Only the header is needed, so a single data row is read; the id and target
# columns are not features and are dropped before grouping.
features_num = (
    pd.read_csv(numeric, nrows=1)
    .drop(['Response', 'Id'], axis=1)
    .columns
    .values
)
line_features_num, station_features_num = orgainize(features_num)

# Spot-check one station's numeric feature list.
print("Features in Station 32: {}".format(station_features_num['S32']))

Features in Station 32: ['L3_S32_F3850']


In [5]:
# Only the header is needed, so a single data row is read; the date file has
# no target column, so only the id column is dropped.
features_date = (
    pd.read_csv(date, nrows=1)
    .drop(['Id'], axis=1)
    .columns
    .values
)
line_features_date, station_features_date = orgainize(features_date)

# Spot-check one station's date feature list.
print("Features in Station 32: {}".format(station_features_date['S32']))

Features in Station 32: ['L3_S32_D3852']


In [6]:
# Only the header is needed, so a single data row is read; the id column is
# dropped before grouping.
features_cat = (
    pd.read_csv(categorical, nrows=1)
    .drop(['Id'], axis=1)
    .columns
    .values
)
line_features_cat, station_features_cat = orgainize(features_cat)

# Spot-check one station's categorical feature list.
print("Features in Station 32: {}".format(station_features_cat['S32']))

Features in Station 32: ['L3_S32_F3851', 'L3_S32_F3853', 'L3_S32_F3854']


In [7]:
# Rows requested from read_csv per chunk.
CHUNKSIZE = 50000

# Row-count thresholds at which the train / test read loops stop.
NROWS_TR = 1183747
NROWS_TE = 1183748

# Running row counters for the read loops; reset to zero before each pass.
nrows_tr = nrows_te = 0

# Non-feature columns removed from every chunk.
ID_COLUMN, TARGET_COLUMN = 'Id', 'Response'

# Not referenced by any of the visible cells.
SEED = 0

def make_features_num(df, feature_groups=None):
    """Sum the numeric features of each station, row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk holding (at least) every column listed in the groups.
    feature_groups : dict, optional
        Maps a group name to the list of columns to sum. Defaults to the
        module-level ``station_features_num``.

    Returns
    -------
    pandas.DataFrame
        One ``<group>_sum_num`` column per group, indexed like ``df``.
    """
    if feature_groups is None:
        feature_groups = station_features_num

    new_features = pd.DataFrame(index=df.index)
    for s, cols in feature_groups.items():
        # min_count=1 keeps NaN for rows where every feature of the group is
        # missing; without it pandas >= 0.22 returns 0.0 for such rows, which
        # is indistinguishable from a genuine zero sum.
        new_features[s + '_sum_num'] = df[cols].sum(axis=1, min_count=1)
    return new_features

# Station-level numeric sums for the training set.
# Per-chunk results are collected in a list and concatenated once:
# DataFrame.append re-copied the whole accumulated frame on every iteration
# and no longer exists in pandas 2.0.
tr_chunks = []
for tr in pd.read_csv(numeric, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(tr[feats])
    tr_chunks.append(chunk_features)

    # Count the rows actually read (the final chunk is usually short).
    nrows_tr += len(tr)
    if nrows_tr >= NROWS_TR:
        break
tr_station_sum_num = pd.concat(tr_chunks) if tr_chunks else pd.DataFrame()
del tr_chunks  # drop the per-chunk copies once the full frame exists

# Station-level numeric sums for the test set (no target column there;
# setdiff1d simply ignores names that are absent).
te_chunks = []
for te in pd.read_csv(numeric_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(te[feats])
    te_chunks.append(chunk_features)

    nrows_te += len(te)
    if nrows_te >= NROWS_TE:
        break
te_station_sum_num = pd.concat(te_chunks) if te_chunks else pd.DataFrame()
del te_chunks

In [8]:
# Preview the first rows of the station-level numeric sums (train).
tr_station_sum_num.head()

Unnamed: 0,S10_sum_num,S15_sum_num,S18_sum_num,S45_sum_num,S0_sum_num,S7_sum_num,S40_sum_num,S41_sum_num,S37_sum_num,S48_sum_num,...,S3_sum_num,S20_sum_num,S5_sum_num,S36_sum_num,S34_sum_num,S22_sum_num,S32_sum_num,S35_sum_num,S43_sum_num,S30_sum_num
0,,,,,-0.636,-0.181,,,0.0,,...,,,,,0.0,,,-0.113,,0.14
1,,,0.259,,,,,,0.0,,...,,-0.197,,,0.0,,,-0.009,,1.254
2,0.047,,,,0.352,,,,0.0,,...,,,0.065,,0.0,,,-0.097,,-0.612
3,0.052,,,,0.331,0.223,,,0.0,,...,,,,-0.444,0.0,,,,,0.581
4,,,,,0.64,-0.233,,,0.0,,...,-0.04,,,-0.018,0.0,,,,,-0.236


In [9]:
# Preview the first rows of the station-level numeric sums (test).
te_station_sum_num.head()

Unnamed: 0,S10_sum_num,S15_sum_num,S18_sum_num,S45_sum_num,S0_sum_num,S7_sum_num,S40_sum_num,S41_sum_num,S37_sum_num,S48_sum_num,...,S3_sum_num,S20_sum_num,S5_sum_num,S36_sum_num,S34_sum_num,S22_sum_num,S32_sum_num,S35_sum_num,S43_sum_num,S30_sum_num
0,,,,,,,,,0.0,,...,,,,,0.0,,,0.245,,-0.344
1,,,,,,,,,0.0,,...,,,,0.028,0.0,,,,,-0.649
2,,,,,,,,,0.0,,...,,,,,0.0,,,0.084,,0.882
3,-0.025,,,,-0.083,0.356,,,0.0,,...,,,,0.09,0.0,,,,,0.01
4,,,,,,,,,0.0,,...,,,,,0.0,,,0.097,,0.999


In [10]:
# Sanity check: (rows, columns) of the test and train station-level numeric sums.
te_station_sum_num.shape, tr_station_sum_num.shape

((1183748, 50), (1183747, 50))

In [11]:
# Rows requested from read_csv per chunk.
CHUNKSIZE = 50000

# Row-count thresholds at which the train / test read loops stop.
NROWS_TR = 1183747
NROWS_TE = 1183748

# Running row counters for the read loops; reset to zero before this pass.
nrows_tr = nrows_te = 0

# Non-feature column names (the date files are only stripped of the id).
ID_COLUMN, TARGET_COLUMN = 'Id', 'Response'

# Not referenced by any of the visible cells.
SEED = 0

def make_features_date(df, feature_groups=None):
    """Sum the date features of each station, row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk holding (at least) every column listed in the groups.
    feature_groups : dict, optional
        Maps a group name to the list of columns to sum. Defaults to the
        module-level ``station_features_date``.

    Returns
    -------
    pandas.DataFrame
        One ``<group>_sum_date`` column per group, indexed like ``df``.
    """
    if feature_groups is None:
        feature_groups = station_features_date

    new_features = pd.DataFrame(index=df.index)
    for s, cols in feature_groups.items():
        # min_count=1 keeps NaN for rows where every feature of the group is
        # missing; without it pandas >= 0.22 returns 0.0 for such rows.
        new_features[s + '_sum_date'] = df[cols].sum(axis=1, min_count=1)
    return new_features

# Station-level date sums for the training set.
# Per-chunk results are collected in a list and concatenated once:
# DataFrame.append re-copied the whole accumulated frame on every iteration
# and no longer exists in pandas 2.0.
tr_chunks = []
for tr in pd.read_csv(date, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_date(tr[feats])
    tr_chunks.append(chunk_features)

    # Count the rows actually read (the final chunk is usually short).
    nrows_tr += len(tr)
    if nrows_tr >= NROWS_TR:
        break
tr_station_sum_date = pd.concat(tr_chunks) if tr_chunks else pd.DataFrame()
del tr_chunks  # drop the per-chunk copies once the full frame exists

# Station-level date sums for the test set.
te_chunks = []
for te in pd.read_csv(date_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_date(te[feats])
    te_chunks.append(chunk_features)

    nrows_te += len(te)
    if nrows_te >= NROWS_TE:
        break
te_station_sum_date = pd.concat(te_chunks) if te_chunks else pd.DataFrame()
del te_chunks

In [12]:
# Preview the first rows of the station-level date sums (train).
tr_station_sum_date.head()

Unnamed: 0,S10_sum_date,S15_sum_date,S18_sum_date,S45_sum_date,S0_sum_date,S7_sum_date,S40_sum_date,S41_sum_date,S37_sum_date,S48_sum_date,...,S20_sum_date,S5_sum_date,S46_sum_date,S36_sum_date,S34_sum_date,S22_sum_date,S32_sum_date,S35_sum_date,S43_sum_date,S30_sum_date
0,,,,,986.879944,411.300018,,,523.73999,,...,,,,,436.399994,,,698.320007,,3141.360107
1,,,6565.75,,,,,,7894.5,,...,5252.600098,,,,6578.75,,,10526.0,,89469.625
2,21043.492188,,,,19424.398438,,,,9746.520508,,...,,3237.439941,,,8122.100098,,,12995.360352,,110459.867188
3,14939.858398,,,,13790.400391,5746.099609,,,6924.960449,,...,,,,9233.280273,5770.75,,,,,78480.851562
4,,,,,7231.680664,3013.349854,,,3636.120117,,...,,,,4848.160156,3030.050049,,,,,41208.0


In [13]:
# Preview the first rows of the station-level date sums (test).
te_station_sum_date.head()

Unnamed: 0,S10_sum_date,S15_sum_date,S18_sum_date,S45_sum_date,S0_sum_date,S7_sum_date,S40_sum_date,S41_sum_date,S37_sum_date,S48_sum_date,...,S20_sum_date,S5_sum_date,S46_sum_date,S36_sum_date,S34_sum_date,S22_sum_date,S32_sum_date,S35_sum_date,S43_sum_date,S30_sum_date
0,,,,,,,,,4804.200195,,...,,,,,4003.5,,,6405.600098,,54444.882812
1,,,,,,,,,6360.419434,,...,,,,8480.55957,5300.300293,,,,,72082.71875
2,,,,,,,,,4266.47998,,...,,,,,3555.400146,,,5688.640137,,48344.601562
3,3321.370117,,,,3065.399902,1277.400024,,,1537.680054,,...,,,,2050.23999,1281.349976,,,,,9225.0
4,,,,,,,,,4621.680176,,...,,,,,3851.400146,,,6162.240234,,52378.359375


In [14]:
# Sanity check: (rows, columns) of the train and test station-level date sums.
tr_station_sum_date.shape, te_station_sum_date.shape

((1183747, 52), (1183748, 52))

In [15]:
# Place the numeric and date station sums side by side (rows are aligned on
# the index by pd.concat).
tr_station_sum = pd.concat(
    [tr_station_sum_num, tr_station_sum_date],
    axis=1
)
te_station_sum = pd.concat(
    [te_station_sum_num, te_station_sum_date],
    axis=1
)

In [16]:
#tr_station_sum.to_hdf('tr_station_sum7.hdf5','table')
#te_station_sum.to_hdf('te_station_sum7.hdf5','table')

In [17]:
# Rows requested from read_csv per chunk.
CHUNKSIZE = 50000

# Row-count thresholds at which the train / test read loops stop.
NROWS_TR = 1183747
NROWS_TE = 1183748

# Running row counters for the read loops; reset to zero before this pass.
nrows_tr = nrows_te = 0

# Non-feature columns removed from every chunk.
ID_COLUMN, TARGET_COLUMN = 'Id', 'Response'

# Not referenced by any of the visible cells.
SEED = 0

def make_features_num(df, feature_groups=None):
    """Sum the numeric features of each production line, row by row.

    Note: this rebinds the name ``make_features_num`` that an earlier cell
    defined for station-level sums; from here on the line-level version is
    the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk holding (at least) every column listed in the groups.
    feature_groups : dict, optional
        Maps a group name to the list of columns to sum. Defaults to the
        module-level ``line_features_num``.

    Returns
    -------
    pandas.DataFrame
        One ``<group>_sum_num`` column per group, indexed like ``df``.
    """
    if feature_groups is None:
        feature_groups = line_features_num

    new_features = pd.DataFrame(index=df.index)
    for s, cols in feature_groups.items():
        # min_count=1 keeps NaN for rows where every feature of the group is
        # missing; without it pandas >= 0.22 returns 0.0 for such rows.
        new_features[s + '_sum_num'] = df[cols].sum(axis=1, min_count=1)
    return new_features

# Line-level numeric sums for the training set.
# Per-chunk results are collected in a list and concatenated once:
# DataFrame.append re-copied the whole accumulated frame on every iteration
# and no longer exists in pandas 2.0.
tr_chunks = []
for tr in pd.read_csv(numeric, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(tr[feats])
    tr_chunks.append(chunk_features)

    # Count the rows actually read (the final chunk is usually short).
    nrows_tr += len(tr)
    if nrows_tr >= NROWS_TR:
        break
tr_line_sum_num = pd.concat(tr_chunks) if tr_chunks else pd.DataFrame()
del tr_chunks  # drop the per-chunk copies once the full frame exists

# Line-level numeric sums for the test set (no target column there;
# setdiff1d simply ignores names that are absent).
te_chunks = []
for te in pd.read_csv(numeric_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN, TARGET_COLUMN])
    chunk_features = make_features_num(te[feats])
    te_chunks.append(chunk_features)

    nrows_te += len(te)
    if nrows_te >= NROWS_TE:
        break
te_line_sum_num = pd.concat(te_chunks) if te_chunks else pd.DataFrame()
del te_chunks

In [18]:
# Rows requested from read_csv per chunk.
CHUNKSIZE = 50000

# Row-count thresholds at which the train / test read loops stop.
NROWS_TR = 1183747
NROWS_TE = 1183748

# Running row counters for the read loops; reset to zero before this pass.
nrows_tr = nrows_te = 0

# Non-feature column names (the date files are only stripped of the id).
ID_COLUMN, TARGET_COLUMN = 'Id', 'Response'

# Not referenced by any of the visible cells.
SEED = 0

def make_features_date(df, feature_groups=None):
    """Sum the date features of each production line, row by row.

    Note: this rebinds the name ``make_features_date`` that an earlier cell
    defined for station-level sums; from here on the line-level version is
    the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk holding (at least) every column listed in the groups.
    feature_groups : dict, optional
        Maps a group name to the list of columns to sum. Defaults to the
        module-level ``line_features_date``.

    Returns
    -------
    pandas.DataFrame
        One ``<group>_sum_date`` column per group, indexed like ``df``.
    """
    if feature_groups is None:
        feature_groups = line_features_date

    new_features = pd.DataFrame(index=df.index)
    for s, cols in feature_groups.items():
        # min_count=1 keeps NaN for rows where every feature of the group is
        # missing; without it pandas >= 0.22 returns 0.0 for such rows.
        new_features[s + '_sum_date'] = df[cols].sum(axis=1, min_count=1)
    return new_features

# Line-level date sums for the training set.
# Per-chunk results are collected in a list and concatenated once:
# DataFrame.append re-copied the whole accumulated frame on every iteration
# and no longer exists in pandas 2.0.
tr_chunks = []
for tr in pd.read_csv(date, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    chunk_features = make_features_date(tr[feats])
    tr_chunks.append(chunk_features)

    # Count the rows actually read (the final chunk is usually short).
    nrows_tr += len(tr)
    if nrows_tr >= NROWS_TR:
        break
tr_line_sum_date = pd.concat(tr_chunks) if tr_chunks else pd.DataFrame()
del tr_chunks  # drop the per-chunk copies once the full frame exists

# Line-level date sums for the test set.
te_chunks = []
for te in pd.read_csv(date_te, chunksize=CHUNKSIZE, dtype=np.float32):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])
    chunk_features = make_features_date(te[feats])
    te_chunks.append(chunk_features)

    nrows_te += len(te)
    if nrows_te >= NROWS_TE:
        break
te_line_sum_date = pd.concat(te_chunks) if te_chunks else pd.DataFrame()
del te_chunks

In [19]:
# Place the numeric and date line sums side by side (rows are aligned on
# the index by pd.concat).
tr_line_sum = pd.concat(
    [tr_line_sum_num, tr_line_sum_date],
    axis=1
)
te_line_sum = pd.concat(
    [te_line_sum_num, te_line_sum_date],
    axis=1
)

#tr_line_sum.to_hdf('tr_line_sum7.hdf5','table')
#te_line_sum.to_hdf('te_line_sum7.hdf5','table')

In [20]:
# Final feature matrices: station-level sums followed by line-level sums,
# joined column-wise.
tr_line_station_sum = pd.concat(
    [tr_station_sum, tr_line_sum],
    axis=1
)
te_line_station_sum = pd.concat(
    [te_station_sum, te_line_sum],
    axis=1
)

In [21]:
# Persist the combined features under the HDF5 key 'table'.
# The key is passed by keyword: positional arguments after the path are
# deprecated in recent pandas and become keyword-only in pandas 3.0.
# Assumes the 'feats/' directory already exists.
tr_line_station_sum.to_hdf('feats/tr_line_station_sum7.hdf5', key='table')
te_line_station_sum.to_hdf('feats/te_line_station_sum7.hdf5', key='table')

In [22]:
# Final (rows, columns) of the combined training feature matrix.
tr_line_station_sum.shape

(1183747, 110)