In [1]:
import json
import csv

In [23]:
def extract_parentesis(s):
    res = s.split()[1]
    return res[1:-1]

def extract_both(s):
    split = s.split()
    return split[0], split[1][1:-1]

def moment_features(pref, in_dict, out_dict={}):
    ellipse_major, ellipse_minor = in_dict['Ellipse Semi-Major/Minor axis'].split(',')

    out_dict['moments_' + pref + '_ellipse_major'] = float(ellipse_major)
    out_dict['moments_' + pref + '_ellipse_minor'] = float(ellipse_minor)
    out_dict['moments_' + pref + '_ellipse_eccentricity'] = float(in_dict['Ellipse eccentricity'])
    out_dict['moments_' + pref + '_ellipse_intensity'] = float(extract_parentesis(in_dict['Ellipse intensity']))
    out_dict['moments_' + pref + '_ellipse_angle'] = float(in_dict[u'Ellipse angle'])

    centroid_x, centroid_y = in_dict[u'Centroid'].split(',')
    out_dict['moments_' + pref + '_ellipse_centroid_x'] = float(centroid_x)
    out_dict['moments_' + pref + '_ellipse_centroid_y'] = float(centroid_y)

    for i in range(1, 9):
        #v1, v2 = extract_both(in_dict['I%d' % i])
        #out_dict['moments_' + pref + '_I%d_1' % i] = float(v1)
        #out_dict['moments_' + pref + '_I%d_2' % i] = float(v2)
        v2 = extract_parentesis(in_dict['I%d' % i])
        out_dict['moments_' + pref + '_I%d' % i] = float(v2)

def phash_features(in_dict, out_dict={}):
    for k, phdict in in_dict.items():
        k1, k2 = k.lower().split(', ')
        for phk, phv in phdict.items():
            phv1, phv2 = phv.split(', ')
            out_dict['phash_%s_%s' % (k1, phk.lower())] = float(phv1)
            out_dict['phash_%s_%s' % (k2, phk.lower())] = float(phv2)
        
def image_stats_features(pref, in_dict, out_dict={}):
    out_dict['imstat_' + pref + '_skewness'] = float(in_dict['skewness'])
    out_dict['imstat_' + pref + '_min'] = float(extract_parentesis(in_dict['min']))
    out_dict['imstat_' + pref + '_max'] = float(extract_parentesis(in_dict['max']))
    out_dict['imstat_' + pref + '_mean'] = float(extract_parentesis(in_dict['mean']))
    out_dict['imstat_' + pref + '_std'] = float(extract_parentesis(in_dict['standard deviation']))
    out_dict['imstat_' + pref + '_entropy'] = float(in_dict['entropy'])
    out_dict['imstat_' + pref + '_kurtosis'] = float(in_dict['kurtosis'])

def chromaticity_features(in_dict, out_dict={}):
    for k, v in in_dict.items():
        k = k.split()[0]
        v1, v2 = eval(v)
        out_dict['croma_%s_1' % k] = v1
        out_dict['croma_%s_2' % k] = v2    

def number_pixels(s):
    if type(s) in [int, float]:
        return s
    if s.endswith('K'):
        return 1000 * float(s[:-1])

    raise Exception('number_pixels: unknown format' + s)

def file_size(s):
    if s.endswith('KB'):
        return 1024 * float(s[:-2])
    elif s.endswith('B'):
        return float(s[:-1])
    elif s.endswith('MB'):
        return 1024 * 1024 * float(s[:-2])
    raise Exception('file_size: unknown format ' + s)

def process_line(line):
    image_id, features = line.split('\t', maxsplit=1)
    features = json.loads(features)

    res_dict = {'_id': image_id}
    res_dict['alpha_color'] = features['Alpha color']
    res_dict['bg_color'] =  features['Background color']
    
    is_gray = features[u'Type'] in [u'Grayscale', u'Bilevel']
    if is_gray:
        res_dict['color'] = False
        moment_features('overall', features[u'Channel moments']['Gray'], res_dict)
        moment_features('red', features[u'Channel moments']['Gray'], res_dict)
        moment_features('green', features[u'Channel moments']['Gray'], res_dict)
        moment_features('blue', features[u'Channel moments']['Gray'], res_dict)

        image_stats_features('overall', features[u'Channel statistics']['Gray'], res_dict)
        image_stats_features('red', features[u'Channel statistics']['Gray'], res_dict)
        image_stats_features('green', features[u'Channel statistics']['Gray'], res_dict)
        image_stats_features('blue', features[u'Channel statistics']['Gray'], res_dict)
    else:
        res_dict['color'] = True
        moment_features('overall', features[u'Image moments']['Overall'], res_dict)
        moment_features('red', features[u'Channel moments']['Red'], res_dict)
        moment_features('green', features[u'Channel moments']['Green'], res_dict)
        moment_features('blue', features[u'Channel moments']['Blue'], res_dict)

        image_stats_features('overall', features[u'Image statistics']['Overall'], res_dict)
        image_stats_features('red', features[u'Channel statistics']['Red'], res_dict)
        image_stats_features('green', features[u'Channel statistics']['Green'], res_dict)
        image_stats_features('blue', features[u'Channel statistics']['Blue'], res_dict)

    phash_features(features[u'Channel perceptual hash'], res_dict)
    res_dict['no_pixels'] = number_pixels(features[u'Number pixels'])
    res_dict['filesize'] = file_size(features[u'Filesize'])

    res_dict['quality'] = features[u'Quality']
    res_dict['geometry'] = features[u'Geometry'][:-4]
    res_dict['type'] = features[u'Type']
    res_dict['colorspace'] = features[u'Colorspace']

    props = features[u'Properties']
    res_dict['date'] = props[u'date:modify']
    #res_dict['signature'] = props[u'signature']

    return res_dict

In [14]:
header = ['_id', 'alpha_color', 'bg_color', 'color', 'colorspace', 'date', 'filesize', 'geometry', 
          'imstat_blue_entropy', 'imstat_blue_kurtosis', 'imstat_blue_max', 'imstat_blue_mean', 
          'imstat_blue_min', 'imstat_blue_skewness', 'imstat_blue_std', 'imstat_green_entropy',
          'imstat_green_kurtosis', 'imstat_green_max', 'imstat_green_mean', 'imstat_green_min',
          'imstat_green_skewness', 'imstat_green_std', 'imstat_overall_entropy', 'imstat_overall_kurtosis',
          'imstat_overall_max', 'imstat_overall_mean', 'imstat_overall_min', 'imstat_overall_skewness',
          'imstat_overall_std', 'imstat_red_entropy', 'imstat_red_kurtosis', 'imstat_red_max',
          'imstat_red_mean', 'imstat_red_min', 'imstat_red_skewness', 'imstat_red_std', 'moments_blue_I1',
          'moments_blue_I2', 'moments_blue_I3', 'moments_blue_I4', 'moments_blue_I5', 'moments_blue_I6',
          'moments_blue_I7', 'moments_blue_I8', 'moments_blue_ellipse_angle', 'moments_blue_ellipse_centroid_x',
          'moments_blue_ellipse_centroid_y', 'moments_blue_ellipse_eccentricity', 'moments_blue_ellipse_intensity',
          'moments_blue_ellipse_major', 'moments_blue_ellipse_minor', 'moments_green_I1', 'moments_green_I2',
          'moments_green_I3', 'moments_green_I4', 'moments_green_I5', 'moments_green_I6', 'moments_green_I7',
          'moments_green_I8', 'moments_green_ellipse_angle', 'moments_green_ellipse_centroid_x', 
          'moments_green_ellipse_centroid_y', 'moments_green_ellipse_eccentricity', 
          'moments_green_ellipse_intensity', 'moments_green_ellipse_major', 'moments_green_ellipse_minor',
          'moments_overall_I1', 'moments_overall_I2', 'moments_overall_I3', 'moments_overall_I4', 
          'moments_overall_I5', 'moments_overall_I6', 'moments_overall_I7', 'moments_overall_I8', 
          'moments_overall_ellipse_angle', 'moments_overall_ellipse_centroid_x', 
          'moments_overall_ellipse_centroid_y', 'moments_overall_ellipse_eccentricity', 
          'moments_overall_ellipse_intensity', 'moments_overall_ellipse_major', 'moments_overall_ellipse_minor',
          'moments_red_I1', 'moments_red_I2', 'moments_red_I3', 'moments_red_I4', 'moments_red_I5', 
          'moments_red_I6', 'moments_red_I7', 'moments_red_I8', 'moments_red_ellipse_angle', 
          'moments_red_ellipse_centroid_x', 'moments_red_ellipse_centroid_y', 'moments_red_ellipse_eccentricity', 
          'moments_red_ellipse_intensity', 'moments_red_ellipse_major', 'moments_red_ellipse_minor', 
          'no_pixels', 'phash_blue_ph1', 'phash_blue_ph2', 'phash_blue_ph3', 'phash_blue_ph4', 'phash_blue_ph5', 
          'phash_blue_ph6', 'phash_blue_ph7', 'phash_chroma_ph1', 'phash_chroma_ph2', 'phash_chroma_ph3', 
          'phash_chroma_ph4', 'phash_chroma_ph5', 'phash_chroma_ph6', 'phash_chroma_ph7', 'phash_green_ph1',
          'phash_green_ph2', 'phash_green_ph3', 'phash_green_ph4', 'phash_green_ph5', 'phash_green_ph6',
          'phash_green_ph7', 'phash_hue_ph1', 'phash_hue_ph2', 'phash_hue_ph3', 'phash_hue_ph4', 'phash_hue_ph5',
          'phash_hue_ph6', 'phash_hue_ph7', 'phash_luma_ph1', 'phash_luma_ph2', 'phash_luma_ph3', 'phash_luma_ph4',
          'phash_luma_ph5', 'phash_luma_ph6', 'phash_luma_ph7', 'phash_red_ph1', 'phash_red_ph2', 'phash_red_ph3',
          'phash_red_ph4', 'phash_red_ph5', 'phash_red_ph6', 'phash_red_ph7', 'quality', 'type']

In [30]:
csv_res_file = open('imagemagick_features_res.csv', 'w')

writer = csv.DictWriter(csv_res_file, fieldnames=header)

In [31]:
writer.writeheader()

In [32]:
from tqdm import tqdm

In [33]:
with open('imagemagick_features.txt') as f:
    for line in tqdm(f):
        d = process_line(line.strip())
        writer.writerow(d)

692156it [07:34, 1522.07it/s]


In [34]:
csv_res_file.flush()
csv_res_file.close()

In [2]:
import re

import numpy as np
import pandas as pd

import os

from tqdm import tqdm
import gc

from sklearn.utils import shuffle
from sklearn.metrics import log_loss

import feather

In [3]:
df_train = pd.read_json('data/train.json')
df_test = pd.read_json('data/test.json')

In [4]:
def extract_file_name(url):
    return url[url.rfind('/')+1:]

In [5]:
np.random.seed(1)
folds = np.random.choice([0, 1, 2, 3, 4, 5], size=len(df_train))
df_train['fold'] = folds.astype('uint8')

In [6]:
interest_level_map = {'low': 0, 'medium': 1, 'high': 2}
df_train.interest_level = df_train.interest_level.apply(lambda x: interest_level_map.get(x, -1))

In [7]:
images_train = []

for row in df_train.itertuples():
    id = row.listing_id
    fold = row.fold
    lev = row.interest_level
    for p in row.photos:
        path = extract_file_name(p)
        images_train.append((id, path, lev, fold))

In [8]:
df_images_train = pd.DataFrame(images_train, columns=['listing_id', 'image', 'interest_level', 'fold'])
df_images_train.sample(n=5)

Unnamed: 0,listing_id,image,interest_level,fold
136798,7234004,7234004_471899ecee3126b170f68b4555e92f45.jpg,1,0
247061,6947851,6947851_1b323983edc41e22c2d0ab6cc5fcfb77.jpg,1,3
181087,7064283,7064283_a462dcfdb5929c8a00af16eccf9c8aa3.jpg,0,1
7257,6813365,6813365_4ad9176e9ee78ef050bf550f845dcd74.jpg,1,3
33607,6875160,6875160_cf121fe5a5aa0e8adcd4de427b9b2494.jpg,0,5


In [9]:
images_test = []

for row in df_test.itertuples():
    id = row.listing_id
    for p in row.photos:
        path = extract_file_name(p)
        images_test.append((id, path))

In [10]:
df_images_test = pd.DataFrame(images_test, columns=['listing_id', 'image'])
df_images_test.sample(n=5)

Unnamed: 0,listing_id,image
334950,6955855,6955855_df81a971dbecd0afc4bb5e73a2874ba7.jpg
237453,6994781,6994781_ea45afc5fee56ea1af5d8feb30464928.jpg
323950,6981834,6981834_c8d2c7bafa6f3f3ef09cc7074d52710d.jpg
138090,7091800,7091800_c514528233417240c0a93622bf403b6a.jpg
76765,7099109,7099109_9d48aed035b754796cac3e55aa8246a7.jpg


In [11]:
for i in [0, 1, 2]:
    df_images_train['interest_%s' % i] = (df_images_train.interest_level == i).astype('uint8')

In [15]:
dtypes = {'_id': 'O',
     'alpha_color': 'O',
     'bg_color': 'O',
     'color': 'bool',
     'colorspace': 'O',
     'date': 'O',
     'geometry': 'O',
     'type': 'O'}

for t in header:
    if t not in dtypes:
        dtypes[t] = 'float32'

In [16]:
df_im = pd.read_csv('imagemagick_features_res.csv', dtype=dtypes)
del df_im['alpha_color']
del df_im['bg_color']
del df_im['color']
del df_im['colorspace']
del df_im['type']
del df_im['quality']

In [17]:
df_im.date = pd.to_datetime(df_im.date)
from_start = (df_im.date - df_im.date.min())
sec_from_start = from_start / np.timedelta64(1, 's')
df_im['seconds_from_min'] = sec_from_start.astype('float32')
del df_im['date']

In [20]:
df_train = df_images_train.merge(df_im, left_on='image', right_on='_id')
del df_train['_id']
df_test = df_images_test.merge(df_im, left_on='image', right_on='_id')
del df_test['_id']

In [26]:
def fit_mtv(df, target_col, cat_col, C):
    m0 = (df[target_col] == 1).mean()

    cnt = df[df[target_col] == 1][cat_col].value_counts()
    cnt_all = df[cat_col].value_counts()

    probs = (cnt + C * m0) / (cnt_all + C)
    return probs, m0

def transform_mtv(df_target, probs, m0):
    probs_targ = probs[df_target[cat_col]].reset_index(drop=1)
    probs_targ.fillna(m0, inplace=1)
    return probs_targ.values

In [27]:
C = 12
cat_col = 'geometry'

for i in tqdm([0, 1, 2, 3, 4, 5]):
    df_train_fold = df_train[df_train.fold != i].reset_index(drop=1)
    df_test_fold = df_train[df_train.fold == i].reset_index(drop=1)

    for target_col in ['interest_0', 'interest_1', 'interest_2']:
        res_name = '%s_%s_mtv' % (cat_col, target_col)

        probs, m0 = fit_mtv(df_train_fold, target_col, cat_col, C=C)
        df_train.loc[df_train.fold == i, res_name] = transform_mtv(df_test_fold, probs, m0)

100%|██████████| 6/6 [00:02<00:00,  2.90it/s]


In [29]:
C = 12
cat_col = 'geometry'

for target_col in ['interest_0', 'interest_1', 'interest_2']:
    res_name = '%s_%s_mtv' % (cat_col, target_col)

    probs, m0 = fit_mtv(df_train, target_col, cat_col, C=C)
    df_test[res_name] = transform_mtv(df_test, probs, m0)

In [31]:
exclude = {'listing_id', 'image', 'interest_level', 'fold',
           'interest_0', 'interest_1', 'interest_2', 'geometry'}

features = sorted(set(df_train.columns) - exclude)

In [32]:
import xgboost as xgb

In [34]:
xgb_pars = {
    'eta': 0.15,
    'gamma': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'colsample_bylevel': 1,
    'lambda': 1,
    'alpha': 0,
    'tree_method': 'approx',
# not deafauts
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': 3,
    'nthread': 8,
    'seed': 42,
    'silent': 1
}

n_estimators = 100


In [37]:
for i in tqdm([0, 1, 2, 3, 4, 5]):
    train_idx = df_train.fold != i
    val_idx = df_train.fold == i

    X_train = df_train[train_idx][features].values
    y_train = df_train[train_idx].interest_level.values
    X_val = df_train[val_idx][features].values
    y_val = df_train[val_idx].interest_level.values

    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
    dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

    watchlist = [(dtrain, 'train'), (dval, 'val')]

    model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, verbose_eval=10,
                 evals=watchlist)

    ypred = model.predict(dval)

    for i in [0, 1, 2]:
        df_train.loc[val_idx, 'interest_level_xgb_%d' % i] = ypred[:, i]


  0%|          | 0/6 [00:00<?, ?it/s][A

[0]	train-mlogloss:1.02717	val-mlogloss:1.02763





[10]	train-mlogloss:0.786614	val-mlogloss:0.790121
[20]	train-mlogloss:0.748396	val-mlogloss:0.755955
[30]	train-mlogloss:0.736463	val-mlogloss:0.748617
[40]	train-mlogloss:0.728633	val-mlogloss:0.745687
[50]	train-mlogloss:0.722116	val-mlogloss:0.744162
[60]	train-mlogloss:0.716322	val-mlogloss:0.742909
[70]	train-mlogloss:0.710849	val-mlogloss:0.742227
[80]	train-mlogloss:0.70533	val-mlogloss:0.741519
[90]	train-mlogloss:0.700274	val-mlogloss:0.741052


 17%|█▋        | 1/6 [01:43<08:35, 103.08s/it]

[0]	train-mlogloss:1.02683	val-mlogloss:1.02841
[10]	train-mlogloss:0.781886	val-mlogloss:0.793629
[20]	train-mlogloss:0.745401	val-mlogloss:0.765076
[30]	train-mlogloss:0.733425	val-mlogloss:0.760251
[40]	train-mlogloss:0.725684	val-mlogloss:0.758819
[50]	train-mlogloss:0.719554	val-mlogloss:0.757887
[60]	train-mlogloss:0.713504	val-mlogloss:0.757131
[70]	train-mlogloss:0.707591	val-mlogloss:0.756563
[80]	train-mlogloss:0.701708	val-mlogloss:0.756006
[90]	train-mlogloss:0.696565	val-mlogloss:0.755444


 33%|███▎      | 2/6 [03:24<06:50, 102.51s/it]

[0]	train-mlogloss:1.03079	val-mlogloss:1.0336
[10]	train-mlogloss:0.781941	val-mlogloss:0.804328
[20]	train-mlogloss:0.743729	val-mlogloss:0.775759
[30]	train-mlogloss:0.7319	val-mlogloss:0.770517
[40]	train-mlogloss:0.724122	val-mlogloss:0.768882
[50]	train-mlogloss:0.717573	val-mlogloss:0.767932
[60]	train-mlogloss:0.711591	val-mlogloss:0.7666
[70]	train-mlogloss:0.706278	val-mlogloss:0.765927
[80]	train-mlogloss:0.7004	val-mlogloss:0.765255
[90]	train-mlogloss:0.695403	val-mlogloss:0.764486


 50%|█████     | 3/6 [05:04<05:05, 101.86s/it]

[0]	train-mlogloss:1.02696	val-mlogloss:1.02776
[10]	train-mlogloss:0.782617	val-mlogloss:0.792858
[20]	train-mlogloss:0.745972	val-mlogloss:0.764048
[30]	train-mlogloss:0.734131	val-mlogloss:0.759029
[40]	train-mlogloss:0.726668	val-mlogloss:0.757444
[50]	train-mlogloss:0.720056	val-mlogloss:0.756026
[60]	train-mlogloss:0.713802	val-mlogloss:0.755131
[70]	train-mlogloss:0.708069	val-mlogloss:0.754438
[80]	train-mlogloss:0.702753	val-mlogloss:0.753766
[90]	train-mlogloss:0.697458	val-mlogloss:0.753543


 67%|██████▋   | 4/6 [06:44<03:22, 101.32s/it]

[0]	train-mlogloss:1.03161	val-mlogloss:1.03018
[10]	train-mlogloss:0.785855	val-mlogloss:0.78094
[20]	train-mlogloss:0.749731	val-mlogloss:0.747699
[30]	train-mlogloss:0.737946	val-mlogloss:0.741135
[40]	train-mlogloss:0.730266	val-mlogloss:0.73921
[50]	train-mlogloss:0.723633	val-mlogloss:0.738027
[60]	train-mlogloss:0.717936	val-mlogloss:0.737157
[70]	train-mlogloss:0.712553	val-mlogloss:0.736248
[80]	train-mlogloss:0.707236	val-mlogloss:0.7355
[90]	train-mlogloss:0.702012	val-mlogloss:0.73498


 83%|████████▎ | 5/6 [08:39<01:45, 105.51s/it]

[0]	train-mlogloss:1.02821	val-mlogloss:1.02833
[10]	train-mlogloss:0.78647	val-mlogloss:0.786125
[20]	train-mlogloss:0.749281	val-mlogloss:0.751623
[30]	train-mlogloss:0.737446	val-mlogloss:0.744114
[40]	train-mlogloss:0.729626	val-mlogloss:0.741027
[50]	train-mlogloss:0.723566	val-mlogloss:0.73963
[60]	train-mlogloss:0.71745	val-mlogloss:0.738459
[70]	train-mlogloss:0.711973	val-mlogloss:0.737483
[80]	train-mlogloss:0.706601	val-mlogloss:0.736615
[90]	train-mlogloss:0.701257	val-mlogloss:0.735751


100%|██████████| 6/6 [10:21<00:00, 104.33s/it]


In [38]:
print('feature importance:')

scores = model.get_score(importance_type='gain')

for n, s in sorted(scores.items(), key=lambda x: -x[1]):
    print(' - %s: %.4f' % (n, s))

not_used = set(features) - scores.keys()
print('not used features: %s' % not_used)

feature importance:
 - seconds_from_min: 39.7189
 - geometry_interest_0_mtv: 22.8024
 - geometry_interest_2_mtv: 10.9328
 - geometry_interest_1_mtv: 9.1919
 - filesize: 8.7095
 - no_pixels: 7.5591
 - phash_blue_ph1: 6.4866
 - phash_red_ph1: 6.1640
 - moments_overall_ellipse_eccentricity: 6.1422
 - moments_red_I2: 5.9906
 - moments_overall_ellipse_centroid_y: 5.9529
 - moments_red_ellipse_intensity: 5.9463
 - phash_red_ph2: 5.8751
 - moments_green_ellipse_minor: 5.8070
 - imstat_overall_min: 5.8028
 - imstat_blue_mean: 5.7422
 - moments_green_ellipse_eccentricity: 5.7343
 - moments_red_ellipse_major: 5.7179
 - phash_luma_ph2: 5.7117
 - moments_red_ellipse_minor: 5.7099
 - moments_blue_ellipse_intensity: 5.6997
 - moments_overall_ellipse_intensity: 5.6882
 - phash_hue_ph1: 5.5987
 - imstat_red_std: 5.5978
 - imstat_overall_mean: 5.5953
 - phash_luma_ph1: 5.5705
 - imstat_red_skewness: 5.5567
 - moments_overall_I2: 5.5152
 - moments_overall_ellipse_major: 5.5131
 - phash_blue_ph3: 5.5017


In [39]:
X_train = df_train[features].values
y_train = df_train.interest_level.values

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
watchlist = [(dtrain, 'train')]

model = xgb.train(xgb_pars, dtrain, num_boost_round=n_estimators, verbose_eval=10,
                  evals=watchlist)

[0]	train-mlogloss:1.02994
[10]	train-mlogloss:0.785228
[20]	train-mlogloss:0.749266
[30]	train-mlogloss:0.737736
[40]	train-mlogloss:0.730357
[50]	train-mlogloss:0.724692
[60]	train-mlogloss:0.719521
[70]	train-mlogloss:0.714565
[80]	train-mlogloss:0.709857
[90]	train-mlogloss:0.705578


In [40]:
dtest = xgb.DMatrix(df_test[features], feature_names=features)

In [41]:
ytest = model.predict(dtest)

In [55]:
for i in [0, 1, 2]:
    df_test['interest_level_xgb_%d' % i] = ytest[:, i]

In [52]:
export_cols = [
        'listing_id', 'image', 
        'seconds_from_min',
        'filesize',
        'no_pixels',
        'geometry',
        'geometry_interest_0_mtv',
        'geometry_interest_1_mtv',
        'geometry_interest_2_mtv',
        'interest_level_xgb_0',
        'interest_level_xgb_1',
        'interest_level_xgb_2'
    ]

In [53]:
df_train_export = df_train[export_cols + ['interest_level']]
df_test_export = df_test[export_cols]

In [66]:
def stats(vals, prefix, res=None):
    if res is None:
        res = {}
    res[prefix + '_mean'] = np.mean(vals)
    res[prefix + '_min'] = np.min(vals)
    res[prefix + '_max'] = np.max(vals)
    res[prefix + '_std'] = np.std(vals)
    return res

In [67]:
def calculate_group_features(group):
    res = {}
    stats(group['seconds_from_min'], 'seconds_from_min', res)
    stats(group['filesize'], 'filesize', res)
    stats(group['no_pixels'], 'no_pixels', res)
    for i in [0, 1, 2]:
        stats(group['geometry_interest_%d_mtv' % i], 'geometry_interest_%d_mtv' % i, res)
        stats(group['interest_level_xgb_%d' % i], 'interest_level_xgb_%d' % i, res)
    res['geometries'] = ' '.join(group['geometry'])
    return pd.Series(res)

In [69]:
df_train_group = df_train_export.groupby('listing_id').apply(calculate_group_features)
df_train_group.reset_index(inplace=1)

In [70]:
df_test_group = df_test_export.groupby('listing_id').apply(calculate_group_features)
df_test_group.reset_index(inplace=1)

In [82]:
feather.write_dataframe(df_train_group, 'dfs/df_train_group.feather')
feather.write_dataframe(df_test_group, 'dfs/df_test_group.feather')