## Imports

In [6]:
import numpy as np
import keras
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.mixture import BayesianGaussianMixture
from sklearn.cluster import MiniBatchKMeans
import collections
from functools import partial,reduce

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Imputer, StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.dummy import DummyRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA, TruncatedSVD

## Data

In [7]:
from sklearn.datasets import make_classification
def prepare_catergorical_data():
    X,y = make_classification(n_samples=100, n_features=10)
    X = np.array(list(map(lambda col:list(map(lambda x:int(10.0*np.abs(x)),col)),X)) )
    return X,y



## Encoding

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Activation,Flatten
from keras.layers.embeddings import Embedding


In [9]:
#n_words = np.unique(X)
#K = 10
#input_array = X
class my_model:
    def __call__(self,X_col,K):
        input_array = X_col
        n_words = np.unique(input_array)
        model = Sequential()
        model.add(Embedding(len(n_words), K, input_length=1))
        model.add(Flatten())
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
        print(model.summary())
        return model
# Train the model, iterating on the data in batches of 32 samples
#model = my_model()(X,3)
#model.fit(X, y, epochs=100, batch_size=32)


In [29]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]


class StringIndexer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.dictionaries = dict()
        self.columns = list()

    def fit(self, X, y=None):
        self.columns = X.columns.values
        for col in self.columns:
            categories = np.unique(X[col])
            self.dictionaries[col] = dict(zip(categories, range(len(categories))))
        return self

    def transform(self, X):
        column_array = []
        for col in self.columns:
            dictionary = self.dictionaries[col]
            na_value = len(dictionary) + 1
            transformed_column = X[col].apply(lambda x: dictionary.get(x, na_value))
            column_array.append(transformed_column.values.reshape(-1, 1))
        return np.hstack(column_array)
    
def unravel_tree(dict_,name_key):
    agg_dic =dict()
    entry_name = name_key
    def contains_dicts(dict_):
        return reduce(lambda _check,item:_check | isinstance(item[1],dict),dict_.items(),False)
    non_dictItems_dic = dict()
    for item in dict_.items():
        if not( isinstance(item[1],dict) ):
            non_dictItems_dic.update({item[0]:item[1]})
        else:
            agg_dic.update(unravel_tree(item[1],name_key+'_'+str(item[0])) )
    if non_dictItems_dic:
        agg_dic.update({name_key:non_dictItems_dic})
    return agg_dic
def set_output(x):
    if isinstance(x,dict):
        return pd.DataFrame(x,index=[0])
    return pd.DataFrame({'':x},index=[0])
class FuncEncoder(BaseEstimator, TransformerMixin):
    def __init__(self,columns = None,drop_old_columns = True,list_func=['mean','std','median']):
        self.dictionaries = dict()
        self.columns = list()
        self.list_func = list_func
        self.columns = columns
        self.drop_old_columns = drop_old_columns
    def fit(self, X, y=None):
        
        if self.columns is None:
            self.columns = X.columns
            
        print(self.columns)
        
        X['TARGET'] = y
        
        for col in self.columns:
            col_dict = dict()
            func_dict = {
                'mean':lambda x:x.mean(),
                'std':lambda x:x.std(),
                'median':lambda x:x.median()
            }
        
            for func in self.list_func:
                if isinstance(func,dict):
                    for fun_name in func.keys():
                        col_dict.update({fun_name:X[[col,'TARGET'] ]\
                                        .groupby([col])\
                                        .apply(lambda x:set_output(func[fun_name](x['TARGET'])) )\
                                        .reset_index(level=1,drop=True)
                                         .to_dict()
                                    })
                elif isinstance(func,str):
                    col_dict.update({func:X[[col,'TARGET']]\
                                        .groupby([col])\
                                        .apply(lambda x:set_output(func_dict[func](x['TARGET'])) )\
                                        .reset_index(level=1,drop=True)
                                        .to_dict()
                                    })
                else:
                    raise NameError('Invalid Format: Function input needs to be either a str or dict(fun_name:fun)')
            
            self.dictionaries[col] = col_dict
        # drop TARGET - column
        X = X.drop('TARGET',1)
        return self

    def transform(self, X):
        for col in self.columns:
            dictionary = unravel_tree(self.dictionaries[col],str(col))
            for func_key in dictionary.keys():
                X[str(func_key)] = X[col].apply(lambda x: dictionary[func_key].get(x,np.nan)).values
        if self.drop_old_columns:
            X = X.drop(self.columns, 1)
        if 'TARGET' in X.columns:
            X = X.drop('TARGET',1)
        return X

def _k_means(x):
    n_clust = 2
    list_func = ['std']
    x = x
    if len(x.values)> n_clust+1:
        bgm = MiniBatchKMeans(n_clusters=n_clust,random_state=0)
        temp_df = pd.DataFrame()
        temp_df['clust_label'] = bgm.fit_predict(X=x.values.reshape(-1, 1))
        temp_df['TARGET'] = x.values
        values_dict = temp_df.groupby(['clust_label']).agg(['mean']+list_func).values
        values_list = list(sorted(values_dict,key=lambda y:y[1]) ) 
        if len(values_list)< (len(list_func)*n_clust):
            values_list = np.array([np.nan]*((n_clust-1)*(len(list_func)+1))+ [np.mean(x.values)] + [np.nan]*len(list_func) )
            return values_list
        return reduce(lambda x,y:x+list(y),values_list,[])
    #else
    values_list = np.array([np.nan]*((n_clust-1)*(len(list_func)+1))+ [x.values[0]] + [np.nan]*len(list_func) )
    return  values_list
#_k_means(data_final['delta_minutes'])


In [30]:
X,y = prepare_catergorical_data()

df = pd.DataFrame(data=X.copy())
encoder= FuncEncoder(list_func=['mean','std','median',{'ben':lambda x:x.count() }])

X_new = encoder.fit_transform(df,y)
for col in X_new.columns:
    print(col)

RangeIndex(start=0, stop=10, step=1)
0_median_
0_mean_
0_std_
0_ben_
1_ben_
1_median_
1_std_
1_mean_
2_ben_
2_median_
2_std_
2_mean_
3_median_
3_mean_
3_std_
3_ben_
4_std_
4_ben_
4_mean_
4_median_
5_median_
5_std_
5_mean_
5_ben_
6_ben_
6_mean_
6_std_
6_median_
7_mean_
7_median_
7_std_
7_ben_
8_mean_
8_std_
8_median_
8_ben_
9_std_
9_mean_
9_ben_
9_median_


In [52]:

df[[0,1]].groupby([0]).apply(lambda x:pd.DataFrame({'b':np.sum(x.values),'c':np.mean(x.values)},index={'0':1}) ).reset_index(level=1,drop=True).to_dict()

{'b': {0: 83,
  1: 62,
  2: 31,
  3: 68,
  4: 116,
  5: 120,
  6: 146,
  7: 99,
  8: 46,
  9: 168,
  10: 106,
  11: 152,
  12: 108,
  13: 81,
  14: 21,
  15: 35,
  17: 132,
  18: 69,
  19: 23,
  20: 32,
  21: 23,
  22: 36,
  34: 39},
 'c': {0: 5.1875,
  1: 6.2000000000000002,
  2: 3.1000000000000001,
  3: 5.666666666666667,
  4: 7.25,
  5: 6.0,
  6: 9.125,
  7: 9.9000000000000004,
  8: 11.5,
  9: 10.5,
  10: 10.6,
  11: 10.857142857142858,
  12: 10.800000000000001,
  13: 10.125,
  14: 10.5,
  15: 17.5,
  17: 13.199999999999999,
  18: 17.25,
  19: 11.5,
  20: 16.0,
  21: 11.5,
  22: 18.0,
  34: 19.5}}

In [50]:
df[[0,1]].groupby([0]).apply(lambda x:pd.DataFrame({np.mean(x.values)},index=0) )

PandasError: DataFrame constructor not properly called!