In [1]:
import xlearn as xl
import pandas as pd
import numpy as np

from models.base import create_user, site_app_split

In [2]:
def make_field_dict(df, fields):
    """
    Build the field dictionary: an inverted index over the column names.

    df: not used by this function.
    fields: Array[String] - a list of column names
    Returns a dict mapping each name in `fields` to its position in `fields`.
    """
    positions = {}
    for position, column in enumerate(fields):
        positions[column] = position
    return positions

def make_feature_dict(df, fields):
    """
    Index every distinct feature value found in the `fields` columns of `df`.

    Side effect: each column named in `fields` is overwritten in place with
    the string '<field>_<value>', so that an identical raw value appearing in
    two or more fields yields distinct features.

    Returns a pd.Series whose index holds the unique prefixed features and
    whose values are consecutive integer ids starting at 0.
    """
    for field in fields:
        prefix = f'{field}_'
        df[field] = prefix + df[field].astype('str')
    # TODO: we could hash all features at this stage.
    # TODO: hash into a smaller space to make the dict smaller
    #df[fields] = df[fields].applymap(hash)
    # Stack the prefixed columns end to end and number the distinct values.
    prefixed_columns = [df[field] for field in fields]
    all_features = pd.concat(prefixed_columns, ignore_index=True)
    distinct = all_features.unique()
    ids = np.arange(len(distinct))
    return pd.Series(ids, index=distinct)

def encode_features(df, feature_dict, fields):
    """
    Replace the prefixed feature strings in `df` with their integer ids.

    df: DataFrame whose `fields` columns hold '<field>_<value>' strings
        (the form make_feature_dict writes back into the frame).
    feature_dict: mapping (dict or pd.Series) from prefixed feature to id.
    fields: Array[String] - the column names to encode.

    Returns the result of df.replace called with one mapping per column.
    """
    # df.replace(feature_dict) doesn't fit in memory.
    # optimize by splitting the feature_dict into dictionaries corresponding to fields.
    replace_dict = dict()
    for c in fields:
        # Match on '<field>_' rather than the bare field name: otherwise a
        # field such as 'C1' would also collect every 'C14_...' key, and the
        # per-field dictionaries would be larger than needed.
        prefix = f'{c}_'
        replace_dict[c] = {k: v for k, v in feature_dict.items()
                           if k.startswith(prefix)}

    return df.replace(replace_dict)

In [3]:
# Columns used as categorical fields when building the feature dictionaries.
categorical_features = [
    'banner_pos', 'platform_id', 'platform_domain', 'platform_category',
    'user', 'device_conn_type',
    'C14', 'C17', 'C20', 'C21',
]

## Develop splits for cross-validation
We need a cross-validation suite that works with the libffm format.
Start with a simple split: the last day vs. the rest.

## Encode while writing to file
Writing takes 4m30s for site-small and 2m30s for app-small.

In [4]:
# Load the tiny training sample and parse `hour` (format YYMMDDHH) into datetimes.
df = pd.read_csv('./data/train_tiny.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")
# create_user / site_app_split come from models.base. Presumably they add the
# `user` column and split the rows into site and app traffic — confirm there.
df = create_user(df)
df_site, df_app = site_app_split(df)
# Keep only the categorical fields plus the label. 'click' must stay last
# because ffm_row_generator asserts that it is the final column.
df_site = df_site[categorical_features + ['click']]
df_app = df_app[categorical_features + ['click']]
df_site.head()

Unnamed: 0,banner_pos,platform_id,platform_domain,platform_category,user,device_conn_type,C14,C17,C20,C21,click
0,0,5b08c53b,7687a86e,3e814130,4665f0a5_36d749e5,0,21834,2523,-1,221,0
1,0,5b08c53b,7687a86e,3e814130,80df79b5_76dc4769,0,20093,2295,100077,23,1
2,0,1fbe01fe,f3845767,28905ebd,693bff3e_4ea23a13,0,21761,2502,-1,221,0
3,0,1a5c1d83,5dddf09e,f028772b,2773b1c8_d787e91b,0,17239,1973,-1,23,0
4,1,b7e9786d,b12b9f85,f028772b,b200f41e_8a4875bd,0,21875,2526,100079,221,0


In [5]:
# NOTE: make_feature_dict overwrites the categorical columns of df_site in
# place with '<field>_<value>' strings, so re-running this cell prefixes them
# a second time.
site_field_dict = make_field_dict(df_site, categorical_features)
site_feature_dict = make_feature_dict(df_site, categorical_features)
#df_site_encoded = encode_features(df_site, site_feature_dict, categorical_features)
#df_site_encoded.head()

In [6]:
# NOTE: make_feature_dict overwrites the categorical columns of df_app in
# place with '<field>_<value>' strings, so re-running this cell prefixes them
# a second time.
app_field_dict = make_field_dict(df_app, categorical_features)
app_feature_dict = make_feature_dict(df_app, categorical_features)
#df_app_encoded = encode_features(df_app, app_feature_dict, categorical_features)

In [7]:
#df_site_encoded.columns[:-1]

In [8]:
def ffm_row_generator(df, feature_dict):
    """
    Yield one libffm-formatted line per row of df, according to feature_dict.

    Each line has the form '<label> <field>:<feature>:1 ...'.

    df: DataFrame whose last column is 'click' (the label); every preceding
        column is a field, numbered by its position.
    feature_dict: mapping (dict or pd.Series) from each cell value to its
        feature id. A value missing from it raises KeyError.

    Raises AssertionError on first iteration if the last column is not 'click'.
    """
    assert df.columns[-1] == 'click'
    # itertuples keeps each column's own dtype. iterrows builds a Series per
    # row, which upcasts mixed numeric rows (an int label would be written as
    # '1.0' and float-cast values would be used as lookup keys).
    for *features, label in df.itertuples(index=False, name=None):
        ffm_row = [str(label)]
        ffm_row.extend(f'{field}:{feature_dict[value]}:1'
                       for field, value in enumerate(features))
        yield ' '.join(ffm_row)

In [9]:
%%time
with open('./data/train_site_tiny.ffm', 'w') as ffm_file:
    # One libffm line per row of df_site, each terminated by a newline.
    ffm_file.writelines(
        f'{line}\n' for line in ffm_row_generator(df_site, site_feature_dict)
    )

CPU times: user 277 ms, sys: 0 ns, total: 277 ms
Wall time: 277 ms


In [10]:
%%time
with open('./data/train_app_tiny.ffm', 'w') as ffm_file:
    # One libffm line per row of df_app, each terminated by a newline.
    ffm_file.writelines(
        f'{line}\n' for line in ffm_row_generator(df_app, app_feature_dict)
    )

CPU times: user 165 ms, sys: 36 µs, total: 165 ms
Wall time: 163 ms


In [11]:
# A bare `raise` with no active exception raises RuntimeError, which stops a
# "Run All" at this cell. NOTE(review): presumably intentional, to keep the
# cells below from running — confirm.
raise

RuntimeError: No active exception to reraise

## `totoruo/FfmEncoder`
Gets stuck at 'converting data...' at line 353.
Try `Bobe24/Dataframe2libffm`.

In [None]:
# Reload the tiny training sample from scratch and parse `hour` (format YYMMDDHH).
df = pd.read_csv('./data/train_tiny.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")
# create_user / site_app_split come from models.base. Presumably they add the
# `user` column and split the rows into site and app traffic — confirm there.
df = create_user(df)
df_site, df_app = site_app_split(df)

In [None]:
# Fields handed to FfmEncoder; this list has no 'platform_domain'.
categorical_features = [
    'banner_pos', 'platform_id', 'platform_category',
    'user', 'device_conn_type',
    'C14', 'C17', 'C20', 'C21',
]

In [None]:
from tools.FfmEncoder import FfmEncoder

In [None]:
encoder = FfmEncoder(categorical_features, label_name='click', nthread=2)

In [None]:
encoder.transform(df_app, 'train_app.ffm')

In [None]:
# A bare `raise` with no active exception raises RuntimeError, which stops a
# "Run All" at this cell. NOTE(review): presumably intentional, to keep the
# cells below from running — confirm.
raise

## LibFFM conversion stats: memory issue
With categorical_features = ['banner_pos', 'platform_id', 'platform_domain', 'platform_category',
                        'user', 'device_conn_type', 'C14','C17','C20','C21']
- just a few seconds to build feature_dict for app-small.
- 12s for app-mid. len = 0.8M
- 21s for site-mid. len = 1.6M
- encode_features exhausts memory...

In [None]:
def make_field_dict(df, fields):
    """
    Return the inverted column index for `fields`.

    df: not used by this function.
    fields: Array[String] - a list of column names
    The result maps every column name to its position within `fields`.
    """
    return dict((column, position) for position, column in enumerate(fields))

def make_feature_dict(df, fields):
    """
    Number the distinct features of the `fields` columns of `df`.

    Side effect: every column in `fields` is rewritten in place as
    '<field>_<value>' strings, which keeps a value shared by two or more
    fields from collapsing into a single feature.

    Returns a pd.Series indexed by the unique prefixed features, with
    consecutive integer ids (starting at 0) as values.
    """
    for name in fields:
        as_text = df[name].astype('str')
        df[name] = f'{name}_' + as_text
    # TODO: decide whether to hash all features at this stage.
    # TODO: hash into a smaller space to make the dict smaller
    #df[fields] = df[fields].applymap(hash)
    # Concatenate all prefixed columns, then enumerate the distinct values.
    stacked = pd.concat([df[name] for name in fields], ignore_index=True)
    vocabulary = stacked.unique()
    return pd.Series(np.arange(len(vocabulary)), index=vocabulary)

def encode_features(df, feature_dict):
    """
    Encode `df` by passing `feature_dict` to DataFrame.replace.

    The mapping is applied to the whole frame in a single call, with no
    per-field split.
    """
    encoded = df.replace(to_replace=feature_dict)
    return encoded

def encode_features_old(df, feature_dict, fields):
    """
    Replace the prefixed feature strings in `df` with their integer ids,
    using one replacement dictionary per field.

    df: DataFrame whose `fields` columns hold '<field>_<value>' strings
        (the form make_feature_dict writes back into the frame).
    feature_dict: mapping (dict or pd.Series) from prefixed feature to id.
    fields: Array[String] - the column names to encode.

    Returns the result of df.replace called with one mapping per column.
    """
    # df.replace(feature_dict) doesn't fit in memory.
    # optimize by splitting the feature_dict into dictionaries corresponding to fields.
    replace_dict = dict()
    for c in fields:
        # Match on '<field>_' rather than the bare field name: otherwise a
        # field such as 'C1' would also collect every 'C14_...' key, and the
        # per-field dictionaries would be larger than needed.
        prefix = f'{c}_'
        replace_dict[c] = {k: v for k, v in feature_dict.items()
                           if k.startswith(prefix)}

    return df.replace(replace_dict)

In [None]:
# Load the small training sample and parse `hour` (format YYMMDDHH) into datetimes.
df = pd.read_csv('./data/train_small.csv')
df.hour = pd.to_datetime(df.hour, format="%y%m%d%H")
# create_user / site_app_split come from models.base. Presumably they add the
# `user` column and split the rows into site and app traffic — confirm there.
df = create_user(df)
df_site, df_app = site_app_split(df)

In [None]:
# Full field list, kept for reference:
#categorical_features = ['banner_pos', 'platform_id', 'platform_domain', 'platform_category',
#                        'user', 'device_conn_type', 'C14','C17','C20','C21']
# Active list: the same fields without 'user'.
categorical_features = [
    'banner_pos', 'platform_id', 'platform_domain', 'platform_category',
    'device_conn_type',
    'C14', 'C17', 'C20', 'C21',
]

In [None]:
field_dict = make_field_dict(df_site, categorical_features)
feature_dict = make_feature_dict(df_site, categorical_features)

In [None]:
%%time
df_site_encoded = encode_features(df_site, feature_dict)

In [None]:
len(feature_dict)

In [None]:
# A bare `raise` with no active exception raises RuntimeError, which stops a
# "Run All" at this cell. NOTE(review): presumably intentional, to keep the
# cells below from running — confirm.
raise

## Develop LibFFM format converter
Refer to the Data Format section of https://github.com/guestwalk/libffm .

In [None]:
df = pd.read_csv('./data/train_tiny.csv')
categorical_features = ['C1','device_type']
df.head()

In [None]:
field_dict = make_field_dict(df, categorical_features)
feature_dict = make_feature_dict(df, categorical_features)

In [None]:
feature_dict

In [None]:
df_encoded = encode_features(df, feature_dict)

## Quickstart
`xlearn` (imported as `xl`) provides FFM, FM, and linear models. FFM models accept only the libffm format, while the other models accept CSV or libsvm format.

In [None]:
ffm_model = xl.create_ffm()

In [None]:
ffm_model.setTrain('./data/small_train.txt')

In [None]:
# Training parameters:
#  0. task: binary classification
#  1. lr: learning rate = 0.2
#  2. lambda: regularization lambda = 0.002
param = {'task':'binary', 'lr':0.2, 'lambda':0.002}

In [None]:
ffm_model.fit(param, "./model.out")

In [None]:
ffm_model.setTest('./data/small_test.txt')

In [None]:
ffm_model.predict('./model.out', './xlearn-output.txt')

## xlearn with its sklearn API (deprecated)
`FFMModel.fit` takes array-like inputs. The expected format of its `fields` argument is unclear.

In [None]:
# Estimator-style FFM model with task='binary', lr=0.2 and reg_lambda=0.002.
# The fit call below is left disabled.
ffm = xl.FFMModel(task='binary', lr=0.2, epoch=10, reg_lambda=0.002, metric='acc')
#ffm.fit('./data/small_train.txt')