In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

from time import time

import ffm

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
sites = ['site%s' % i for i in range(1, 11)]

df_train = pd.read_csv('train_sessions.csv', dtype={s: 'str' for s in sites})

In [4]:
for i in range(1, 11):
    s = 'site%d' % i
    df_train[s] = df_train[s].fillna('na')
    
    t = 'time%d' % i
    df_train[t] = pd.to_datetime(df_train[t])

In [5]:
df_train = df_train.sort_values(by='time1').reset_index(drop=True)

In [6]:
df = df_train
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [7]:
cv = TfidfVectorizer(token_pattern='\S+', min_df=10, max_df=0.5, stop_words={'na'}, ngram_range=(1, 3))
X_ohe = cv.fit_transform(df_train.sites)

In [8]:
df_train['hour_start'] = df_train.time1.dt.hour
df_train['weekday'] = df_train.time1.dt.weekday

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
time_ohe = OneHotEncoder(dtype=np.uint8)
X_time = time_ohe.fit_transform(df_train[['hour_start', 'weekday']])

In [11]:
y = df_train.target.values
X = sp.hstack([X_ohe, X_time], format='csr')

In [12]:
n = len(df_train) // 10

X_train = X[:-n]
y_train = y[:-n]

X_val = X[-n:]
y_val = y[-n:]

In [13]:
def gen_fft_rows(X):
    n, _ = X.shape
    res = []
    
    for i in range(n):
        row = X.getrow(i)
        ffm_row = [(1, f, v) for (f, v) in zip(row.indices, row.data)]
        res.append(ffm_row)
    
    return res

In [14]:
rows_train = gen_fft_rows(X_train)
ffm_train = ffm.FFMData(rows_train, y_train)

In [15]:
rows_val = gen_fft_rows(X_val)
ffm_val = ffm.FFMData(rows_val, y_val)

In [16]:
model = ffm.FFM(eta=0.05, lam=0.000001, k=8)
model.init_model(ffm_train)

for i in range(5):
    print('iteration %d, ' % i, end='')
    model.iteration(ffm_train)

    y_pred = model.predict(ffm_val)
    auc = roc_auc_score(y_val, y_pred)
    print('train auc %.4f' % auc)

iteration 0, train auc 0.9724
iteration 1, train auc 0.9769
iteration 2, train auc 0.9805
iteration 3, train auc 0.9806
iteration 4, train auc 0.9809


In [17]:
rows = gen_fft_rows(X)
ffm_data = ffm.FFMData(rows, y)

In [18]:
model = ffm.FFM(eta=0.05, lam=0.000001, k=8)
model.init_model(ffm_data)

for i in range(5):
    print('iteration %d, ' % i, end='')
    model.iteration(ffm_train)

iteration 0, iteration 1, iteration 2, iteration 3, iteration 4, 

In [19]:
df_test = pd.read_csv('test_sessions.csv', dtype={s: 'str' for s in sites})

for i in range(1, 11):
    s = 'site%d' % i
    df_test[s] = df_test[s].fillna('na')

    t = 'time%d' % i
    df_test[t] = pd.to_datetime(df_test[t])

In [20]:
df_test['hour_start'] = df_test.time1.dt.hour
df_test['weekday'] = df_test.time1.dt.weekday

In [21]:
df = df_test
df['sites'] = df.site1 + ' ' + df.site2 + ' ' + df.site3 + ' ' + df.site4 + ' ' + df.site5 + ' ' + \
              df.site6 + ' ' + df.site7 + ' ' + df.site8 + ' ' + df.site9 + ' ' + df.site10  

In [22]:
X_test_ohe = cv.transform(df_test.sites)
X_test_time = time_ohe.transform(df_test[['hour_start', 'weekday']])

In [23]:
X_test = sp.hstack([X_test_ohe, X_test_time], format='csr')

In [24]:
y_test = np.zeros(X_test.shape[0])

In [25]:
rows = gen_fft_rows(X_test)
ffm_test = ffm.FFMData(rows, y_test)

In [29]:
pred = model.predict(ffm_test)

In [44]:
df_res = pd.DataFrame()
df_res['session_id'] = df_test.session_id
df_res['target'] = np.log(pred)

In [45]:
df_res.to_csv('benchmark_ffm.csv', index=False)
!gzip benchmark_ffm.csv

- CV: 0.928, LB: 0.92081
- CV: 0.981, LB: 0.94803