In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from time import time
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
# Seed NumPy's global random generator.
np.random.seed(42)

# The small sample file is read only to obtain the column names.
clicks_1k = pd.read_csv('raw_data/train_1k.csv')
all_cols = clicks_1k.columns
not_load_cols = ['id']

# Load every column except those in not_load_cols. 'click' and 'hour' get
# compact integer dtypes; every other column is read as a pandas categorical.
load_cols = all_cols.difference(not_load_cols)
integer_types = {'click': 'int8', 'hour': 'int32'}
load_types = {col: integer_types.get(col, 'category') for col in load_cols}

traintst = pd.read_csv('raw_data/traintst.csv', usecols=load_cols, dtype=load_types)
traintst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45006431 entries, 0 to 45006430
Data columns (total 23 columns):
click               int8
hour                int32
C1                  category
banner_pos          category
site_id             category
site_domain         category
site_category       category
app_id              category
app_domain          category
app_category        category
device_id           category
device_ip           category
device_model        category
device_type         category
device_conn_type    category
C14                 category
C15                 category
C16                 category
C17                 category
C18                 category
C19                 category
C20                 category
C21                 category
dtypes: category(21), int32(1), int8(1)
memory usage: 2.1 GB


In [3]:
def df_to_ffm(x,y,file,cat_cols):
    """Write features ``x`` and labels ``y`` to ``file`` as FFM-style text.

    One line is written per row, in the form
    ``<label> <field>:<feature>:1 <field>:<feature>:1 ...`` where ``<field>``
    is the positional column number in ``x`` and ``<feature>`` is the cell
    value formatted with ``%d`` (so float values are truncated to integers).

    Rows of ``x`` and entries of ``y`` are paired by position; neither input
    is modified.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns; every value must be formattable with ``%d``.
    y : pandas.Series or array-like
        One label per row of ``x``; each is written with ``str()``.
    file : str
        Path of the output file; it is created or overwritten.
    cat_cols : iterable of column names
        Accepted for interface compatibility. Categorical and non-categorical
        columns are written in exactly the same way, so this argument does
        not influence the output.

    Raises
    ------
    ValueError
        If ``y`` does not have exactly one entry per row of ``x``.
    """
    n_rows, n_cols = x.shape
    # Positional view of the labels; avoids re-indexing the caller's Series.
    labels = np.asarray(y)
    if len(labels) != n_rows:
        raise ValueError(
            'Length mismatch: x has %d rows but y has %d labels' % (n_rows, len(labels))
        )

    if n_cols:
        # Stream rows as plain tuples instead of one .iat lookup per cell.
        rows = x.itertuples(index=False, name=None)
    else:
        # itertuples yields nothing for a frame without columns, but each
        # label must still be written on its own line.
        rows = (() for _ in range(n_rows))

    with open(file,'w') as f:
        for label, row in zip(labels, rows):
            features = ''.join(' %d:%d:1' % (j, value) for j, value in enumerate(row))
            f.write(str(label) + features + '\n')
        
        
def ffm_encode(df):
    """Replace every categorical column of ``df`` with int32 label codes, in place.

    Each column of dtype ``category`` is converted to strings and passed
    through a fresh ``LabelEncoder``; the resulting codes overwrite the
    column in ``df``. Columns of other dtypes are left untouched.

    Returns the Index of the column names that were encoded.
    """
    categorical = df.select_dtypes(include=['category']).columns
    for name in categorical:
        as_text = df[name].astype('str')
        df[name] = LabelEncoder().fit_transform(as_text).astype('int32')
    return categorical

# Smoke test on a five-row toy frame: encode it, export it, and print the file.
df = pd.DataFrame({
    'a': np.arange(5).astype('float'),
    'b': ['a', 'b', 'a', 'c', 'b'],
    'c': ['b', 'a', 'd', 'c', 'a'],
    'd': np.arange(5),
    'e': np.arange(5),
})
df = df.astype({'b': 'category', 'c': 'category', 'd': 'category'})
c_cols = ffm_encode(df)
# Column 'e' is used as the label; the remaining columns are the features.
df_to_ffm(df.drop(['e'], axis=1), df.e, 'ffm_data/t.o', c_cols)
with open('ffm_data/t.o') as f:
    print(f.read())

0 0:0:1 1:0:1 2:1:1 3:0:1
1 0:1:1 1:1:1 2:0:1 3:1:1
2 0:2:1 1:0:1 2:3:1 3:2:1
3 0:3:1 1:2:1 2:2:1 3:3:1
4 0:4:1 1:1:1 2:0:1 3:4:1



In [4]:
t=time()
# NOTE(review): the positional splits below assume the rows of traintst are
# ordered by day, with the test day last (an earlier version built the frame
# with pd.concat([train_clicks,test_clicks])) — confirm.

# Keep the raw timestamp in 'time', then derive the hour (last two digits)
# and the day (the two digits before them). Presumably YYMMDDHH — confirm.
traintst['time']=traintst['hour']
traintst['hour']=(traintst['time']%100).astype('int8')
traintst['day']=(traintst['time']//100%100).astype('int8')
traintst.info()

# The last row's day is the test day; the day before it is the validation day.
test_day=int(traintst['day'].iloc[-1])
val_day=test_day-1
# Count rows with a boolean sum rather than materialising filtered copies of
# the full frame just to read their length.
train_len=int((traintst['day']<val_day).sum())
train_val_len=int((traintst['day']<test_day).sum())

# Make hour and day categorical so ffm_encode label-encodes them too.
traintst['hour']=traintst['hour'].astype('category')
traintst['day']=traintst['day'].astype('category')

c_cols=ffm_encode(traintst)
traintst.drop(['time','day'],axis=1,inplace=True)
traintst.info()

# Positional train / validation / test partitions.
train_c=traintst.iloc[:train_len]
val_c=traintst.iloc[train_len:train_val_len]
test_c=traintst.iloc[train_val_len:]

# Export each partition: 'click' is the label, every other column a feature.
for part,path in ((train_c,'ffm_data/train.ffm'),
                  (val_c,'ffm_data/val.ffm'),
                  (test_c,'ffm_data/test.ffm')):
    df_to_ffm(part.drop('click',axis=1),part['click'].fillna(0),path,c_cols)

print('time cost %ds'%int(time()-t))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45006431 entries, 0 to 45006430
Data columns (total 25 columns):
click               int8
hour                int8
C1                  category
banner_pos          category
site_id             category
site_domain         category
site_category       category
app_id              category
app_domain          category
app_category        category
device_id           category
device_ip           category
device_model        category
device_type         category
device_conn_type    category
C14                 category
C15                 category
C16                 category
C17                 category
C18                 category
C19                 category
C20                 category
C21                 category
time                int32
day                 int8
dtypes: category(21), int32(1), int8(3)
memory usage: 2.2 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45006431 entries, 0 to 45006430
Data columns (total 23 columns):