In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from time import time
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
# with open('raw_data/test_filled.csv','w') as f:
#     for i,line in enumerate(open('raw_data/test.csv')):
#         if i > 0:
#             p=line.index(',')
#             f.write(line[:p]+',0'+line[p:])

In [3]:
np.random.seed(42)

# Only the header of the small sample file is needed here: it supplies the column names.
clicks_1k=pd.read_csv('raw_data/train_1k.csv')
all_cols=clicks_1k.columns
not_load_cols=['id','device_id','device_ip']
load_cols=all_cols.difference(not_load_cols)

# Columns read with a compact integer dtype; every other loaded column is read as 'category'.
int_col_types={
    'click':'int8',
    'hour':'int32',
    'banner_pos':'int8',
    'device_type':'int8',
    'device_conn_type':'int8',
}
load_types={}
for c in load_cols:
    load_types[c]=int_col_types.get(c,'category')

# Alternative loading path kept for reference: read train and test separately
# and give the test rows a zero 'click' column.
# train_clicks=pd.read_csv('raw_data/train.csv',dtype=load_types,usecols=load_cols)
# test_clicks=pd.read_csv('raw_data/test.csv',dtype=load_types,usecols=load_cols.difference(['click']))
# test_clicks['click']=np.zeros(test_clicks.shape[0]).astype('int8')
# train_clicks.info()
# test_clicks.info()

traintst=pd.read_csv('raw_data/traintst.csv',dtype=load_types,usecols=load_cols)
traintst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45006431 entries, 0 to 45006430
Data columns (total 21 columns):
click               int8
hour                int32
C1                  category
banner_pos          int8
site_id             category
site_domain         category
site_category       category
app_id              category
app_domain          category
app_category        category
device_model        category
device_type         int8
device_conn_type    int8
C14                 category
C15                 category
C16                 category
C17                 category
C18                 category
C19                 category
C20                 category
C21                 category
dtypes: category(16), int32(1), int8(4)
memory usage: 1.3 GB


In [4]:
def df_to_fm(x,y,file,cat_cols):
    """Write features ``x`` and labels ``y`` to ``file`` as sparse text, one line per row.

    Each line is ``<label> <index>:<value> ...``:

    * for a column whose name is in ``cat_cols`` the cell value itself is the
      feature index and the written value is 1 (``'<cell>:1'``);
    * for any other column the column position is the index and the cell is
      the value (``'<position>:<cell>'``).

    Rows of ``x`` and ``y`` are paired by position; their indexes are ignored
    and, unlike the previous implementation, are NOT overwritten on the
    caller's objects.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame. Columns named in ``cat_cols`` must hold integers.
    y : pandas.Series or array-like
        One label per row of ``x``.
    file : str
        Output path; it is opened in text write mode and truncated.
    cat_cols : iterable of column names
        Columns to emit in the ``index:1`` form.

    Raises
    ------
    ValueError
        If ``y`` does not have exactly one entry per row of ``x``.
    """
    n_rows,n_cols=x.shape
    labels=np.asarray(y)
    if len(labels)!=n_rows:
        raise ValueError('Length mismatch: x has %d rows but y has %d elements'%(n_rows,len(labels)))

    cat_names=set(cat_cols)
    is_cat=[x.columns[j] in cat_names for j in range(n_cols)]
    # One array per column: keeps each column's own dtype (so values print the
    # same as before) and avoids a DataFrame.iat lookup for every single cell.
    col_values=[np.asarray(x.iloc[:,j]) for j in range(n_cols)]

    with open(file,'w') as f:
        for i in range(n_rows):
            parts=[str(labels[i])]
            for j in range(n_cols):
                v=col_values[j][i]
                if is_cat[j]:
                    parts.append('%d:1' % v)
                else:
                    parts.append('%d:%s' % (j,v))
            f.write(' '.join(parts)+'\n')
        
        
def fm_encode(df):
    """Replace every 'category' column of ``df`` IN PLACE with int32 feature indices.

    Indices are allocated column by column, in column order, starting at
    ``df.shape[1] + 1``; each column continues after the highest index
    actually used by the previous one. This keeps the indices of different
    categorical columns disjoint and above the positional range
    ``0 .. df.shape[1]-1``.

    Missing values (category code -1) get an index of their own, placed just
    before the column's real categories. The slot is only reserved when the
    column actually contains missing values, so frames without missing values
    are encoded exactly as before. Previously a missing value was mapped onto
    the last index used by the preceding column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; its categorical columns are overwritten.

    Returns
    -------
    pandas.Index
        Names of the columns that were encoded.
    """
    cat_cols = df.select_dtypes(include=['category']).columns
    fm_idx=df.shape[1]
    for c in cat_cols:
        codes=df[c].cat.codes.astype('int32')
        has_missing=bool((codes<0).any())
        # Code -1 (missing) would otherwise land on fm_idx, which is already taken.
        offset=fm_idx+1+(1 if has_missing else 0)
        df[c]=codes+offset
        # An empty column uses no indices; keep the running maximum unchanged.
        if len(codes):
            fm_idx=int(df[c].max())

    return cat_cols

# Smoke test of fm_encode + df_to_fm on a five-row frame:
# 'a' stays numeric, 'b'/'c'/'d' are categorical, 'e' is used as the label.
df=pd.DataFrame({
    'a':np.arange(5).astype('float'),
    'b':['a','b','a','c','b'],
    'c':['b','a','d','c','a'],
    'd':np.arange(5),
    'e':np.arange(5),
})
for col in ('b','c','d'):
    df[col]=df[col].astype('category')

c_cols=fm_encode(df)
df_to_fm(df.drop(['e'],axis=1),df['e'],'fm_data/t.o',c_cols)

# Print the written file so the encoding can be checked by eye.
with open('fm_data/t.o') as f:
    print(f.read())

0 0:0.0 6:1 10:1 13:1
1 0:1.0 7:1 9:1 14:1
2 0:2.0 6:1 12:1 15:1
3 0:3.0 8:1 11:1 16:1
4 0:4.0 7:1 9:1 17:1



In [5]:
t=time()
# traintst=pd.concat([train_clicks,test_clicks])

# The arithmetic below treats the raw 'hour' value as an integer whose last two
# decimal digits are the hour of day and whose two digits before that are the day.
# Copy it first: the 'hour' column is overwritten on the next line.
raw_time=traintst['hour'].copy()
traintst['hour']=(raw_time%100).astype('int8')
traintst['day']=(raw_time//100%100).astype('int8')
del raw_time
traintst.info()

# The split is positional: the last day present is the test set, the day
# before it is validation, everything earlier is training.
test_day=int(traintst['day'].iloc[-1])
val_day=test_day-1
# The iloc slices below are only correct if rows are ordered by day.
assert traintst['day'].is_monotonic_increasing, 'rows must be ordered by day for the positional split'
# Count rows with a boolean mask instead of materialising filtered copies of the frame.
train_len=int((traintst['day']<val_day).sum())
train_val_len=int((traintst['day']<test_day).sum())

# Hour and day are encoded as categorical features like the other categories.
traintst['hour']=traintst.hour.astype('category')
traintst['day']=traintst.day.astype('category')

c_cols=fm_encode(traintst)
traintst.info()

train_c=traintst.iloc[:train_len]
val_c=traintst.iloc[train_len:train_val_len]
test_c=traintst.iloc[train_val_len:]

for split_df,fm_path in [(train_c,'fm_data/train.fm'),
                         (val_c,'fm_data/val.fm'),
                         (test_c,'fm_data/test.fm')]:
    df_to_fm(split_df.drop('click',axis=1),split_df.click.fillna(0),fm_path,c_cols)

print('time cost %ds'%int(time()-t))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45006431 entries, 0 to 45006430
Data columns (total 22 columns):
click               int8
hour                int8
C1                  category
banner_pos          int8
site_id             category
site_domain         category
site_category       category
app_id              category
app_domain          category
app_category        category
device_model        category
device_type         int8
device_conn_type    int8
C14                 category
C15                 category
C16                 category
C17                 category
C18                 category
C19                 category
C20                 category
C21                 category
day                 int8
dtypes: category(16), int8(6)
memory usage: 1.3 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45006431 entries, 0 to 45006430
Data columns (total 22 columns):
click               int8
hour                int32
C1                  int32
banner_pos          int8
sit