In [1]:
import xlearn as xl 

In [2]:
import numpy as np 
import pandas as pd 
import os 

In [97]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from tqdm.autonotebook import tqdm 

# 使用FM

In [8]:
class FMFormat:
    def __init__(self, vector_feat, onehot_feat, continous_feat):
        self.feature_index = None  # 记录特征索引
        self.vector_feat = vector_feat
        self.onehot_feat = onehot_feat
        self.continous_feat = continous_feat
        
    def fit(self, df):
        self.feature_index = {}
        last_idx = 0
        for col in df.columns:
            ## 如果是one-hot型特征
            if col in self.onehot_feat:
                print("cat", col)
                df[col] = df[col].astype(str)
                ## 该变量对应多少种不同的值
                vals = [v for v in np.unique(df[col].values) if str(v) != "nan"]
                ## 获得对应的特征名
                names = np.asarray(list(map(lambda x: col+"_"+x, vals)))
                tmp = dict(zip(names, range(last_idx, last_idx+len(names))))
                self.feature_index.update(tmp)
                last_idx += len(names)
            elif col in self.vector_feat:
                ## 对于字符串类型的特征
                vals = []
                for data in df[col].astype(str).values:
                    if data != "nan":
                        ## 按照空格划分
                        for word in data.strip().split():
                            vals.append(word)
                vals = np.unique(vals)
                vals = filter(lambda x: x!="nan", vals)
                names = np.asarray(list(map(lambda x: col+"_"+x, vals)))
                tmp = dict(zip(names, range(last_idx, last_idx+len(names))))
                self.feature_index.update(tmp)
                last_idx += len(names)
            elif col in self.continous_feat:
                ## 如果是数值型特征
                print("con: ", col)
                self.feature_index.update({col:last_idx})
                last_idx += 1 
        return self 
    
    ## 对每一行进行转换
    def transform_row_(self, row):
        fm = []
        
        for col, val in row.loc[row != 0].to_dict().items():
            if col in self.onehot_feat:
                if str(val) != "nan":
                    name = f"{col}_{val}"
                    if name in self.feature_index:
                        fm.append("{}:1".format(self.feature_index[name]))
            elif col in self.vector_feat:
                if str(val) != "nan":
                    for word in str(val).split():
                        name = f"{col}_{word}"
                        if name in self.feature_index:
                            fm.append("{}:1".format(self.feature_index[name]))
            elif col in self.continous_feat:
                if str(val) != "nan":
                    fm.append("{}:{}".format(self.feature_index[col], val))
        return " ".join(fm)
    
    def transform(self, df):
        return pd.Series({idx:self.transform_row_(row) for idx, row in df.iterrows()})
    
    
    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)

In [79]:
def convert_to_fm(train_df, test_df=None, vector_fe=[], onehot_fe=[], contin_fe=[], path="./", label=None):
    train_ = train_df.copy()
    test_ = test_df.copy()
    
    if test_df is not None:
        df_ = pd.concat([train_, test_], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train_
        
    trans = FMFormat(vector_fe, onehot_fe, contin_fe)
    user_fm = trans.fit_transform(df_)
    
    train_ = user_fm[:train_df.shape[0]]
    if test_df is not None:
        test_fm = user_fm[train_df.shape[0]:]
    
    if label:
        Y = train_df[label].values
    else:
        raise ValueError("Please give the label")
        
    train_fm = pd.DataFrame()
    train_fm['Label'] = Y.astype(str)
    train_fm['feature'] = train_
    train_fm['all'] = train_fm[['Label', "feature"]].apply(lambda row: " ".join(row),
                                                          axis=1, raw=True)
    train_fm.drop(["Label", "feature"], axis=1, inplace=True)
    
    ## 生成训练集和验证集
    ### 生成训练集
    train_string = ""
    for i in range(int(train_fm.shape[0]*0.8)):
        train_string += train_fm['all'].values[i]
        train_string += "\n"
    train_string = train_string.strip()
    with open(os.path.join(path, "train_fm.txt"), "w", encoding="utf8") as f: 
        f.write(train_string)
    
    ### 生成验证集
    valid_string = ""
    for i in range(int(train_fm.shape[0]*0.8), train_fm.shape[0]):
        valid_string += train_fm['all'].values[i]
        valid_string += '\n'
    valid_string = valid_string.strip()
    with open(os.path.join(path, "valid_fm.txt"), "w", encoding="utf8") as f: 
        f.write(valid_string)
    
    if test_df is not None:
        test_string = ""
        for i in range(test_fm.shape[0]):
            test_string += test_fm.values[i]
            test_string += "\n"
        test_string = test_string.strip()
        with open(os.path.join(path, "test_fm.txt"), "w", encoding="utf8") as f: 
            f.write(test_string)

In [68]:
train = pd.read_csv("../data/criteo/train.csv")

In [69]:
test = pd.read_csv("../data/criteo/test.csv")

In [70]:
train.columns

Index(['Id', 'Label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',
       'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',
       'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',
       'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'],
      dtype='object')

In [43]:
con = [f for f in train.columns if f.startswith("I") and f!="Id"]
cat = [f for f in train.columns if f.startswith("C")]

## 只使用特征型变量

In [14]:
convert_to_fm(train_df=train, test_df=test, onehot_fe=cat, label="Label")

cat C1
cat C2
cat C3
cat C4
cat C5
cat C6
cat C7
cat C8
cat C9
cat C10
cat C11
cat C12
cat C13
cat C14
cat C15
cat C16
cat C17
cat C18
cat C19
cat C20
cat C21
cat C22
cat C23
cat C24
cat C25
cat C26


In [15]:
fm_model = xl.create_fm()

In [16]:
fm_model.setTrain("./train_fm.txt")
fm_model.setValidate("./valid_fm.txt")


In [17]:
param = {"task": "binary", "lr": 0.1, "lambda": 0.002, "metric": "acc"}

In [18]:
fm_model.fit(param, "./model.out")

In [19]:
fm_model.setTest("./test_fm.txt")

In [20]:
fm_model.setSigmoid()

fm_model.predict("./model.out", './output.txt')

<font size=4 color=red>**验证集准确率0.768750**</font>

## 对连续型特征进行归一化

### 使用正态归一化

In [62]:
def preprocess(train_df, test_df=None, contin_fe=[]):
    '''
    只需要处理连续型特征即可
    '''
    if test_df is not None:
        df_ = pd.concat([train_df, test_df], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train_df
    
    ss = StandardScaler()
    df_[contin_fe] = ss.fit_transform(df_[contin_fe])
    
    train_df = df_[:train_df.shape[0]]
    if test_df is not None:
        test_df = df_[train_df.shape[0]:]
        return train_df, test_df
    return train_df, None

In [63]:
train_df, test_df = preprocess(train, test, contin_fe=con)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [64]:
convert_to_fm(train_df=train_df, test_df=test_df, contin_fe=con, onehot_fe=cat, label="Label")

con:  I1
con:  I2
con:  I3
con:  I4
con:  I5
con:  I6
con:  I7
con:  I8
con:  I9
con:  I10
con:  I11
con:  I12
con:  I13
cat C1
cat C2
cat C3
cat C4
cat C5
cat C6
cat C7
cat C8
cat C9
cat C10
cat C11
cat C12
cat C13
cat C14
cat C15
cat C16
cat C17
cat C18
cat C19
cat C20
cat C21
cat C22
cat C23
cat C24
cat C25
cat C26


KeyError: 'Label'

In [34]:
fm_model = xl.create_fm()

In [35]:
fm_model.setTrain("./train_fm.txt")
fm_model.setValidate("./valid_fm.txt")

In [36]:
param = {"task": "binary", "lr": 0.1, "lambda": 0.002, "metric": "acc"}

In [37]:
fm_model.fit(param, './model.out')

<font color=red size=4>**验证集准确率0.771875**</font>

### 使用最大最小值归一化

In [52]:
def preprocess(train_df, test_df=None, contin_fe=[]):
    '''
    只需要处理连续型特征即可
    '''
    if test_df is not None:
        df_ = pd.concat([train_df, test_df], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train_df
    
    mm = MinMaxScaler()
    df_[contin_fe] = mm.fit_transform(df_[contin_fe])
    
    train_df = df_[:train_df.shape[0]]
    if test_df is not None:
        test_df = df_[train_df.shape[0]:]
        return train_df, test_df
    return train_df, None

In [53]:
train_df, test_df = preprocess(train, test, contin_fe=con)

  return self.partial_fit(X, y)


In [54]:
train.columns

Index(['Id', 'Label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',
       'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',
       'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',
       'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'],
      dtype='object')

In [46]:
convert_to_fm(train_df=train_df, test_df=test_df, 
              contin_fe=con, onehot_fe=cat, label="Label")

con:  I1
con:  I2
con:  I3
con:  I4
con:  I5
con:  I6
con:  I7
con:  I8
con:  I9
con:  I10
con:  I11
con:  I12
con:  I13
cat C1
cat C2
cat C3
cat C4
cat C5
cat C6
cat C7
cat C8
cat C9
cat C10
cat C11
cat C12
cat C13
cat C14
cat C15
cat C16
cat C17
cat C18
cat C19
cat C20
cat C21
cat C22
cat C23
cat C24
cat C25
cat C26


In [47]:
fm_model = xl.create_fm()

fm_model.setTrain("./train_fm.txt")
fm_model.setValidate("./valid_fm.txt")

param = {"task": "binary", "lr": 0.1, "lambda": 0.002, "metric": "acc"}

fm_model.fit(param, './model.out')

<font size=4 color=red>**验证集准确率0.76875**</font>

## 对连续型特征进行分箱

In [80]:
# 对跨度比较大的连续值进行分箱
## 分箱节点为：0, 25, 50, 75, 95, 100
def cut_bins(train, test=None, contin_fe=[]):
    if test is not None:
        df_ = pd.concat([train, test], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train
    ## 计算几个分位点
    ## 去除所有nan值
    for col in contin_fe:
        ### 去除nan值
        vals = df_[np.isnan(df_[col]).astype('int8') == 0][col].values
        Q0 = np.min(vals)
        Q1 = np.percentile(vals, 25)
        Q2 = np.percentile(vals, 50)
        Q3 = np.percentile(vals, 75)
        Q4 = np.percentile(vals, 95)
        Q5 = np.max(vals)
        bins = [Q0, Q1, Q2, Q3, Q4, Q5]
        bins = sorted(set(bins))
        labels = list(map(str, list(range(len(bins)-1))))
        df_[f"C_{col}"] = pd.cut(df_[col], bins=bins, labels=labels)
    
    train_ = df_[:train.shape[0]]
    if test is not None:
        test_ = df_[train.shape[0]:]
        return train_, test_
    return train_, None

In [81]:
train_df, test_df = cut_bins(train, test, contin_fe=con)

In [82]:
train_df.columns

Index(['Id', 'Label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',
       'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',
       'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17',
       'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C_I1',
       'C_I2', 'C_I3', 'C_I4', 'C_I5', 'C_I6', 'C_I7', 'C_I8', 'C_I9', 'C_I10',
       'C_I11', 'C_I12', 'C_I13'],
      dtype='object')

In [83]:
new_cat = [f for f in train_df.columns if f.startswith("C")]

In [84]:
convert_to_fm(train_df=train_df, test_df=test_df,
             onehot_fe=new_cat, label="Label")

cat C1
cat C2
cat C3
cat C4
cat C5
cat C6
cat C7
cat C8
cat C9
cat C10
cat C11
cat C12
cat C13
cat C14
cat C15
cat C16
cat C17
cat C18
cat C19
cat C20
cat C21
cat C22
cat C23
cat C24
cat C25
cat C26
cat C_I1
cat C_I2
cat C_I3
cat C_I4
cat C_I5
cat C_I6
cat C_I7
cat C_I8
cat C_I9
cat C_I10
cat C_I11
cat C_I12
cat C_I13


In [85]:
fm_model = xl.create_fm()

fm_model.setTrain("./train_fm.txt")
fm_model.setValidate("./valid_fm.txt")

param = {"task": "binary", "lr": 0.1, "lambda": 0.002, "metric": "acc"}

fm_model.fit(param, './model.out')

<font size=4 color=red>**验证集准确率0.771875**</font>

# 使用FFM

In [86]:
# 定义将数据转换为xlearn格式的数据
class FFMFormat:
    def __init__(self, vector_feat, one_hot_feat, continus_feat):
        '''
        vector_feat: 表示多个有意义的字符组成的特征，可以理解为向量型特征，缺失值用"-1"填充 
        one_hot_feat: 表示可以使用One-hot编码的特征，缺失值使用-1填充 
        continus_feat: 表示连续型特征，经过归一化处理的 
        '''
        self.field_index_ = None  # 记录场索引信息
        self.feature_index_ = None # 记录特征索引信息
        self.vector_feat = vector_feat
        self.one_hot_feat = one_hot_feat
        self.continus_feat = continus_feat
        
    def fit(self, df):
        ## 每一列对应一个场
        self.field_index_ = {col: i for i, col in enumerate(df.columns)}
        self.feature_index_ = {}
        last_idx = 0 
        for col in tqdm(df.columns):
            ## 如果对应列是one-hot型特征
            if col in self.one_hot_feat:
                print("cat: ", col)
                df[col] = df[col].astype(str)
                ## 求出该变量中共有多少种不同的值
                vals = [v for v in np.unique(df[col].values) if str(v) != "nan"]
                ## 获得对应的one-hot只有的特征名
                names = np.asarray(list(map(lambda x: col+"_"+x, vals)))
                tmp = dict(zip(names, range(last_idx, last_idx+len(names))))
                self.feature_index_[col] = tmp
                last_idx += len(names)
            elif col in self.vector_feat:
                ## 这是字符串型特征
                vals = []
                for data in df[col].apply(str):
                    if data != "nan":
                        ## 按照空格进行分割
                        for word in data.strip().split():
                            vals.append(word)
                vals = np.unique(vals)
                vals = filter(lambda x: x!="nan", vals)
                names = np.asarray(list(map(lambda x: col+"_"+x, vals)))
                tmp = dict(zip(names, range(last_idx, last_idx+len(names))))
                self.feature_index_[col] = tmp
                last_idx += len(names)
            elif col in self.continus_feat:
                ## 最后如果是数值型特征
                print("con: ", col)
                self.feature_index_[col] = last_idx
                last_idx += 1 
        return self 
    
    # 对每一行进行转换
    def transform_row_(self, row):
        ffm = []
        
        for col, val in row.loc[row != 0].to_dict().items():
            if col in self.one_hot_feat:
                name = f"{col}_{val}"
                if name in self.feature_index_[col]:
                    ffm.append("{}:{}:1".format(self.field_index_[col], self.feature_index_[col][name]))
            elif col in self.vector_feat:
                for word in str(val).split():
                    name = f"{col}_{word}"
                    if name in self.feature_index_[col]:
                        ffm.append("{}:{}:1".format(self.field_index_[col], self.feature_index_[col][name]))
            elif col in self.continus_feat:
                if str(val) != "nan": 
                    ffm.append("{}:{}:{}".format(self.field_index_[col], self.feature_index_[col], val))
        return " ".join(ffm)
    
    def transform(self, df):
        return pd.Series({idx: self.transform_row_(row) for idx, row in tqdm(df.iterrows())})
    
    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)

In [87]:
def convert_to_ffm(train_df, test_df=None, vector_fe=[], onehot_fe=[], contin_fe=[], path="./", label=None):
    
    train_ = train_df.copy()
    test_ = test_df.copy()
    
    if test_df is not None:
        df_ = pd.concat([train_, test_], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train_
    
    trans = FFMFormat(vector_fe, onehot_fe, contin_fe)
    user_ffm = trans.fit_transform(df_)
    
    train_ = user_ffm[:train_df.shape[0]]
    if test_df is not None:
        test_ffm = user_ffm[train_df.shape[0]:]
    
    if label:
        Y = train_df[label].values
    else:
        raise ValueError("Please give the label")
    
    train_ffm = pd.DataFrame()
    train_ffm["Label"] = Y.astype(str) 
    train_ffm["feature"] = train_
    train_ffm['all'] = train_ffm[['Label', "feature"]].apply(lambda row: " ".join(row), axis=1, raw=True)
    train_ffm.drop(["Label", "feature"], axis=1, inplace=True)
    
    
    ## 生成训练集和验证集
    ### 生成训练集
    train_string = ""
    for i in range(int(train_ffm.shape[0]*0.8)):
        train_string += train_ffm['all'].values[i]
        train_string += "\n"
    train_string = train_string.strip()
    with open(os.path.join(path, "train_ffm.txt"), "w", encoding="utf8") as f: 
        f.write(train_string)
    
    ### 生成验证集
    valid_string = ""
    for i in range(int(train_ffm.shape[0]*0.8), train_ffm.shape[0]):
        valid_string += train_ffm['all'].values[i]
        valid_string += '\n'
    valid_string = valid_string.strip()
    with open(os.path.join(path, "valid_ffm.txt"), "w", encoding="utf8") as f: 
        f.write(valid_string)
    
    if test_df is not None:
        test_string = ""
        for i in range(test_ffm.shape[0]):
            test_string += test_ffm.values[i]
            test_string += "\n"
        test_string = test_string.strip()
        with open(os.path.join(path, "test_ffm.txt"), "w", encoding="utf8") as f: 
            f.write(test_string)

## 使用归一化之后的连续特征

In [88]:
def preprocess(train_df, test_df=None, contin_fe=[]):
    '''
    只需要处理连续型特征即可
    '''
    if test_df is not None:
        df_ = pd.concat([train_df, test_df], axis=0, sort=False, ignore_index=True)
    else:
        df_ = train_df
    
    ss = StandardScaler()
    df_[contin_fe] = ss.fit_transform(df_[contin_fe])
    
    train_df = df_[:train_df.shape[0]]
    if test_df is not None:
        test_df = df_[train_df.shape[0]:]
        return train_df, test_df
    return train_df, None

In [94]:
train_df, test_df = preprocess(train, test, contin_fe=con)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [98]:
convert_to_ffm(train_df=train_df, test_df=test_df, contin_fe=con, onehot_fe=cat,
              label="Label")

HBox(children=(IntProgress(value=0, max=41), HTML(value='')))

con:  I1
con:  I2
con:  I3
con:  I4
con:  I5
con:  I6
con:  I7
con:  I8
con:  I9
con:  I10
con:  I11
con:  I12
con:  I13
cat:  C1
cat:  C2
cat:  C3
cat:  C4
cat:  C5
cat:  C6
cat:  C7
cat:  C8
cat:  C9
cat:  C10
cat:  C11
cat:  C12
cat:  C13
cat:  C14
cat:  C15
cat:  C16
cat:  C17
cat:  C18
cat:  C19
cat:  C20
cat:  C21
cat:  C22
cat:  C23
cat:  C24
cat:  C25
cat:  C26



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [99]:
ffm_model = xl.create_ffm()

ffm_model.setTrain("./train_ffm.txt")
ffm_model.setValidate("./valid_ffm.txt")

param = {"task": "binary", "lr": 0.1, "lambda": 0.002, "metric": "acc"}

ffm_model.fit(param, './model.out')

<font size=4 color=red>**验证集准确率0.7750**</font>

## 对连续型特征进行分箱

In [100]:
train_df, test_df = cut_bins(train, test, contin_fe=con)

In [101]:
new_cat = [f for f in train_df if f.startswith("C")]

In [103]:
convert_to_ffm(train_df, test_df, onehot_fe=new_cat, label="Label")

HBox(children=(IntProgress(value=0, max=54), HTML(value='')))

cat:  C1
cat:  C2
cat:  C3
cat:  C4
cat:  C5
cat:  C6
cat:  C7
cat:  C8
cat:  C9
cat:  C10
cat:  C11
cat:  C12
cat:  C13
cat:  C14
cat:  C15
cat:  C16
cat:  C17
cat:  C18
cat:  C19
cat:  C20
cat:  C21
cat:  C22
cat:  C23
cat:  C24
cat:  C25
cat:  C26
cat:  C_I1
cat:  C_I2
cat:  C_I3
cat:  C_I4
cat:  C_I5
cat:  C_I6
cat:  C_I7
cat:  C_I8
cat:  C_I9
cat:  C_I10
cat:  C_I11
cat:  C_I12
cat:  C_I13



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [104]:
ffm_model = xl.create_ffm()

ffm_model.setTrain("./train_ffm.txt")
ffm_model.setValidate("./valid_ffm.txt")

param = {"task": "binary", "lr": 0.1, "lambda": 0.002, "metric": "acc"}

ffm_model.fit(param, './model.out')

<font color=red size=4>**验证集准确率0.771875**</font>