In [None]:
# %load main.py
import os
import time
import argparse
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from tensorboardX import SummaryWriter

import network
import config
import evaluate
import data_utils


def _str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``.

    Without an explicit ``type``, argparse stores any supplied value as a
    non-empty string, so ``--batch_norm False`` would still be truthy.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


# Hyper-parameters and run options for the experiment.
parser = argparse.ArgumentParser()
parser.add_argument("--lr", 
    type=float, 
    default=0.05, 
    help="learning rate")
# Kept as a string; the model-creation cell turns it into a list.
parser.add_argument("--dropout", 
    default='[0.5, 0.2]',  
    help="dropout rate for FM and MLP")
parser.add_argument("--batch_size", 
    type=int, 
    default=128, 
    help="batch size for training")
parser.add_argument("--epochs", 
    type=int,
    default=100, 
    help="training epochs")
parser.add_argument("--hidden_factor", 
    type=int,
    default=64,  # modified at some point; the original value was 64
    help="predictive factors numbers in the model")
parser.add_argument("--layers", 
    default='[5000,1000]',  # modified: two hidden layers; previously 64 (a single hidden layer)
    help="size of layers in MLP model, '[]' is NFM-0")

parser.add_argument("--n_class",
    type=int,
    default=16,
    help="number of class")


parser.add_argument("--lamda", 
    type=float, 
    default=0.0, 
    help="regularizer for bilinear layers")
parser.add_argument("--batch_norm", 
    type=_str2bool,
    default=True, 
    help="use batch_norm or not")
parser.add_argument("--pre_train", 
    action='store_true', 
    default=False, 
    help="whether use the pre-train or not")
parser.add_argument("--out", 
    type=_str2bool,
    default=True, 
    help="save model or not")

parser.add_argument("--gpu", 
    type=str,
    default="0",  
    help="gpu card ID")
# An explicit empty list makes argparse ignore sys.argv (which belongs to the
# notebook kernel), so every option takes its default value.
args = parser.parse_args(args=[])

# Restrict the visible CUDA devices to the selected card.
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
cudnn.benchmark = True








In [None]:
import numpy as np
import torch.utils.data as data
import pandas as pd
import config


def read_features(file, features):
    """Extend ``features`` with an integer id for every unseen cell value in ``file``.

    The CSV is loaded with pandas, its first two columns are discarded, and the
    remaining cells are visited row by row.  Ids are handed out consecutively,
    starting from the size of ``features`` on entry.  The dictionary is updated
    in place and also returned.
    """
    next_id = len(features)

    # Everything from the third column onwards is treated as feature data.
    table = np.array(pd.read_csv(file, sep=','))[:, 2:]
    print('n_fd.shape:')
    print(table.shape)

    for row in table:
        for value in row:
            if value in features:
                continue
            features[value] = next_id
            next_id += 1

    return features

In [None]:
def map_features():
    """Build one value-to-id dictionary spanning the train, validation and test files.

    Returns:
        tuple: ``(features, count)`` where ``count`` is ``len(features)``.
    """
    # NOTE(review): this notebook defines map_features in two cells with the
    # same body; the definition executed last is the one in effect.
    features = {}
    for path in (config.train_libfm, config.valid_libfm, config.test_libfm):
        features = read_features(path, features)

    total = len(features)
    print("number of features: {}".format(total))
    return features, total

In [None]:
def map_features():
    """Collect every feature value found in the three data files into one id map.

    Returns:
        tuple: the value-to-id dictionary and its number of entries.
    """
    # NOTE(review): an identical map_features is defined in an earlier cell;
    # this later definition replaces it when the cells run in order.
    vocabulary = {}
    for split in ("train", "valid", "test"):
        vocabulary = read_features(getattr(config, split + "_libfm"), vocabulary)

    size = len(vocabulary)
    print("number of features: {}".format(size))
    return vocabulary, size

In [None]:
def one_hot(labels, classes, label_smoothing=0.2):
    """Encode integer class labels as label-smoothed one-hot rows.

    Args:
        labels: Sequence with one entry per sample. Each entry is used to
            index a row of the output, so it may be an integer or an integer
            array (for example a length-1 row taken from a 2-D label array).
        classes: Number of columns of the output.
        label_smoothing: Amount of probability mass spread uniformly over
            all ``classes`` columns.

    Returns:
        ``float32`` array of shape ``(len(labels), classes)``. Every entry is
        ``label_smoothing / classes`` except the labelled positions, which
        hold ``1 - label_smoothing + label_smoothing / classes``.
    """
    n = len(labels)
    off_value = label_smoothing / classes
    # Hoisted out of the loop: the "on" value is the same for every sample.
    on_value = 1 - label_smoothing + off_value
    output = np.full((n, classes), off_value, dtype=np.float32)
    # The per-row debug print was removed: it emitted one line per sample.
    for row, label in enumerate(labels):
        output[row, label] = on_value
    return output

In [None]:
class FMData(data.Dataset):
    """Dataset yielding ``(features, feature_values, label)`` triples built from CSV files.

    Args:
        file: Path of the feature CSV. It is read with pandas and its first
            two columns are discarded; every remaining cell is looked up in
            ``feature_map``.
        label_file: Path of the label CSV. Its first column is discarded and
            the rest is passed to ``one_hot``.
        feature_map: Mapping from raw cell value to integer feature id.
        n_class: Number of classes used for the one-hot labels.

    Raises:
        KeyError: if a cell value of ``file`` is missing from ``feature_map``.
    """
    def __init__(self, file,label_file, feature_map,n_class=16):
        super(FMData, self).__init__()

        # Same column slice as read_features uses: drop the first two columns.
        raw = np.array(pd.read_csv(file, sep=','))[:, 2:]
        nrow, ncol = raw.shape

        # Replace each raw value by its integer id.
        self.features = np.array(
            [[feature_map[value] for value in row] for row in raw])

        # Every feature gets the value 1. Allocated directly instead of going
        # through an nrow x ncol nested Python list.
        self.feature_values = np.ones((nrow, ncol), dtype=int)

        # Drop the first label column, then build smoothed one-hot targets.
        raw_label = np.array(pd.read_csv(label_file, sep=','))[:, 1:]
        self.label = one_hot(raw_label, n_class)

        # Size summary of the three arrays (their lengths are not checked
        # against each other here).
        print(len(self.features))
        print(len(self.feature_values))
        print(len(self.label))

    def __len__(self):
        """Return the number of label rows."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return ``(features, feature_values, label)`` for sample ``idx``."""
        label = self.label[idx]
        features = self.features[idx]
        feature_values = self.feature_values[idx]
        return features, feature_values, label

In [None]:
#############################  PREPARE DATASET #########################
# One vocabulary is built over all three files so feature ids agree between
# the train, validation and test datasets.
features_map,num_features=map_features()
print('num_features:',num_features)

# NOTE(review): the batch size is fixed at 100 here; args.batch_size is not used.
# NOTE(review): the validation and test loaders also shuffle and drop the last
# partial batch, so evaluation skips up to 99 samples — confirm this is intended.
loader_options = dict(drop_last=True, batch_size=100, shuffle=True, num_workers=4)

train_dataset = FMData(config.train_libfm,config.train_label,features_map,
                       n_class=args.n_class)
train_loader = data.DataLoader(train_dataset, **loader_options)

validate_dataset = FMData(config.valid_libfm,config.valid_label,features_map,
                          n_class=args.n_class)
# Fix: this loader used to wrap train_dataset, so "validation" results were
# computed on the training data.
validate_loader = data.DataLoader(validate_dataset, **loader_options)
# Alias so later cells can use either name for the validation loader.
valid_loader = validate_loader

test_dataset = FMData(config.test_libfm,config.test_label,features_map,
                      n_class=args.n_class)
test_loader = data.DataLoader(test_dataset, **loader_options)

In [None]:
##############################  CREATE MODEL ###########################
if args.pre_train:
    assert os.path.exists(config.FM_model_path), 'lack of FM model'
    assert config.model == 'NFM', 'only support NFM for now'
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    FM_model = torch.load(config.FM_model_path)
else:
    FM_model = None
# NOTE(review): FM_model is not passed to network.NFM below, so the pre-trained
# weights are loaded but not used by the model built in this cell.

if config.model == 'FM':
    # Fix: this branch used to call `model.FM`, but no `model` module is
    # imported in the visible notebook (the import is `network`).
    # NOTE(review): assumes network.FM exists with this signature — confirm.
    # NOTE(review): eval() runs the --dropout string as Python code.
    model = network.FM(num_features, args.hidden_factor,
                    args.batch_norm, eval(args.dropout))
else:
    model=network.NFM(config,[],train_dataset,args.n_class)
model.cuda()

if config.optimizer == 'Adagrad':
    optimizer = optim.Adagrad(
        model.parameters(), lr=args.lr, initial_accumulator_value=1e-8)
elif config.optimizer == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
elif config.optimizer == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif config.optimizer == 'Momentum':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.95)
else:
    # Fail here instead of leaving `optimizer` undefined (or stale from an
    # earlier run of the notebook) until the training cell uses it.
    raise ValueError(
        "unsupported config.optimizer: {!r}".format(config.optimizer))

if config.loss_type == 'square_loss':
    criterion = nn.MSELoss(reduction='sum')
elif config.loss_type=='cross_entropy_loss':  # added: cross-entropy loss option
    # NOTE(review): FMData produces float one-hot targets; check that the
    # installed torch accepts class-probability targets for CrossEntropyLoss.
    criterion=nn.CrossEntropyLoss(reduction='mean')
else: # log_loss
    criterion = nn.BCEWithLogitsLoss(reduction='sum')

# writer = SummaryWriter() # for visualization

In [None]:
###############################  TRAINING ############################
# best_epoch starts as None so the final report cannot hit an unbound name
# when no epoch runs or no epoch improves on the initial best value.
count, best_rmse, best_epoch = 0, float("inf"), None
print(model)
for epoch in range(args.epochs):
    model.train() # Enable dropout and batch_norm
    start_time = time.time()

    for features, feature_values, label in train_loader:
        features = features.cuda()
        feature_values = feature_values.cuda()
        label = label.cuda()

        model.zero_grad()
        prediction = model(features, feature_values)
        loss = criterion(prediction, label) 
        # Norm penalty on the embedding table, scaled by --lamda.
        loss += args.lamda * model.embeddings.weight.norm()
        loss.backward()
        optimizer.step()
        # writer.add_scalar('data/loss', loss.item(), count)
        count += 1

    model.eval()
    train_result = evaluate.metrics(model, train_loader)
    # Fix: the dataset cell names this loader `validate_loader`; the previous
    # name `valid_loader` was never defined there.
    valid_result = evaluate.metrics(model, validate_loader)
    test_result = evaluate.metrics(model, test_loader)

    print("Runing Epoch {:03d} ".format(epoch) + "costs " + time.strftime(
                        "%H: %M: %S", time.gmtime(time.time()-start_time)))
    # NOTE(review): values are labelled RMSE whatever evaluate.metrics returns — confirm.
    print("Train_RMSE: {:.3f}, Valid_RMSE: {:.3f}, Test_RMSE: {:.3f}".format(
                        train_result, valid_result, test_result))

    # NOTE(review): the checkpoint is selected on the test metric rather than
    # the validation metric — confirm this is intended.
    if test_result < best_rmse:
        best_rmse, best_epoch = test_result, epoch
        if args.out:
            # Creates missing parent directories and tolerates an existing one.
            os.makedirs(config.model_path, exist_ok=True)
            torch.save(model, 
                '{}{}.pth'.format(config.model_path, config.model))

if best_epoch is None:
    print("End. No epoch improved on the initial best value.")
else:
    print("End. Best epoch {:03d}: Test_RMSE is {:.3f}".format(best_epoch, best_rmse))