In [1]:
import numpy as np
import torch.utils.data as data

import config


def read_features(file, features):
	""" Read features from the given file. """
	i = len(features)
	with open(file, 'r') as fd:
		line = fd.readline()
		while line:
			items = line.strip().split()
			for item in items[1:]:
				item = item.split(':')[0]
				if item not in features:
					features[item] = i
					i += 1
			line = fd.readline()
	return features

In [2]:
config.train_libfm="data/frappe/frappe.train.libfm"
config.valid_libfm="data/frappe/frappe.validation.libfm"
config.test_libfm="data/frappe/frappe.test.libfm"

In [5]:
features={}
features = read_features(config.train_libfm, features)
print(features)

{'451': 0, '4149': 1, '5041': 2, '5046': 3, '5053': 4, '5055': 5, '5058': 6, '5060': 7, '5069': 8, '5149': 9, '91': 10, '3503': 11, '5047': 12, '5056': 13, '5065': 14, '5095': 15, '168': 16, '983': 17, '5040': 18, '5050': 19, '5054': 20, '5207': 21, '620': 22, '1743': 23, '5045': 24, '5051': 25, '5061': 26, '5073': 27, '46': 28, '2692': 29, '5049': 30, '5086': 31, '5211': 32, '576': 33, '4933': 34, '5075': 35, '71': 36, '966': 37, '5043': 38, '5172': 39, '43': 40, '974': 41, '5048': 42, '5252': 43, '2928': 44, '5062': 45, '14': 46, '2396': 47, '5039': 48, '5076': 49, '107': 50, '4380': 51, '80': 52, '2662': 53, '5070': 54, '5243': 55, '190': 56, '1093': 57, '5052': 58, '5105': 59, '131': 60, '1432': 61, '5099': 62, '5215': 63, '116': 64, '986': 65, '5074': 66, '92': 67, '4253': 68, '16': 69, '1016': 70, '5059': 71, '5063': 72, '5156': 73, '38': 74, '2047': 75, '432': 76, '1060': 77, '488': 78, '957': 79, '5064': 80, '5110': 81, '87': 82, '3379': 83, '5162': 84, '516': 85, '3857': 86, '

In [3]:
def map_features():
	""" Get the number of existing features in all the three files. """
	features = {}
	features = read_features(config.train_libfm, features)
	features = read_features(config.valid_libfm, features)
	features = read_features(config.test_libfm, features)
	print("number of features: {}".format(len(features)))
	return features, len(features)

In [40]:
parser = argparse.ArgumentParser()
parser.add_argument("--lr", 
	type=float, 
	default=0.05, 
	help="learning rate")
parser.add_argument("--dropout", 
	default='[0.5, 0.2]',  
	help="dropout rate for FM and MLP")
parser.add_argument("--batch_size", 
	type=int, 
	default=128, 
	help="batch size for training")
parser.add_argument("--epochs", 
	type=int,
	default=100, 
	help="training epochs")
parser.add_argument("--hidden_factor", 
	type=int,
	default=64, 
	help="predictive factors numbers in the model")
parser.add_argument("--layers", 
	default='[64]', 
	help="size of layers in MLP model, '[]' is NFM-0")
parser.add_argument("--lamda", 
	type=float, 
	default=0.0, 
	help="regularizer for bilinear layers")
parser.add_argument("--batch_norm", 
	default=True, 
	help="use batch_norm or not")
parser.add_argument("--pre_train", 
	action='store_true', 
	default=False, 
	help="whether use the pre-train or not")
parser.add_argument("--out", 
	default=True, 
	help="save model or not")
parser.add_argument("--gpu", 
	type=str,
	default="0",  
	help="gpu card ID")
args = parser.parse_args()

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
cudnn.benchmark = True


NameError: name 'argparse' is not defined

In [38]:
class FMData(data.Dataset):
    """ Construct the FM pytorch dataset. """
    def __init__(self, file, feature_map):
        super(FMData, self).__init__()
        self.label = []
        self.features = []
        self.feature_values = []

        with open(file, 'r') as fd:
            line = fd.readline()
            print("line:",line)
            
            while line:
                items = line.strip().split()
                #print(items)
                # convert features
                raw = [item.split(':')[0] for item in items[1:]]#featuremap 是一个字典的形式，包含key和value，feature[]中存储key，
                self.features.append(
                np.array([feature_map[item] for item in raw]))#feature_value[]中存储value
                self.feature_values.append(np.array([item.split(':')[1] for item in items[1:]], dtype=np.float32))

                # convert labels
                if config.loss_type == 'square_loss':
                    self.label.append(np.float32(items[0]))
                    #print("label:",self.label)
                else: # log_loss
                    label = 1 if float(items[0]) > 0 else 0
                    self.label.append(label)

                line = fd.readline()

        assert all(len(item) == len(self.features[0]
            ) for item in self.features), 'features are of different length'

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        label = self.label[idx]
        features = self.features[idx]
        feature_values = self.feature_values[idx]
        return features, feature_values, label


In [41]:
features_map, num_features = map_features()
print(num_features)


train_dataset = FMData(config.train_libfm, features_map)
train_loader = data.DataLoader(train_dataset, drop_last=True,
            batch_size=args.batch_size, shuffle=True, num_workers=4)

number of features: 5382
5382
line: -1 451:1 4149:1 5041:1 5046:1 5053:1 5055:1 5058:1 5060:1 5069:1 5149:1



NameError: name 'args' is not defined