In [1]:
import torch.nn
import torch.nn.functional
from torch.utils.data import Dataset, DataLoader
import torch.optim
import numpy.random
import numpy as np
import math
import pandas as pd
import copy
import time
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_columns', 500)

In [2]:
class Net(torch.nn.Module):
    """Regressor over embedded categorical codes and batch-normalised numerics.

    Every input row must be laid out as
    ``[continuous features..., categorical codes...]``: the first
    ``len(cont_cols)`` columns are continuous values and the next
    ``len(embeds)`` columns are category codes (cast to ``long`` before the
    embedding lookup).

    Args:
        cat_cols: names of the categorical features (kept for reference).
        cont_cols: names of the continuous features; its length sets how many
            leading input columns are treated as continuous.
        embeds: number of distinct codes for each categorical feature, in the
            same order as the categorical columns of the input.
        embed_dim: width of every embedding vector (default 100).
    """

    def __init__(self, cat_cols, cont_cols, embeds, embed_dim=100):
        super(Net, self).__init__()
        self.cat_cols = cat_cols
        self.cont_cols = cont_cols
        self.embed = embeds
        self.embed_dim = embed_dim

        # One embedding table per categorical feature.
        self.embedLayer = torch.nn.ModuleList(
            [torch.nn.Embedding(n_codes, embed_dim) for n_codes in self.embed])

        # Normalise the continuous features.
        self.bn_layer = torch.nn.BatchNorm1d(len(self.cont_cols))

        # Linear layers: concatenated embeddings + continuous features -> 50 -> 1.
        self.fc1 = torch.nn.Linear(len(self.embed) * embed_dim + len(self.cont_cols), 50)
        self.fc2 = torch.nn.Linear(50, 1)

    def forward(self, x):
        """Return a ``(batch, 1)`` prediction for a ``(batch, n_features)`` input.

        Raises:
            ValueError: if ``x`` is not 2-D or its width differs from
                ``len(cont_cols) + len(embeds)``. A misaligned input would
                otherwise feed continuous values into embedding lookups.
        """
        n_cont = len(self.cont_cols)
        n_cat = len(self.embedLayer)
        if x.dim() != 2 or x.shape[1] != n_cont + n_cat:
            raise ValueError(
                "expected input of shape (batch, {}) = {} continuous + {} categorical "
                "columns, got {}".format(n_cont + n_cat, n_cont, n_cat, tuple(x.shape)))

        # Categorical columns start right after the continuous ones.
        cat_encoded = [layer(x[:, n_cont + i].long())
                       for i, layer in enumerate(self.embedLayer)]
        cat_encoded = torch.cat(cat_encoded, 1)
        cont_normalized = self.bn_layer(x[:, :n_cont])
        x = torch.cat([cat_encoded, cont_normalized], 1)

        # Linear layers
        x = torch.nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [3]:
class GDataset(Dataset):
    """Dataset view of a session DataFrame for ``Net``.

    Each item is ``{'x': features, 'y': target}`` where ``x`` holds the
    continuous columns followed by the categorical columns. ``Net.forward``
    splits its input positionally (continuous block first, then one column per
    embedding), so the feature order here must be exactly
    ``cont_cols + cat_cols``. The previous hard-coded list carried an extra
    numeric column (``sessionQualityDim``), which shifted ``pageviews`` into
    the first embedding lookup.

    Args:
        df: DataFrame containing the feature and target columns; categorical
            columns are expected to be numeric codes already.
        cont_cols: continuous column names (defaults to ``CONT_COLS``).
        cat_cols: categorical column names (defaults to ``CAT_COLS``).
        target_col: target column name (defaults to ``TARGET_COL``).

    Raises:
        KeyError: at construction if any requested column is missing in ``df``.
    """

    CONT_COLS = ('visitNumber', 'visitStartTime', 'pageviews', 'timeOnSite')
    CAT_COLS = ('newVisits', 'bounces', 'channelGrouping', 'browser', 'operatingSystem',
                'isMobile', 'deviceCategory', 'continent', 'subContinent', 'country',
                'region', 'campaign', 'source', 'medium', 'keyword', 'isTrueDirect')
    TARGET_COL = 'bought'

    def __init__(self, df, cont_cols=None, cat_cols=None, target_col=None):
        self.df = df
        self.cont_cols = list(self.CONT_COLS if cont_cols is None else cont_cols)
        self.cat_cols = list(self.CAT_COLS if cat_cols is None else cat_cols)
        self.target_col = self.TARGET_COL if target_col is None else target_col
        self.feature_cols = self.cont_cols + self.cat_cols

        # Convert once up front: per-item ``iloc`` + column selection on a
        # DataFrame is far slower than indexing a tensor.
        self.x = torch.from_numpy(df[self.feature_cols].values.astype(np.float32))
        self.y = torch.from_numpy(df[[self.target_col]].values.astype(np.float32))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # 'x' has shape (n_features,), 'y' has shape (1,) for an integer idx.
        return {'x': self.x[idx], 'y': self.y[idx]}

In [4]:
# Seed every RNG used below (train/test mask, weight init, DataLoader shuffling)
# so that Restart & Run All reproduces the same split and starting weights.
SEED = 0
numpy.random.seed(SEED)
torch.manual_seed(SEED)

# Load the data; 'customDim' is not used as a feature.
data = pd.read_csv("trainv2_10.csv").drop(columns='customDim')

# tr = {'mean': 0.22711817076655114, 'std': 2.0037093202285647} 

cont_cols = ['visitNumber', 'visitStartTime', 'pageviews', 'timeOnSite']
cat_cols = ['newVisits', 'bounces', 'channelGrouping', 'browser', 'operatingSystem', 'isMobile', 
            'deviceCategory', 'continent', 'subContinent', 'country', 'region', 'campaign', 'source', 'medium', 
            'keyword', 'isTrueDirect']

# Label-encode the categorical variables; the fitted encoders are kept so the
# codes can be mapped back to the original values later.
label_encoders = {}
for cat_col in cat_cols:
    label_encoders[cat_col] = LabelEncoder()
    data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])

# Random ~80/20 train/test split.
msk = numpy.random.rand(len(data)) < 0.8
training_data = data[msk]
testing_data = data[~msk]

batch_size = 1024

train_ds = GDataset(training_data)
test_ds = GDataset(testing_data)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=8)
# Validation order does not affect the metric, so do not shuffle.
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=8)
dataloaders = {'train': train_dl, 'val': test_dl}
dataset_sizes = {'train': len(training_data), 'val': len(testing_data)}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# One embedding table per categorical column, sized by its number of distinct codes.
embeddings = [data[col].nunique() for col in cat_cols]
num_epochs=5

outputId = "fullVisitorId"
output = "bought"
net = Net(cat_cols, cont_cols, embeddings)
model = net.to(device)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

print(net)

  interactivity=interactivity, compiler=compiler, result=result)


Net(
  (embedLayer): ModuleList(
    (0): Embedding(2, 100)
    (1): Embedding(2, 100)
    (2): Embedding(8, 100)
    (3): Embedding(44, 100)
    (4): Embedding(19, 100)
    (5): Embedding(2, 100)
    (6): Embedding(3, 100)
    (7): Embedding(6, 100)
    (8): Embedding(23, 100)
    (9): Embedding(207, 100)
    (10): Embedding(438, 100)
    (11): Embedding(30, 100)
    (12): Embedding(181, 100)
    (13): Embedding(7, 100)
    (14): Embedding(754, 100)
    (15): Embedding(2, 100)
  )
  (bn_layer): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=1604, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


In [5]:
# Train for `num_epochs`, evaluating on the validation set after every epoch
# and keeping the weights with the lowest validation RMSE.
since = time.time()  # start of training; used for the elapsed-time report below

best_model_wts = copy.deepcopy(model.state_dict())
best_loss = float('inf')
log_every = 100  # print the running batch loss every `log_every` steps

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            model.train()  # Set model to training mode
        else:
            model.eval()   # Set model to evaluate mode

        # Sum of squared errors over the phase, so that the epoch loss is the
        # RMSE over all samples rather than a sum of per-batch RMSEs.
        running_sq_err = 0.0

        # Iterate over data.
        for i, sample in enumerate(dataloaders[phase]):
            inputs = sample['x'].to(device)
            labels = sample['y'].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            # track history if only in train
            with torch.set_grad_enabled(phase == 'train'):
                outputs = model(inputs)
                mse = criterion(outputs, labels)
                loss = torch.sqrt(mse)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

            # statistics: MSELoss is already a per-sample mean, so weight it by
            # the batch size to recover the batch's summed squared error.
            running_sq_err += mse.item() * inputs.size(0)

            if i % log_every == 0:
                print("Loss at step {}: {}".format(i, loss.item()))

        epoch_loss = math.sqrt(running_sq_err / dataset_sizes[phase])

        print('{} Loss: {:.4f}'.format(
            phase, epoch_loss))

        # deep copy the model
        if phase == 'val' and epoch_loss < best_loss:
            best_loss = epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())


time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))
print('Best val Loss: {:.4f}'.format(best_loss))

# load best model weights
model.load_state_dict(best_model_wts)

Epoch 0/4
----------


RuntimeError: CuDNN error: CUDNN_STATUS_INTERNAL_ERROR

In [None]:
def one_batch(idx):
    """Run one forward pass and RMSE loss on sample ``idx`` of ``train_ds``.

    Intended as a small target for line profiling. The sample is turned into a
    batch of one, and the model is put in eval mode for the call (then
    restored) because BatchNorm1d cannot compute batch statistics from a
    single row in training mode.

    Args:
        idx: integer index into ``train_ds``.

    Returns:
        The scalar RMSE loss tensor for that sample.
    """
    sample = train_ds[idx]
    # Tensor.to() returns a new tensor rather than moving in place, so the
    # result has to be kept; unsqueeze adds the batch dimension the model
    # indexes on.
    inputs = sample['x'].unsqueeze(0).to(device)
    labels = sample['y'].unsqueeze(0).to(device)

    was_training = model.training
    model.eval()
    try:
        outputs = model(inputs)
        loss = torch.sqrt(criterion(outputs, labels))
    finally:
        model.train(was_training)
    return loss

In [None]:
# Load the line_profiler IPython extension, which provides the %lprun magic.
%load_ext line_profiler

In [None]:
# Profile one_batch line by line (-f selects the function) while running one_batch(0).
%lprun -f one_batch one_batch(0)

In [None]:
# #zero the gradient buffers
# net.zero_grad()

    
# # create the optimizer
# optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
# criterion = torch.nn.MSELoss()

# BATCH_SIZE = 100
# # training loop
# for epoch in range(5):
#     print("Beginning epoch ", epoch)
#     order = numpy.random.choice(training_data.shape[0], size=training_data.shape[0], replace=False)
#     for i in range(0, len(order) - BATCH_SIZE, BATCH_SIZE):
#         miniBatch = training_data.iloc[order[i:i+BATCH_SIZE]]
#         miniBatch = miniBatch.to(device)
#         optimizer.zero_grad()   # zero the gradient buffers
#         loss = torch.sqrt(criterion(net(miniBatch), torch.tensor(miniBatch[output].values, dtype=torch.float32)))
#         if(i % 10000 == 0):
#             print("Loss at step {}: {}".format(i, loss))
#         loss.backward()
#         optimizer.step()    # Does the update

# #testing loop
# output = net(testing_data)

In [None]:
# #save output
# testing_data.loc[:,"Prediction"] = output.detach().tolist()
# testing_data.loc[:,[outputId, "Prediction"]].to_csv("output.csv")