# Embeddings from Continuous Data

In [1]:
! nvidia-smi

Wed Feb  1 19:51:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:06.0 Off |                    0 |
| N/A   26C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Import Libs

In [59]:
! pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [57]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [60]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import transforms
from torchsummary import summary

### Options

In [3]:
torch.manual_seed(42)

<torch._C.Generator at 0x7fb666f62378>

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### Help Functions

In [6]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

### Data

In [7]:
# ! kaggle competitions download -c new-york-city-taxi-fare-prediction

In [8]:
# ! unzip ../temp_data/new-york-city-taxi-fare-prediction.zip

In [9]:
! ls -la ../data_ny/

total 7200344
drwxr-xr-x 3 root root       4096 Jan 29 15:17 .
drwxr-xr-x 7 root root       4096 Jan 29 15:17 ..
-rw-r--r-- 1 root root        486 Dec 12  2019 GCP-Coupons-Instructions.rtf
drwxr-xr-x 2 root root       4096 Jan 29 15:14 .ipynb_checkpoints
-rw-r--r-- 1 root root 1674619075 Jan 29 15:11 new-york-city-taxi-fare-prediction.zip
-rw-r--r-- 1 root root     343271 Dec 12  2019 sample_submission.csv
-rw-r--r-- 1 root root     983020 Dec 12  2019 test.csv
-rw-r--r-- 1 root root 5697178298 Dec 12  2019 train.csv


In [10]:
df_train = pd.read_csv('../data_ny/train.csv', nrows=100_000)

In [11]:
target = 'fare_amount'

In [12]:
df_train.shape

(100000, 8)

In [13]:
df_train['dist_km'] = haversine_distance(df_train, 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [14]:
df_train['EDTdate'] = pd.to_datetime(df_train['pickup_datetime'].str[:19]) - pd.Timedelta(hours=4)

In [15]:
df_train['Hour'] = df_train['EDTdate'].dt.hour

In [16]:
df_train['AMorPM'] = np.where(df_train['Hour']<12,'am','pm')

In [17]:
df_train['Weekday'] = df_train['EDTdate'].dt.strftime("%a")

In [18]:
df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,1.030764,2009-06-15 13:26:21,13,pm,Mon
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,8.450134,2010-01-05 12:52:16,12,pm,Tue
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,1.389525,2011-08-17 20:35:00,20,pm,Wed
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,2.79927,2012-04-21 00:30:42,0,am,Sat
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,1.999157,2010-03-09 03:51:00,3,am,Tue


In [19]:
df_train.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km', 'EDTdate', 'Hour', 'AMorPM', 'Weekday'],
      dtype='object')

In [20]:
cat_cols = ['Hour', 'AMorPM', 'Weekday']
cont_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'passenger_count', 'dist_km']

### Label Encoding

In [21]:
for col in df_train.columns:
    if col in cat_cols:
        df_train[col] = LabelEncoder().fit_transform(df_train[col])
        df_train[col] = df_train[col].astype('category')

### Normalization

In [22]:
def normalize(data, cont_columns):
    means, stds = {}, {}
    for n in cont_columns:
        assert is_numeric_dtype(data[n]), (f"""Cannot normalize '{n}' column as it isn't numerical. Are you sure it doesn't belong in the categorical set of columns?""")
        means[n], stds[n] = data[n].mean(), data[n].std()
        data[n] = (data[n] - means[n]) / (1e-7 + stds[n])
    return data

In [23]:
def normalize_v2(input_data, cont_columns):
    data = input_data.copy()
    for col in cont_columns:
        arr = data[col].to_numpy().reshape(-1, 1)
        scaled_arr = MinMaxScaler(feature_range=(0, 1)).fit_transform(arr)  
        data[col] = scaled_arr
    return data

In [24]:
columns = ['Hour', 'AMorPM', 'Weekday', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'passenger_count', 'dist_km']

In [25]:
X = df_train[cont_cols]

In [26]:
y = df_train[target]

In [27]:
X.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,passenger_count,dist_km
0,40.721319,-73.844311,40.712278,-73.84161,1,1.030764
1,40.711303,-74.016048,40.782004,-73.979268,1,8.450134
2,40.76127,-73.982738,40.750562,-73.991242,2,1.389525
3,40.733143,-73.98713,40.758092,-73.991567,1,2.79927
4,40.768008,-73.968095,40.783762,-73.956655,1,1.999157


In [28]:
X = normalize_v2(X, cont_cols)

In [29]:
X.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,passenger_count,dist_km
0,0.241488,0.852533,0.239685,0.086153,0.166667,0.000119
1,0.241467,0.852312,0.23983,0.085056,0.166667,0.000974
2,0.241573,0.852355,0.239765,0.084961,0.333333,0.00016
3,0.241513,0.852349,0.23978,0.084958,0.166667,0.000323
4,0.241587,0.852373,0.239834,0.085236,0.166667,0.00023


In [113]:
y.head()

0     4.5
1    16.9
2     5.7
3     7.7
4     5.3
Name: fare_amount, dtype: float64

In [30]:
X.isnull().sum()

pickup_latitude      0
pickup_longitude     0
dropoff_latitude     0
dropoff_longitude    0
passenger_count      0
dist_km              0
dtype: int64

In [31]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True) 

In [32]:
X_train.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,passenger_count,dist_km
75220,0.241379,0.85235,0.239607,0.08482,0.166667,0.000233
48955,0.241607,0.852387,0.23945,0.085135,0.166667,0.002282
44966,0.241625,0.85236,0.239808,0.08503,0.166667,0.000192
13568,0.155776,0.947529,0.154624,0.674507,0.166667,0.0
92727,0.24157,0.85235,0.239827,0.085108,0.166667,0.000294


### Model

In [33]:
ts_X_train = torch.tensor(X_train.values.astype(np.float32), dtype=torch.float32) 
ts_y_train = torch.tensor(y_train.values.astype(np.float32), dtype=torch.float32)

In [34]:
ts_X_val = torch.tensor(X_val.values.astype(np.float32), dtype=torch.float32) 
ts_y_val = torch.tensor(y_val.values.astype(np.float32), dtype=torch.float32)

In [35]:
ds_train = TensorDataset(ts_X_train, ts_y_train)

In [36]:
ds_val = TensorDataset(ts_X_val, ts_y_val)

In [37]:
input_size = len(ts_X_train[0])
input_size

6

In [38]:
len(ts_X_train)

80000

In [39]:
len(ts_X_val)

20000

In [40]:
batch_size = 32

In [41]:
mean, std = (0.5,), (0.5,)
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize(mean, std)
                               ])

In [42]:
dl_train = DataLoader(dataset=ds_train, batch_size=batch_size, shuffle=True, num_workers=1)

In [43]:
dl_val = DataLoader(dataset=ds_val, batch_size=batch_size, shuffle=False, num_workers=1)

### PyTorch

In [82]:
class MLP(nn.Module):
    '''
    Multilayer Perceptron.
    '''
    def __init__(self, input_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )


    def forward(self, x):
        '''Forward pass'''
        # x = F.normalize(x, dim = 0)
        return self.layers(x)

In [83]:
model = MLP(input_size=input_size).to(device)

In [84]:
input_size

6

In [85]:
print(model)

MLP(
  (layers): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [86]:
summary(model, (1, input_size))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 1, 64]             448
              ReLU-2                [-1, 1, 64]               0
            Linear-3                [-1, 1, 32]           2,080
              ReLU-4                [-1, 1, 32]               0
            Linear-5                 [-1, 1, 1]              33
Total params: 2,561
Trainable params: 2,561
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.01
Estimated Total Size (MB): 0.01
----------------------------------------------------------------


In [87]:
loss_function = nn.L1Loss()

In [88]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [94]:
def train_loop(model, data_loader, loss_function, optimizer, n_epochs, device):

    history_loss = []
    
    # Run the training loop
    for epoch in range(n_epochs): # 5 epochs at maximum
        print(f'Starting epoch {epoch + 1}')
    
        # Set current loss value
        current_loss = 0.0
    
        # Iterate over the DataLoader for training data
        for i, data in enumerate(data_loader, 0):
            # Get inputs
            inputs, targets = data
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Perform forward pass
            outputs = model(inputs)

            # Compute loss
            loss = loss_function(outputs, targets.unsqueeze(-1))

            # Perform backward pass
            loss.backward()

            # Perform optimization
            optimizer.step()

            # Print statistics
            current_loss += loss.item()
            
            if i % 500 == 0:
                print('Loss after mini-batch %5d: %.3f' % (i + 1, current_loss / 500))
                current_loss = 0.0
        
    # Process is complete.
    print('Training process has finished.')
    
    return model, history_loss

In [95]:
model_ft, history_loss = train_loop(model=model, data_loader=dl_train, loss_function=loss_function, 
                                    optimizer=optimizer, n_epochs=5, device=device)

Starting epoch 1
Loss after mini-batch     1: 0.008
Loss after mini-batch   501: 5.328
Loss after mini-batch  1001: 5.558
Loss after mini-batch  1501: 5.262
Loss after mini-batch  2001: 5.428
Starting epoch 2
Loss after mini-batch     1: 0.013
Loss after mini-batch   501: 5.394
Loss after mini-batch  1001: 5.431
Loss after mini-batch  1501: 5.398
Loss after mini-batch  2001: 5.431
Starting epoch 3
Loss after mini-batch     1: 0.012
Loss after mini-batch   501: 5.489
Loss after mini-batch  1001: 5.364
Loss after mini-batch  1501: 5.429
Loss after mini-batch  2001: 5.408
Starting epoch 4
Loss after mini-batch     1: 0.014
Loss after mini-batch   501: 5.432
Loss after mini-batch  1001: 5.349
Loss after mini-batch  1501: 5.416
Loss after mini-batch  2001: 5.365
Starting epoch 5
Loss after mini-batch     1: 0.010
Loss after mini-batch   501: 5.398
Loss after mini-batch  1001: 5.411
Loss after mini-batch  1501: 5.379
Loss after mini-batch  2001: 5.365
Training process has finished.


In [108]:
list(model_ft.children())

[Sequential(
   (0): Linear(in_features=6, out_features=64, bias=True)
   (1): ReLU()
   (2): Linear(in_features=64, out_features=32, bias=True)
   (3): ReLU()
   (4): Linear(in_features=32, out_features=1, bias=True)
 )]

In [126]:
feature_extractor = torch.nn.Sequential(*list(model_ft.children())[:-1])

In [127]:
x = torch.tensor([0.241570, 0.852350, 0.239827, 0.085108, 0.166667, 0.000294]).to(device)
x

tensor([2.4157e-01, 8.5235e-01, 2.3983e-01, 8.5108e-02, 1.6667e-01, 2.9400e-04],
       device='cuda:0')

In [128]:
output = feature_extractor(x)
output

tensor([2.4157e-01, 8.5235e-01, 2.3983e-01, 8.5108e-02, 1.6667e-01, 2.9400e-04],
       device='cuda:0')

In [129]:
model(x)

tensor([8.4843], device='cuda:0', grad_fn=<AddBackward0>)