##### $\hspace{15pt}$ **Filename: tensorRepresentation.ipynb**
##### $\hspace{1.5pt}$ **Date Created: November 2, 2023**
##### **Date Modified: January 13, 2024**
##### $\rule{10.5in}{1pt}$
##### **Load different types of data and represent them as tensors.**

##### **The content of this notebook is based on the examples provided in chapter 4 of the book [Deep Learning with PyTorch](https://www.manning.com/books/deep-learning-with-pytorch). Some changes were made in the presentation of the content as well as in the images and datasets that were loaded for demonstration purposes. The various files that have to be accessed are available in this [Google Drive folder](https://drive.google.com/drive/folders/1XMOJMwqGXoP_njc2ktH6FwokouyvRPXx?usp=sharing). Before running this notebook in Colab, either change the path in cell `5`, or create the subfolder `/Colab Notebooks/005_tensorRepresentation` in your Google Drive and copy the files to the subfolder.**
##### $\rule{10.5in}{1pt}$

##### Load modules and packages.

In [1]:
from datetime import datetime
from google.colab import drive
from sklearn.preprocessing import StandardScaler
import imageio.v2 as imageio
import numpy as np
import os
import pandas as pd
import torch

##### Set pandas dataframes to display all the columns.

In [2]:
# None removes the column limit, so wide dataframes are shown in full.
pd.options.display.max_columns = None

##### Set the float format of pandas dataframes.

In [3]:
# Render floats in dataframes with four digits after the decimal point.
pd.set_option("display.float_format", "{:.4f}".format)

##### Mount Google Drive to Colab.

In [4]:
# Make the Google Drive contents available under /content/gdrive.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


##### Set the path to access the files needed by this notebook.

In [5]:
# Absolute path anchored at the mount point used above, so that file access
# does not depend on the notebook's current working directory.
# The trailing slash is required: file names are appended with `+` below.
path = "/content/gdrive/MyDrive/Colab Notebooks/005_tensorRepresentation/"

$\hspace{1in}$

##### **Single 2D Image**

##### Load an image from Google Drive as a numpy array. The image was taken from this [Kaggle dataset](https://www.kaggle.com/datasets/alessiocorrado99/animals10).

In [6]:
# Read the JPEG file into an array.
catImagePath = os.path.join(path, "cat.jpg")
imageArray = imageio.imread(catImagePath)
type(imageArray)

numpy.ndarray

In [7]:
imageArray.shape

(200, 300, 3)

##### Convert the array to a tensor, and change its layout.

In [8]:
imageTensor = torch.from_numpy(imageArray)
type(imageTensor)

torch.Tensor

In [9]:
# Move the last dimension (size 3) to the front: dimensions (0, 1, 2) -> (2, 0, 1).
imageTensor = torch.permute(imageTensor, (2, 0, 1))
imageTensor.shape

torch.Size([3, 200, 300])

In [10]:
imageTensor

tensor([[[135, 134, 132,  ..., 135, 135, 135],
         [137, 136, 133,  ..., 133, 133, 133],
         [140, 139, 136,  ..., 131, 131, 131],
         ...,
         [187, 184, 180,  ..., 111, 111, 112],
         [186, 186, 187,  ..., 112, 112, 113],
         [185, 189, 194,  ..., 112, 113, 113]],

        [[129, 128, 128,  ..., 143, 143, 143],
         [131, 130, 129,  ..., 141, 141, 141],
         [134, 133, 132,  ..., 139, 139, 139],
         ...,
         [145, 142, 139,  ..., 118, 118, 119],
         [144, 144, 146,  ..., 119, 119, 120],
         [143, 147, 153,  ..., 119, 120, 120]],

        [[141, 140, 142,  ..., 156, 156, 156],
         [143, 142, 143,  ..., 154, 154, 154],
         [146, 145, 146,  ..., 152, 152, 152],
         ...,
         [ 87,  84,  83,  ..., 128, 128, 129],
         [ 86,  86,  90,  ..., 129, 129, 130],
         [ 85,  89,  97,  ..., 129, 130, 130]]], dtype=torch.uint8)

$\hspace{1in}$

##### **Multiple 2D Images**

##### List the image files. The image files were taken from this [Kaggle dataset](https://www.kaggle.com/datasets/alessiocorrado99/animals10).

In [11]:
directory = path + "squirrelImages/"
# os.listdir returns entries in arbitrary order; sort them so that each image
# keeps the same position in the batch from run to run.
imageFiles = sorted(os.listdir(directory))
imageFiles

['squirrel1.jpg', 'squirrel2.jpg', 'squirrel3.jpg', 'squirrel4.jpg']

##### Get the smallest size along each dimension across the array representations of the images.

In [12]:
# Record the shape of every image's array representation.
shapes = []
for imageFile in imageFiles:

    imageArray = imageio.imread(os.path.join(directory, imageFile))
    shapes += [imageArray.shape]

# zip(*shapes) groups the sizes dimension by dimension, so min() of each
# group is the smallest size found along that dimension.
minimumDimension = tuple(map(min, zip(*shapes)))
minimumDimension

(230, 234, 3)

##### Convert each array to a tensor, change its layout, and place the 4 tensors in a single tensor.

In [13]:
# minimumDimension holds the smallest (height, width, channels) over all images
# (computed in the previous cell). Deriving the batch shape from it and from the
# number of files keeps this cell valid if the image set changes.
minHeight, minWidth, minChannels = minimumDimension
batchTensor = torch.zeros(len(imageFiles), minChannels, minHeight, minWidth, dtype = torch.uint8)

for i, imageFile in enumerate(imageFiles):

    imageArray = imageio.imread(os.path.join(directory, imageFile))
    # (height, width, channels) -> (channels, height, width)
    imageTensor = torch.from_numpy(imageArray).permute(2, 0, 1)
    # Crop from the top-left corner so every image fits the common shape.
    imageTensor = imageTensor[:, :minHeight, :minWidth]
    batchTensor[i] = imageTensor

In [14]:
batchTensor.shape

torch.Size([4, 3, 230, 234])

In [15]:
batchTensor

tensor([[[[ 41,  39,  38,  ...,  90,  86,  86],
          [ 41,  39,  38,  ...,  87,  82,  82],
          [ 41,  39,  38,  ...,  85,  82,  82],
          ...,
          [ 69,  81,  65,  ...,  90,  33,   1],
          [ 75,  56,  59,  ...,  78,  80, 118],
          [105,  72,  48,  ..., 148, 167, 210]],

         [[ 64,  62,  61,  ..., 153, 149, 149],
          [ 64,  62,  61,  ..., 150, 145, 145],
          [ 64,  62,  61,  ..., 146, 143, 143],
          ...,
          [ 75,  87,  72,  ..., 100,  40,   7],
          [ 81,  62,  66,  ...,  86,  85, 122],
          [111,  78,  55,  ..., 155, 170, 212]],

         [[ 22,  20,  19,  ...,  13,   9,   9],
          [ 22,  20,  19,  ...,  10,   5,   5],
          [ 20,  18,  17,  ...,   9,   6,   6],
          ...,
          [ 49,  59,  41,  ...,  29,   0,   0],
          [ 55,  34,  35,  ...,   9,  29,  72],
          [ 85,  50,  24,  ...,  77, 113, 163]]],


        [[[240, 239, 238,  ..., 244, 244, 244],
          [240, 239, 238,  ..., 242

##### Cast the tensor to floating point, and standardize.

In [16]:
# Work in float32 so that the standardized values can be stored.
batchTensor = batchTensor.to(torch.float32)
nChannels = batchTensor.size(1)

# Standardize one channel at a time, using the mean and standard deviation
# computed over all images and pixels of that channel.
for c in range(nChannels):

    channel = batchTensor[:, c]
    mean = channel.mean()
    std = channel.std()
    batchTensor[:, c] = (channel - mean)/std

batchTensor

tensor([[[[-1.1735e+00, -1.2007e+00, -1.2143e+00,  ..., -5.0565e-01,
           -5.6016e-01, -5.6016e-01],
          [-1.1735e+00, -1.2007e+00, -1.2143e+00,  ..., -5.4653e-01,
           -6.1468e-01, -6.1468e-01],
          [-1.1735e+00, -1.2007e+00, -1.2143e+00,  ..., -5.7379e-01,
           -6.1468e-01, -6.1468e-01],
          ...,
          [-7.9185e-01, -6.2830e-01, -8.4637e-01,  ..., -5.0565e-01,
           -1.2825e+00, -1.7186e+00],
          [-7.1008e-01, -9.6902e-01, -9.2814e-01,  ..., -6.6919e-01,
           -6.4193e-01, -1.2404e-01],
          [-3.0121e-01, -7.5096e-01, -1.0781e+00,  ...,  2.8482e-01,
            5.4377e-01,  1.1298e+00]],

         [[-6.5167e-01, -6.7948e-01, -6.9338e-01,  ...,  5.8590e-01,
            5.3028e-01,  5.3028e-01],
          [-6.5167e-01, -6.7948e-01, -6.9338e-01,  ...,  5.4419e-01,
            4.7466e-01,  4.7466e-01],
          [-6.5167e-01, -6.7948e-01, -6.9338e-01,  ...,  4.8857e-01,
            4.4685e-01,  4.4685e-01],
          ...,
     

$\hspace{1in}$

##### **3D Image**

##### Load DICOM files from Google Drive as an array. The files were downloaded from this [folder](https://github.com/deep-learning-with-pytorch/dlwpt-code/tree/master/data/p1ch4/volumetric-dicom/2-LUNG%203.0%20%20B70f-04083) in the GitHub repo of the book [Deep Learning with PyTorch](https://www.manning.com/books/deep-learning-with-pytorch).

---



In [17]:
# Read the DICOM files found in the folder into one array.
directory = os.path.join(path, "dicomFiles")
imageArray = imageio.volread(directory, format = "DICOM")

Reading DICOM (examining files): 1/99 files (1.0%)2/99 files (2.0%)3/99 files (3.0%)4/99 files (4.0%)5/99 files (5.1%)6/99 files (6.1%)7/99 files (7.1%)8/99 files (8.1%)9/99 files (9.1%)10/99 files (10.1%)11/99 files (11.1%)12/99 files (12.1%)13/99 files (13.1%)14/99 files (14.1%)15/99 files (15.2%)17/99 files (17.2%)18/99 files (18.2%)19/99 files (19.2%)20/99 files (20.2%)21/99 files (21.2%)22/99 files (22.2%)23/99 files (23.2%)24/99 files (24.2%)25/99 files (25.3%)26/99 files (26.3%)27/99 files (27.3%)28/99 files (28

In [18]:
type(imageArray)

imageio.core.util.Array

In [19]:
imageArray.shape

(99, 512, 512)

##### Convert the array to a tensor, cast it to floating point, and add an extra dimension.

In [20]:
# numpy array -> float tensor -> extra dimension of size 1 inserted at position 0
imageTensor = torch.from_numpy(imageArray).float().unsqueeze(0)
type(imageTensor)

torch.Tensor

In [21]:
imageTensor.shape

torch.Size([1, 99, 512, 512])

In [22]:
imageTensor

tensor([[[[ -985.,  -990.,  -999.,  ..., -1017., -1008.,  -971.],
          [-1016.,  -984.,  -963.,  ..., -1000., -1009.,  -999.],
          [-1024., -1008.,  -996.,  ...,  -979., -1021.,  -987.],
          ...,
          [ -920.,  -942.,  -944.,  ...,  -893.,  -917.,  -955.],
          [ -871.,  -879.,  -905.,  ...,  -895.,  -869.,  -867.],
          [ -876.,  -855.,  -873.,  ...,  -933.,  -982.,  -936.]],

         [[ -982.,  -989., -1020.,  ...,  -988.,  -977.,  -980.],
          [-1015.,  -969.,  -980.,  ...,  -981.,  -974., -1016.],
          [-1020.,  -978.,  -972.,  ...,  -988.,  -993., -1015.],
          ...,
          [ -880.,  -854.,  -924.,  ...,  -909.,  -829.,  -829.],
          [ -907.,  -868.,  -928.,  ...,  -909.,  -901.,  -894.],
          [ -885.,  -850.,  -899.,  ...,  -977.,  -964.,  -991.]],

         [[-1022., -1011.,  -983.,  ..., -1013.,  -980.,  -967.],
          [-1022., -1005., -1012.,  ...,  -948.,  -966., -1008.],
          [ -991.,  -965.,  -993.,  ...,  

$\hspace{1in}$

##### **Tabular Data**

##### Load a tabular dataset from Google Drive. The dataset is the training set of the [heart disease prediction dataset](https://www.kaggle.com/datasets/moazeldsokyx/heart-disease) from Kaggle.

In [23]:
tabularData = pd.read_csv(path + "train.csv")
tabularData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


##### Extract the input data from the tabular dataset.

In [24]:
inputData = tabularData.drop(columns = ["target"])
inputData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2


##### Extract the target data from the tabular dataset.

In [25]:
targetData = tabularData[["target"]]
targetData.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


##### **Case I: The nonbinary categorical input variables are ordinal, and ordering is a priority.**

In [26]:
ordinalColumns = ["cp", "restecg", "slope", "ca", "thal"]

scaler = StandardScaler()
inputData_v1 = inputData.copy()
# Treat the category codes as numbers and standardize them column by column.
inputData_v1[ordinalColumns] = scaler.fit_transform(inputData_v1[ordinalColumns])
inputData_v1.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,-0.9158,125,212,0,0.8913,168,0,1.0,0.9954,1.2092,1.0899
1,53,1,-0.9158,140,203,1,-1.004,155,1,3.1,-2.2437,-0.732,1.0899
2,70,1,-0.9158,145,174,0,0.8913,125,1,2.6,-2.2437,-0.732,1.0899
3,61,1,-0.9158,148,203,0,0.8913,161,0,0.0,0.9954,0.2386,1.0899
4,62,0,-0.9158,138,294,1,0.8913,106,0,1.9,-0.6241,2.1798,-0.5221


##### **Case II: The nonbinary categorical input variables are not ordinal, or, if they are ordinal, ordering is not a priority.**
##### Build a one-hot encoding of each of the nonbinary categorical input variables.

In [27]:
# One indicator column per category value. dtype = int keeps the indicators as
# 0/1 integers; recent pandas versions would otherwise produce True/False columns.
inputData_v2 = pd.get_dummies(inputData, columns = ["cp", "restecg", "slope", "ca", "thal"], dtype = int)
inputData_v2.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2,slope_0,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,52,1,125,212,0,168,0,1.0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1
1,53,1,140,203,1,155,1,3.1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,70,1,145,174,0,125,1,2.6,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1
3,61,1,148,203,0,161,0,0.0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1
4,62,0,138,294,1,106,0,1.9,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0


##### Standardize each of the continuous input variables.

In [28]:
continuousColumns = ["age", "trestbps", "chol", "thalach", "oldpeak"]

# fit_transform refits the scaler created in an earlier cell on these columns.
inputData_v1[continuousColumns] = scaler.fit_transform(inputData_v1[continuousColumns])
inputData_v1.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,-0.2684,1,-0.9158,-0.3776,-0.6593,0,0.8913,0.8213,0,-0.0609,0.9954,1.2092,1.0899
1,-0.1582,1,-0.9158,0.4791,-0.8339,1,-1.004,0.256,1,1.7271,-2.2437,-0.732,1.0899
2,1.7166,1,-0.9158,0.7647,-1.3962,0,0.8913,-1.0487,1,1.3014,-2.2437,-0.732,1.0899
3,0.7241,1,-0.9158,0.936,-0.8339,0,0.8913,0.5169,0,-0.9123,0.9954,0.2386,1.0899
4,0.8344,0,-0.9158,0.3649,0.9308,1,0.8913,-1.875,0,0.7054,-0.6241,2.1798,-0.5221


In [29]:
continuousColumns = ["age", "trestbps", "chol", "thalach", "oldpeak"]

# fit_transform refits the scaler created in an earlier cell on these columns.
inputData_v2[continuousColumns] = scaler.fit_transform(inputData_v2[continuousColumns])
inputData_v2.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2,slope_0,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,-0.2684,1,-0.3776,-0.6593,0,0.8213,0,-0.0609,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1
1,-0.1582,1,0.4791,-0.8339,1,0.256,1,1.7271,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,1.7166,1,0.7647,-1.3962,0,-1.0487,1,1.3014,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1
3,0.7241,1,0.936,-0.8339,0,0.5169,0,-0.9123,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1
4,0.8344,0,0.3649,0.9308,1,-1.875,0,0.7054,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0


##### Convert the input data to a numpy array.

In [30]:
inputArray_v1 = inputData_v1.to_numpy(dtype = np.float32)
type(inputArray_v1)

numpy.ndarray

In [31]:
inputArray_v1.shape

(1025, 13)

In [32]:
inputArray_v2 = inputData_v2.to_numpy(dtype = np.float32)
type(inputArray_v2)

numpy.ndarray

In [33]:
inputArray_v2.shape

(1025, 27)

##### Convert the target data to a numpy array.

In [34]:
targetArray = targetData.to_numpy(dtype = np.float32)
type(targetArray)

numpy.ndarray

In [35]:
targetArray.shape

(1025, 1)

##### Convert the input array to a tensor.

In [36]:
inputTensor_v1 = torch.from_numpy(inputArray_v1)
type(inputTensor_v1)

torch.Tensor

In [37]:
inputTensor_v2 = torch.from_numpy(inputArray_v2)
type(inputTensor_v2)

torch.Tensor

In [38]:
inputTensor_v1

tensor([[-0.2684,  1.0000, -0.9158,  ...,  0.9954,  1.2092,  1.0899],
        [-0.1582,  1.0000, -0.9158,  ..., -2.2437, -0.7320,  1.0899],
        [ 1.7166,  1.0000, -0.9158,  ..., -2.2437, -0.7320,  1.0899],
        ...,
        [-0.8198,  1.0000, -0.9158,  ..., -0.6241,  0.2386, -0.5221],
        [-0.4890,  0.0000, -0.9158,  ...,  0.9954, -0.7320, -0.5221],
        [-0.0479,  1.0000, -0.9158,  ..., -0.6241,  0.2386,  1.0899]])

In [39]:
inputTensor_v2

tensor([[-0.2684,  1.0000, -0.3776,  ...,  0.0000,  0.0000,  1.0000],
        [-0.1582,  1.0000,  0.4791,  ...,  0.0000,  0.0000,  1.0000],
        [ 1.7166,  1.0000,  0.7647,  ...,  0.0000,  0.0000,  1.0000],
        ...,
        [-0.8198,  1.0000, -1.2344,  ...,  0.0000,  1.0000,  0.0000],
        [-0.4890,  0.0000, -1.2344,  ...,  0.0000,  1.0000,  0.0000],
        [-0.0479,  1.0000, -0.6632,  ...,  0.0000,  0.0000,  1.0000]])

##### Convert the target array to a tensor.

In [40]:
targetTensor = torch.from_numpy(targetArray)
type(targetTensor)

torch.Tensor

In [41]:
targetTensor

tensor([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [0.]])

$\hspace{1in}$

##### **Time Series Data**

##### Load a time series dataset from Google Drive. The dataset is the [air quality dataset](https://www.kaggle.com/datasets/tawfikelmetwally/air-quality-dataset) from Kaggle.

In [42]:
timeSeriesData = pd.read_csv(path + "airQuality.csv")
timeSeriesData

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18:00:00,2.6000,1360,150,11.9000,1046,166,1056,113,1692,1268,13.6000,48.9000,0.7578
1,10/03/2004,19:00:00,2.0000,1292,112,9.4000,955,103,1174,92,1559,972,13.3000,47.7000,0.7255
2,10/03/2004,20:00:00,2.2000,1402,88,9.0000,939,131,1140,114,1555,1074,11.9000,54.0000,0.7502
3,10/03/2004,21:00:00,2.2000,1376,80,9.2000,948,172,1092,122,1584,1203,11.0000,60.0000,0.7867
4,10/03/2004,22:00:00,1.6000,1272,51,6.5000,836,131,1205,116,1490,1110,11.2000,59.6000,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04/04/2005,10:00:00,3.1000,1314,-200,13.5000,1101,472,539,190,1374,1729,21.9000,29.3000,0.7568
9353,04/04/2005,11:00:00,2.4000,1163,-200,11.4000,1027,353,604,179,1264,1269,24.3000,23.7000,0.7119
9354,04/04/2005,12:00:00,2.4000,1142,-200,12.4000,1063,293,603,175,1241,1092,26.9000,18.3000,0.6406
9355,04/04/2005,13:00:00,2.1000,1003,-200,9.5000,961,235,702,156,1041,770,28.3000,13.5000,0.5139


##### Check if every date from 10/03/2004 to 04/04/2005 is recorded in the dataset.

In [43]:
dateFormat = "%d/%m/%Y"

# Distinct dates that occur in the dataset.
timeSeriesDataDateList = np.unique(timeSeriesData["Date"].tolist())

# Every calendar day in the expected range, formatted like the Date column.
startDate = datetime.strptime("10/03/2004", dateFormat)
endDate = datetime.strptime("04/04/2005", dateFormat)
generatedDateList = list(pd.date_range(start = startDate, end = endDate, freq = "D").strftime(dateFormat))

set(generatedDateList) == set(timeSeriesDataDateList)

True

##### Count the recorded hours for each date in the dataset.

In [44]:
# Number of rows recorded per date, in a column named "Time Count".
hoursCount = (
    timeSeriesData
    .groupby("Date", as_index = False)["Time"]
    .count()
    .rename(columns = {"Time": "Time Count"})
)
# Merge onto the dates in their order of first appearance in the dataset,
# so the counts are listed in that order rather than in groupby's sorted order.
hoursCount = timeSeriesData[["Date"]].drop_duplicates().merge(hoursCount)
hoursCount

Unnamed: 0,Date,Time Count
0,10/03/2004,6
1,11/03/2004,24
2,12/03/2004,24
3,13/03/2004,24
4,14/03/2004,24
...,...,...
386,31/03/2005,24
387,01/04/2005,24
388,02/04/2005,24
389,03/04/2005,24


##### Get all records with dates that have complete 24 hours of data.

In [45]:
# Attach each date's row count, keep the dates with 24 rows, then discard
# the helper column and renumber the rows from 0.
timeSeriesData = (
    hoursCount
    .merge(timeSeriesData)
    .loc[lambda frame: frame["Time Count"] == 24]
    .drop(columns = ["Time Count"])
    .reset_index(drop = True)
)
timeSeriesData

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,11/03/2004,0:00:00,1.2000,1185,31,3.6000,690,62,1462,77,1333,733,11.3000,56.8000,0.7603
1,11/03/2004,1:00:00,1.0000,1136,31,3.3000,672,62,1453,76,1333,730,10.7000,60.0000,0.7702
2,11/03/2004,2:00:00,0.9000,1094,24,2.3000,609,45,1579,60,1276,620,10.7000,59.7000,0.7648
3,11/03/2004,3:00:00,0.6000,1010,19,1.7000,561,-200,1705,-200,1235,501,10.3000,60.2000,0.7517
4,11/03/2004,4:00:00,-200.0000,1011,14,1.3000,527,21,1818,34,1197,445,10.1000,60.5000,0.7465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9331,03/04/2005,19:00:00,2.7000,1248,-200,11.1000,1018,367,599,181,1289,1167,19.9000,33.0000,0.7608
9332,03/04/2005,20:00:00,2.5000,1180,-200,7.9000,894,355,636,187,1200,1372,17.5000,40.7000,0.8073
9333,03/04/2005,21:00:00,1.5000,1102,-200,6.0000,812,235,693,158,1178,1042,16.4000,46.6000,0.8642
9334,03/04/2005,22:00:00,1.6000,1116,-200,5.8000,803,233,696,153,1173,1055,15.5000,49.0000,0.8579


##### Get the day component of the date variable and the hour component of the time variable.

In [46]:
# Parse the text columns and keep only the day of the month and the hour.
dayOfMonth = timeSeriesData["Date"].map(lambda text: datetime.strptime(text, "%d/%m/%Y").day)
hourOfDay = timeSeriesData["Time"].map(lambda text: datetime.strptime(text, "%H:%M:%S").hour)

# Replace the original columns in place (same positions), then rename them.
timeSeriesData = (
    timeSeriesData
    .assign(Date = dayOfMonth, Time = hourOfDay)
    .rename(columns = {"Date": "Day", "Time": "Hour"})
)
timeSeriesData

Unnamed: 0,Day,Hour,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,11,0,1.2000,1185,31,3.6000,690,62,1462,77,1333,733,11.3000,56.8000,0.7603
1,11,1,1.0000,1136,31,3.3000,672,62,1453,76,1333,730,10.7000,60.0000,0.7702
2,11,2,0.9000,1094,24,2.3000,609,45,1579,60,1276,620,10.7000,59.7000,0.7648
3,11,3,0.6000,1010,19,1.7000,561,-200,1705,-200,1235,501,10.3000,60.2000,0.7517
4,11,4,-200.0000,1011,14,1.3000,527,21,1818,34,1197,445,10.1000,60.5000,0.7465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9331,3,19,2.7000,1248,-200,11.1000,1018,367,599,181,1289,1167,19.9000,33.0000,0.7608
9332,3,20,2.5000,1180,-200,7.9000,894,355,636,187,1200,1372,17.5000,40.7000,0.8073
9333,3,21,1.5000,1102,-200,6.0000,812,235,693,158,1178,1042,16.4000,46.6000,0.8642
9334,3,22,1.6000,1116,-200,5.8000,803,233,696,153,1173,1055,15.5000,49.0000,0.8579


##### Standardize each of the variables except the Day and Hour variables.

In [47]:
# Every column except Day and Hour is standardized.
columns = list(timeSeriesData.columns)
for excluded in ("Day", "Hour"):
    columns.remove(excluded)

# NOTE(review): the -200 values visible in several columns look like
# missing-value markers; they are included in the mean/std computed here.
# Confirm against the dataset documentation.
scaler = StandardScaler()
timeSeriesData[columns] = scaler.fit_transform(timeSeriesData[columns])
timeSeriesData

Unnamed: 0,Day,Hour,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,11,0,0.4564,0.4128,1.3605,0.0422,-0.5975,-0.4134,2.0716,0.1495,-0.1255,-0.5294,0.0355,0.3380,0.1952
1,11,1,0.4539,0.2644,1.3605,0.0350,-0.6500,-0.4134,2.0436,0.1416,-0.1255,-0.5359,0.0216,0.4005,0.1954
2,11,2,0.4526,0.1371,1.3104,0.0108,-0.8340,-0.4794,2.4349,0.0156,-0.2475,-0.7766,0.0216,0.3946,0.1953
3,11,3,0.4487,-0.1174,1.2747,-0.0036,-0.9741,-1.4307,2.8261,-2.0312,-0.3352,-1.0369,0.0124,0.4044,0.1950
4,11,4,-2.1329,-0.1144,1.2389,-0.0133,-1.0734,-0.5726,3.1769,-0.1890,-0.4164,-1.1594,0.0077,0.4102,0.1948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9331,3,19,0.4758,0.6037,-0.2920,0.2233,0.3602,0.7707,-0.6081,0.9682,-0.2197,0.4201,0.2343,-0.1262,0.1952
9332,3,20,0.4732,0.3977,-0.2920,0.1460,-0.0019,0.7241,-0.4932,1.0155,-0.4100,0.8686,0.1788,0.0240,0.1964
9333,3,21,0.4603,0.1613,-0.2920,0.1002,-0.2413,0.2582,-0.3162,0.7871,-0.4571,0.1467,0.1534,0.1391,0.1978
9334,3,22,0.4616,0.2038,-0.2920,0.0953,-0.2676,0.2505,-0.3069,0.7478,-0.4678,0.1751,0.1326,0.1859,0.1977


##### Convert the time series data to a numpy array.

In [48]:
timeSeriesArray = timeSeriesData.to_numpy(dtype = np.float32)
type(timeSeriesArray)

numpy.ndarray

In [49]:
timeSeriesArray.shape

(9336, 15)

##### Convert the time series array to a tensor.

In [50]:
timeSeriesTensor = torch.from_numpy(timeSeriesArray)
type(timeSeriesTensor)

torch.Tensor

In [51]:
timeSeriesTensor.shape

torch.Size([9336, 15])

##### Reshape the tensor so that it has one entry per day, with dimensions (days, variables, hours).

In [52]:
hoursPerDay = 24
# Read the number of variables from the tensor instead of hard-coding it.
nVariables = timeSeriesTensor.shape[1]

# Group consecutive rows into days: (days, hours, variables). This relies on the
# rows being ordered by day and hour, with exactly 24 rows for every day.
dailyAirQuality = timeSeriesTensor.view(-1, hoursPerDay, nVariables)
# Swap the last two dimensions: (days, variables, hours).
dailyAirQuality = dailyAirQuality.transpose(1, 2)
dailyAirQuality.shape

torch.Size([389, 15, 24])

In [53]:
dailyAirQuality

tensor([[[ 1.1000e+01,  1.1000e+01,  1.1000e+01,  ...,  1.1000e+01,
           1.1000e+01,  1.1000e+01],
         [ 0.0000e+00,  1.0000e+00,  2.0000e+00,  ...,  2.1000e+01,
           2.2000e+01,  2.3000e+01],
         [ 4.5645e-01,  4.5387e-01,  4.5259e-01,  ...,  4.9119e-01,
           4.6031e-01,  4.5387e-01],
         ...,
         [ 3.5486e-02,  2.1612e-02,  2.1612e-02,  ..., -1.5384e-02,
          -3.6194e-02, -3.6194e-02],
         [ 3.3805e-01,  4.0047e-01,  3.9462e-01,  ...,  4.7850e-01,
           4.6679e-01,  4.1608e-01],
         [ 1.9517e-01,  1.9542e-01,  1.9529e-01,  ...,  1.9470e-01,
           1.9338e-01,  1.9275e-01]],

        [[ 1.2000e+01,  1.2000e+01,  1.2000e+01,  ...,  1.2000e+01,
           1.2000e+01,  1.2000e+01],
         [ 0.0000e+00,  1.0000e+00,  2.0000e+00,  ...,  2.1000e+01,
           2.2000e+01,  2.3000e+01],
         [ 4.6288e-01,  4.6546e-01,  4.5902e-01,  ...,  4.9763e-01,
           4.8605e-01,  5.1050e-01],
         ...,
         [-3.3882e-02, -4

$\hspace{1in}$

##### **Text Data**

##### Load a file with text data from Google Drive. The file was taken from Kaggle, and consists of [consumer reviews of Amazon products](https://www.kaggle.com/datasets/datafiniti/consumer-reviews-of-amazon-products).

In [54]:
# dtype = "unicode" requests a string dtype for every column, so no column
# is parsed as a number.
consumerReviews = pd.read_csv(path + "consumerReviews.csv", dtype = "unicode")


##### Take the first 1,000 reviews, and use the first review as a text sample.

In [55]:
# Keep only the review text column, restricted to the first 1,000 rows.
textData = consumerReviews.loc[:, ["reviews.text"]].head(1000)
textData

Unnamed: 0,reviews.text
0,This product so far has not disappointed. My c...
1,great for beginner or experienced person. Boug...
2,Inexpensive tablet for him to use and learn on...
3,I've had my Fire HD 8 two weeks now and I love...
4,I bought this for my grand daughter when she c...
...,...
995,Got it for my Dad. Easy to use and he loves it...
996,One of the best purchases or investments you c...
997,I bought it for kindle books and was amazed by...
998,"I bought this primarily for reading, but, bein..."


In [56]:
textSample = textData.iloc[0, 0]
textSample

'This product so far has not disappointed. My children love to use it and I like the ability to monitor control what content they see with ease.'

##### **Option 1: One-hot encoding characters**

##### Create a tensor for the one-hot-encoded characters.

In [57]:
# Normalize the text first, so the tensor has exactly one row per encoded
# character. Sizing it from the raw string would leave all-zero rows when
# strip() removes whitespace, and could overflow when lower() changes the length.
normalizedText = textSample.lower().strip()
textSampleTensor = torch.zeros(len(normalizedText), 128)

for i, character in enumerate(normalizedText):

    # Characters outside the 128-column range are all mapped to column 0.
    codePoint = ord(character)
    characterIndex = codePoint if codePoint < 128 else 0
    textSampleTensor[i, characterIndex] = 1

textSampleTensor.shape

torch.Size([143, 128])

In [58]:
textSampleTensor

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

##### **Option 2: One-hot encoding words**

##### Define a function that takes a string as input and returns a list of the words in lowercase and in the order that they are written in the string.

In [59]:
def cleanWords(inputString):
    """Split a string into lowercase words with surrounding punctuation removed.

    Parameters
    ----------
    inputString : str
        Text to split. Any run of whitespace, including newlines, separates tokens.

    Returns
    -------
    list of str
        The lowercase words in the order they appear in the string. Tokens
        that consist only of punctuation (for example a stand-alone "-")
        are dropped instead of being returned as empty strings.
    """

    punctuation = '.,;:"!?”“_-'
    # split() with no argument already treats newlines as separators.
    strippedTokens = [token.strip(punctuation) for token in inputString.lower().split()]
    wordList = [token for token in strippedTokens if token]

    return wordList

##### Apply the function to the text sample.

In [60]:
wordsInTextSample = cleanWords(textSample)
wordsInTextSample

['this',
 'product',
 'so',
 'far',
 'has',
 'not',
 'disappointed',
 'my',
 'children',
 'love',
 'to',
 'use',
 'it',
 'and',
 'i',
 'like',
 'the',
 'ability',
 'to',
 'monitor',
 'control',
 'what',
 'content',
 'they',
 'see',
 'with',
 'ease']

##### Build a dictionary with words as keys and integers as values.

In [61]:
# Concatenate all reviews, split them into words, and number the distinct
# words according to their sorted order.
joinedTextData = " ".join(textData["reviews.text"])
vocabulary = set(cleanWords(joinedTextData))
sortedWordList = sorted(vocabulary)
wordToIndexDictionary = dict(zip(sortedWordList, range(len(sortedWordList))))

##### Create a tensor for the one-hot-encoded words.

In [62]:
# One row per word of the sample, one column per word of the vocabulary.
nRows, nColumns = len(wordsInTextSample), len(wordToIndexDictionary)
textSampleTensor = torch.zeros(nRows, nColumns)

# Row i gets a 1 in the column assigned to the i-th word of the sample.
for i, word in enumerate(wordsInTextSample):

    textSampleTensor[i, wordToIndexDictionary[word]] = 1

textSampleTensor.shape

torch.Size([27, 2826])

In [63]:
textSampleTensor

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])