In [1]:
import numpy as np
import pandas as pd

In [2]:
"""
Import the dataset
"""
df = pd.read_csv('/content/diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
"""
Split the dataset with train and test
"""
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

"""
Import the train_test_split function from scikit-learn
"""
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,\
                                                    test_size = 0.20,\
                                                    random_state = 42,\
                                                    shuffle = True)

print('X_train shape is # {}'.format(X_train.shape),'\n')
print('X_test  shape is # {}'.format(X_test.shape),'\n')
print('y_train shape is # {}'.format(y_train.shape),'\n')
print('y_test  shape is # {}'.format(y_test.shape))

X_train shape is # (614, 8) 

X_test  shape is # (154, 8) 

y_train shape is # (614,) 

y_test  shape is # (154,)


In [4]:
"""
Import the DataLoader liabray from the torch
"""
import torch
from torch.utils.data import Dataset, DataLoader

"""
Create an object of DataLoader and pass some parameters.
**********
parameters:
1. dataset
2. batch_size
3. shuffle

**********
Needless to say that, dataset(parameter) will take the `list`
"""
try:
  data_loader = DataLoader(dataset = list(zip(X_train, y_train)), batch_size = 16, shuffle = True)
except Exception as e:
  print(e.with_traceback)
else:
  print('DataLoader works !!!')

DataLoader works !!!


In [5]:
"""
Play with data_loader object
"""
print("The batch size of the dataset is {}".format(data_loader.batch_size))
print("*"*100)

"""
load_data is an iterable, so we can iterate this using loop. It will return data and labels
"""
for X_batch, y_batch in data_loader:
  print(X_batch,'\n\n','*'*100, '\n\n', y_batch)
  print('*'*100)
  print('*'*100)
  print(X_batch.size(), y_batch.size())
  break

The batch size of the dataset is 16
****************************************************************************************************
tensor([[2.0000e+00, 1.2200e+02, 6.0000e+01, 1.8000e+01, 1.0600e+02, 2.9800e+01,
         7.1700e-01, 2.2000e+01],
        [2.0000e+00, 1.1400e+02, 6.8000e+01, 2.2000e+01, 0.0000e+00, 2.8700e+01,
         9.2000e-02, 2.5000e+01],
        [9.0000e+00, 1.5600e+02, 8.6000e+01, 0.0000e+00, 0.0000e+00, 2.4800e+01,
         2.3000e-01, 5.3000e+01],
        [0.0000e+00, 1.3800e+02, 6.0000e+01, 3.5000e+01, 1.6700e+02, 3.4600e+01,
         5.3400e-01, 2.1000e+01],
        [1.0000e+00, 1.7300e+02, 7.4000e+01, 0.0000e+00, 0.0000e+00, 3.6800e+01,
         8.8000e-02, 3.8000e+01],
        [1.0000e+00, 8.7000e+01, 7.8000e+01, 2.7000e+01, 3.2000e+01, 3.4600e+01,
         1.0100e-01, 2.2000e+01],
        [4.0000e+00, 1.2800e+02, 7.0000e+01, 0.0000e+00, 0.0000e+00, 3.4300e+01,
         3.0300e-01, 2.4000e+01],
        [1.0000e+00, 1.4400e+02, 8.2000e+01, 4.6000e+01, 1

In [10]:
"""
Display the data and label shape
"""
data, labels = next(iter(data_loader))
print(data.shape)
print(labels.shape)

torch.Size([16, 8])
torch.Size([16])


In [15]:
"""
Break this next(itet()) so that it can be understandable
"""
iter(data_loader)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x7b10996afa90>

In [18]:
next(iter(data_loader))

"""
It returns us data and label - as a result, we will put those two separate variables(data and labels)
"""

[tensor([[7.0000e+00, 1.8100e+02, 8.4000e+01, 2.1000e+01, 1.9200e+02, 3.5900e+01,
          5.8600e-01, 5.1000e+01],
         [5.0000e+00, 1.2100e+02, 7.2000e+01, 2.3000e+01, 1.1200e+02, 2.6200e+01,
          2.4500e-01, 3.0000e+01],
         [1.0000e+00, 7.9000e+01, 8.0000e+01, 2.5000e+01, 3.7000e+01, 2.5400e+01,
          5.8300e-01, 2.2000e+01],
         [1.5000e+01, 1.3600e+02, 7.0000e+01, 3.2000e+01, 1.1000e+02, 3.7100e+01,
          1.5300e-01, 4.3000e+01],
         [8.0000e+00, 1.8800e+02, 7.8000e+01, 0.0000e+00, 0.0000e+00, 4.7900e+01,
          1.3700e-01, 4.3000e+01],
         [0.0000e+00, 1.0500e+02, 8.4000e+01, 0.0000e+00, 0.0000e+00, 2.7900e+01,
          7.4100e-01, 6.2000e+01],
         [1.0000e+00, 1.0200e+02, 7.4000e+01, 0.0000e+00, 0.0000e+00, 3.9500e+01,
          2.9300e-01, 4.2000e+01],
         [3.0000e+00, 1.2800e+02, 7.2000e+01, 2.5000e+01, 1.9000e+02, 3.2400e+01,
          5.4900e-01, 2.7000e+01],
         [1.0000e+01, 1.6100e+02, 6.8000e+01, 2.3000e+01, 1.3200

####Remarks:

 `DataLoader` is normally used for creating the `batch size` of the data.

 When we will pass the data for the `training` during that time we will have to pass this batch size of data then we will upadte the weights and bias

#####Practise Session

In [19]:
train_dataset = pd.read_csv('/content/sample_data/california_housing_train.csv')

train_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [20]:
test_dataset = pd.read_csv('/content/sample_data/california_housing_train.csv')

test_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [23]:
print('Train dataset shape is # {}'.format(train_dataset.shape))
print('Test dataset shape is  # {}'.format(test_dataset.shape))

Train dataset shape is # (17000, 9)
Test dataset shape is  # (17000, 9)


In [26]:
"""
Append or concat the dataset(training and testing)
"""
df = pd.concat([train_dataset, test_dataset], axis = 0)

df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [27]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values


X_train, X_test, y_train, y_test = train_test_split(X, y,\
                                                    test_size = 0.30,\
                                                    random_state = 42,\
                                                    shuffle = True)

print('X_train shape is # {}'.format(X_train.shape),'\n')
print('X_test  shape is # {}'.format(X_test.shape),'\n')
print('y_train shape is # {}'.format(y_train.shape),'\n')
print('y_test  shape is # {}'.format(y_test.shape))

X_train shape is # (23800, 8) 

X_test  shape is # (10200, 8) 

y_train shape is # (23800,) 

y_test  shape is # (10200,)


In [36]:
import torch
from torch.utils.data import Dataset, DataLoader

loader = DataLoader(dataset = list(zip(X_train, y_train)),\
                    batch_size = 32,\
                    shuffle = True)

data, labels = next(iter(loader))
print("The shape of one singular X_train batch is # {}".format(data.size()))
print("The shape of one singular labels batch is # {}".format(labels.size()))

The shape of one singular X_train batch is # torch.Size([32, 8])
The shape of one singular labels batch is # torch.Size([32])


In [47]:
"""
We can do without train and test
"""

loader = DataLoader(dataset = list(df.values), batch_size = 32, shuffle = True)
data, labels = next(iter(loader))
print("The shape of one singular X_train batch is # {}".format(data.size()))
print("The shape of one singular labels batch is # {}".format(labels.size()))

ValueError: ignored

In [48]:
next(iter(loader))

tensor([[-1.2219e+02,  3.7820e+01,  3.2000e+01,  1.8350e+03,  2.6400e+02,
          6.3500e+02,  2.6300e+02,  8.3170e+00,  3.6590e+05],
        [-1.1728e+02,  3.3850e+01,  1.6000e+01,  3.4980e+03,  7.0200e+02,
          2.3720e+03,  6.7200e+02,  2.3229e+00,  1.1800e+05],
        [-1.1779e+02,  3.3770e+01,  2.1000e+01,  4.3490e+03,  5.5300e+02,
          1.6800e+03,  5.1900e+02,  6.9014e+00,  4.3900e+05],
        [-1.2095e+02,  3.6470e+01,  5.2000e+01,  1.6910e+03,  3.0100e+02,
          6.1800e+02,  2.3900e+02,  3.2292e+00,  2.2500e+05],
        [-1.2185e+02,  3.6590e+01,  4.2000e+01,  8.9100e+02,  2.0300e+02,
          5.2500e+02,  2.1200e+02,  3.3156e+00,  1.8630e+05],
        [-1.2207e+02,  3.7390e+01,  3.7000e+01,  1.1690e+03,  2.3900e+02,
          5.8900e+02,  2.4900e+02,  5.0131e+00,  3.3030e+05],
        [-1.2020e+02,  3.7800e+01,  3.0000e+01,  1.1890e+03,  2.5500e+02,
          4.4600e+02,  1.6500e+02,  3.4838e+00,  1.1250e+05],
        [-1.1793e+02,  3.3950e+01,  3.1000e+01, 

In [None]:
"""
What did we learn ?

# We can pass the whole dataset into DataLoader(), but when we will try to separte it with `data` & `labels` then it will give me error.
Because, It does not have the power to understand which is Independent & Dependent Features.
As a result, in `dataset` parametes we will must pass the `list` with `Inpdependent & Dependent` Features with `Zip` format
"""