# Exploring DiffSharp: Neural Networks

In [None]:
// reference the DiffSharp-lite NuGet package, pinned to version 1.0.1
#r "nuget: DiffSharp-lite, 1.0.1"

// notebook display setup: ask for plain-text output for every object ...
Formatter.SetPreferredMimeTypesFor(typeof<obj>, "text/plain")
// ... and write values with the "%A" structural formatter using a width of 120
Formatter.Register(fun x writer -> fprintfn writer "%120A" x )

open DiffSharp
open DiffSharp.Util // required for print and other stuff
open DiffSharp.Model
open System.Diagnostics   // not required
open System               // not required

// configure DiffSharp for Float32 tensors on the CPU
// (presumably these become the defaults for every tensor created below)
dsharp.config(dtype=Dtype.Float32, device=Device.CPU)

Microsoft.DotNet.Interactive.InstallPackagesMessage


## Defining Neural Network Layers
The cell below defines a linear layer whose bias can be turned off by setting **bias** to *false*.

In [None]:
/// A linear (fully connected) layer whose bias term can be switched off.
///
/// inFeatures, outFeatures: forwarded to Weight.kaiming to create the weight.
/// bias: true by default. When true, a bias of length outFeatures is added to
///       the output and registered as a trainable parameter. When false, the
///       layer computes only matmul(value, weight) and registers no bias.
type LinearB(inFeatures, outFeatures, ?bias: bool) =
    inherit Model()
    let bias = defaultArg bias true

    let w = Parameter(Weight.kaiming(inFeatures, outFeatures))  // defining a parameter and its initialization function
    let k = 1./sqrt (float outFeatures)                         // scale handed to Weight.uniform for the bias
    // The bias tensor is drawn even when it is disabled so that a seeded random
    // stream is consumed in exactly the same way for bias=true and bias=false.
    let b = Parameter(Weight.uniform([|outFeatures|], k))
    // Register only the parameters that take part in forward: a disabled bias
    // must not show up in the parameter list that optimizers iterate over.
    do
        if bias then base.addParameter([w;b],["Linear-weight";"Linear-bias"])
        else base.addParameter([w],["Linear-weight"])

    override _.ToString() = sprintf "Linear(%A, %A)" inFeatures outFeatures

    /// Applies the layer: matmul(value, weight), plus the bias when it is enabled.
    override _.forward(value) =
        let product = dsharp.matmul(value, w.value)
        if bias then product + b.value else product

## Defining an initialization function
Next we define a linear layer that allows both turning the bias off and supplying a custom initialization function,
and then we create a new initialization function.

Most predefined Layers do not have the option to specify an initialization function, but with a few changes they can.

In [None]:
/// A linear (fully connected) layer with an optional bias and a pluggable
/// weight initialization function.
///
/// inFeatures, outFeatures: forwarded to initFun to create the weight.
/// bias: true by default. When true, a bias of length outFeatures is added to
///       the output and registered as a trainable parameter. When false, the
///       layer computes only matmul(value, weight) and registers no bias.
/// initFun: called as initFun(inFeatures, outFeatures) to produce the initial
///          weight; defaults to Weight.kaiming.
type LinearBI(inFeatures, outFeatures, ?bias: bool, ?initFun) =
    inherit Model()
    let bias = defaultArg bias true
    let initFun = defaultArg initFun Weight.kaiming

    let w = Parameter(initFun(inFeatures, outFeatures))  // defining a parameter and its initialization function
    let k = 1./sqrt (float outFeatures)                  // scale handed to Weight.uniform for the bias
    // The bias tensor is drawn even when it is disabled so that a seeded random
    // stream is consumed in exactly the same way for bias=true and bias=false.
    let b = Parameter(Weight.uniform([|outFeatures|], k))
    // Register only the parameters that take part in forward: a disabled bias
    // must not show up in the parameter list that optimizers iterate over.
    do
        if bias then base.addParameter([w;b],["Linear-weight";"Linear-bias"])
        else base.addParameter([w],["Linear-weight"])

    override _.ToString() = sprintf "Linear(%A, %A)" inFeatures outFeatures

    /// Applies the layer: matmul(value, weight), plus the bias when it is enabled.
    override _.forward(value) =
        let product = dsharp.matmul(value, w.value)
        if bias then product + b.value else product


/// Example-only initializer: it is not a sensible initialization scheme and
/// exists purely to show how a custom function can be plugged into a layer.
type InitLayer =
    /// Draws a [fanIn; fanOut] tensor with dsharp.randint(-3, 23, ...) and
    /// multiplies it by `a` (3.141528 when `a` is not supplied).
    static member initfunx(fanIn, fanOut, ?a: float) =
        let scale = defaultArg a (3.141528)
        let draws = dsharp.randint(-3, 23, [fanIn; fanOut])
        draws * scale

In [None]:
// Seed the random number generator before building each model so that every
// model below is constructed from the same random stream.
dsharp.seed(23)

// first model: bias disabled, weights created by the custom init function
let model_custom = 
    LinearBI(1, 10, false, InitLayer.initfunx)
    --> dsharp.relu
    --> Linear(10, 1)

// second model: same architecture and seed, but LinearBI falls back to its
// default init function (Weight.kaiming)
dsharp.seed(23)
let model_default = 
    LinearBI(1, 10, false)
    --> dsharp.relu
    --> Linear(10, 1)



// three scalar inputs; view([-1; 1]) turns them into a column before they are fed to a model
let test_data = dsharp.tensor([1., 2., 3.])
print $"Custom init function:\n{test_data.view([-1; 1]) --> model_custom}\n------------------------------------------------------"
print $"default init function:\n{test_data.view([-1; 1]) --> model_default}\n------------------------------------------------------"


// if the initialization function takes more arguments or different arguments 
// e.g. a shape like [|inFeatures; outFeatures|], wrapping it in a function with the
// (fanIn, fanOut) signature that LinearBI calls works:
let initfunx2(fanIn, fanOut) = 
    InitLayer.initfunx(fanIn, fanOut, 3.141528)   
    // 3.141528 is the default value of the init function, therefore the result below will be identical to the first one
    // change the value to obtain different results

// third model: same seed again, this time using the wrapper as the init function
dsharp.seed(23)
let model3 = 
    LinearBI(1, 10, false, initfunx2)
    --> dsharp.relu
    --> Linear(10, 1)

print $"Custom init function wrapped by another function:\n{test_data.view([-1; 1]) --> model3}"

"Custom init function:
tensor([[-14.2258],
        [-29.3985],
        [-44.5711]])
------------------------------------------------------"
"default init function:
tensor([[-0.6649],
        [-0.6827],
        [-0.7005]])
------------------------------------------------------"
"Custom init function wrapped by another function:
tensor([[-14.2258],
        [-29.3985],
        [-44.5711]])"


## Handling Datasets
DiffSharp offers a few ways to handle the most common types of Datasets:
- Images
- Text
- Tensors (e.g. time series)

DiffSharp.Data contains the necessary utilities: TensorDataset, ImageDataset and TextDataset.  
Furthermore, a few well-known datasets are available: CIFAR10, CIFAR100 and MNIST.

In [None]:
open DiffSharp.Data

// 256 random inputs: the dsharp.rand draws are shifted by -0.5 and scaled by 9
// (presumably rand is uniform on [0, 1), which would put x in [-4.5, 4.5) — confirm in the DiffSharp docs)
let x_data = (dsharp.rand([256]) - 0.5) * 9

// Target function for the regression example, a degree-4 polynomial:
// x^4 + x^3 - 11 x^2 - 5 x + 30, evaluated element-wise on the tensor.
let poly(x:  Tensor) = 
    let squared = x * x
    let cubed = squared * x
    let quartic = cubed * x
    quartic + cubed - 11 * x * x - 5 * x + 30

// targets: the polynomial evaluated at every input
let y_data = poly(x_data)
// number of training epochs; used by the training loop further below
let epochs = 4000

// TensorDataset takes the input data as a tensor and the labels/target as a tensor
let ds = TensorDataset(x_data, y_data)

// a quick tour of what a dataset offers; each call prints one entry of the cell output
ds.Display()
print(ds.GetHashCode())
print(ds.ToString())
print(ds.ToDisplayString())
print(ds.length)    // get the length of the dataset
print(ds.Item(0))   // get specific items
print(ds.GetType()) // get the type of the dataset e.g. TensorDataSet, ImageDataset ...
print(ds.filter(fun x y -> (float x) > 1.))  
// filter returns a new dataset containing only the items for which the predicate holds;
// x in this case corresponds to the values from x_data and y corresponds to the values 
// from y_data, so this keeps the pairs whose input is greater than 1
print(ds.Equals(ds))
print(ds.loader(2, true, false))  // returns a DataLoader, specified by the options below
// available options (the call above sets batchSize=2, shuffle=true, dropLast=false):
//   batchSize    : int  *
//   shuffle      : option<bool>  *
//   dropLast     : option<bool>  *
//   device       : option<Device>  *   // typically the dataset is on the CPU
//   dtype        : option<Dtype>  *
//   backend      : option<Backend>  *
//   targetDevice : option<Device>  *   // device where the actual training is done
//   targetDtype  : option<Dtype>  *
//   targetBackend: option<Backend> 

// here's the train loader that is used to train the NN in the next chapter
// (only the batch size is given; every other option keeps its default)
let batchsize = 128
let trainLoader = ds.loader(batchsize)

Dataset(256)


39993852
"Dataset(256)"
"Dataset(256)
"
256
(tensor(0.9634), tensor(16.7286))
DiffSharp.Data.TensorDataset
Dataset(98)
true
DiffSharp.Data.DataLoader


### The DataLoader
These are the most important functions related to the DataLoader

In [None]:
// batch(): a single (inputs, targets) batch of the size chosen when the loader
// was created (a different size can optionally be requested)
trainLoader.batch() |> print

// epoch(): a sequence of (batch number, inputs, targets), one entry per batch
trainLoader.epoch() |> print

// length: the number of batches
trainLoader.length |> print

(tensor([ 0.9634, -3.4970, -4.0286, ..., -3.2670,  3.9466,  3.9709]),
 tensor([ 16.7286,  19.7514,  69.6332, ...,   7.9779,  143.0090,  147.9452]))
seq
  [(0, tensor([ 0.9634, -3.4970, -4.0286, ..., -3.2670,  3.9466,  3.9709]),
    tensor([ 16.7286,  19.7514,  69.6332, ...,   7.9779,  143.0090,  147.9452]));
   (1, tensor([-0.3838,  4.0720,  3.5992, ..., -0.3668,  2.6469, -2.9336]),
    tensor([ 30.2640,  169.7124,  83.9454, ...,  30.3228,   7.3297,  -1.1810]))]
2


## Building and training Neural Networks
Several loss functions are included by default:
- dsharp.nllLoss
- dsharp.crossEntropyLoss
- dsharp.mseLoss
- dsharp.bceLoss

There are also quite a few [layers](https://diffsharp.github.io/reference/diffsharp-model.html) available.

When it comes to optimizers, there are only two:
- SGD
- Adam

but it is possible to create your own.

In [None]:
open DiffSharp.Optim
open DiffSharp.Compose

// Custom loss function: the sum of the absolute differences between the
// model output (input) and the expected values (target).
let absolute_loss(input, target) = 
    let residual = input - target
    dsharp.sum(abs residual)

// The network used for the regression: three LinearB layers (1 -> 36 -> 10 -> 1)
// with a leaky ReLU (0.1) after the first and a ReLU after the second; the last
// layer is created with its bias switched off. Layers and activations are
// chained with the --> composition operator.
let model = 
    let inputLayer = LinearB(1, 36)
    let hiddenLayer = LinearB(36, 10)
    let outputLayer = LinearB(10, 1, false)
    inputLayer --> dsharp.leakyRelu(0.1) --> hiddenLayer --> dsharp.relu --> outputLayer

// defining several optimizers to imitate a learn rate schedule:
// all three are Adam optimizers over the same model and differ only in the
// tensor passed as second argument (0.0075, 0.002, 0.0005), which serves as
// the learn rate. The training loop picks one of them depending on the epoch.
let optimizer = Adam(model, dsharp.tensor(0.0075))
let optimizer2 = Adam(model, dsharp.tensor(0.002))
let optimizer3 = Adam(model, dsharp.tensor(0.0005))

// Training loop: runs `epochs` passes over the batches of trainLoader.
for epoch = 1 to epochs do
    // Since there is no learn rate scheduling, here is a naive way to imitate
    // it: choose the optimizer (and with it the learn rate) by epoch number.
    let activeOptimizer =
        if epoch <= 1000 then optimizer
        elif epoch <= 3000 then optimizer2
        else optimizer3

    // the loss of every batch is printed in each 100th epoch
    let logLoss = epoch % 100 = 0

    // the batch number delivered by epoch() is not needed here
    for _, inputs, targets in trainLoader.epoch() do
        model.reverseDiff()

        let prediction = inputs.view([-1; 1]) --> model

        // alternative: mean squared error loss
        //let msel = dsharp.mseLoss(prediction, targets.unsqueeze(1))
        //msel.reverse()

        // absolute loss
        let loss = absolute_loss(prediction, targets.unsqueeze(1))
        loss.reverse()

        if logLoss then
            print(loss)

        activeOptimizer.step()

    if epoch % 500 = 0 then
        print $"Epoch: {epoch}"

tensor(3825.5364):rev
tensor(2993.2634):rev
tensor(463.0147):rev
tensor(347.4586):rev
tensor(171.7870):rev
tensor(143.2018):rev
tensor(124.1993):rev
tensor(109.6401):rev
tensor(140.3423):rev
tensor(103.1965):rev
"Epoch: 500"
tensor(131.7450):rev
tensor(70.1396):rev
tensor(101.1958):rev
tensor(90.9501):rev
tensor(103.9215):rev
tensor(82.2672):rev
tensor(90.5190):rev
tensor(81.5198):rev
tensor(79.3801):rev
tensor(71.3603):rev
"Epoch: 1000"
tensor(73.0621):rev
tensor(62.0746):rev
tensor(72.1376):rev
tensor(62.4026):rev
tensor(71.6132):rev
tensor(64.1625):rev
tensor(71.9583):rev
tensor(62.5870):rev
tensor(75.8764):rev
tensor(74.1071):rev
"Epoch: 1500"
tensor(70.1039):rev
tensor(63.3726):rev
tensor(71.0166):rev
tensor(62.7544):rev
tensor(67.1107):rev
tensor(64.0922):rev
tensor(67.3875):rev
tensor(64.8878):rev
tensor(68.4382):rev
tensor(60.8917):rev
"Epoch: 2000"
tensor(67.3343):rev
tensor(61.7994):rev
tensor(66.0873):rev
tensor(63.2342):rev
tensor(67.7834):rev
tensor(62.1462):rev
tensor(65.

More examples, such as an RNN, a GAN, a VAE, a VAE-CNN and a classifier, can be found in the official DiffSharp GitHub repository.

## Plotting Results
DiffSharp offers a few simple options to create plots (they are available after `open DiffSharp.Util`).  
You can create line plots and histograms and save them to files.

In [None]:
// evenly spaced x values for plotting: arange is called with 6.1, 0 and 0.1 and
// the result is shifted by -3 (presumably end, start and step, which would give
// -3.0 to 3.0 in steps of 0.1 — confirm the argument order in the DiffSharp docs)
let x_order = dsharp.arange(6.1, 0, 0.1) - 3 
// true function values at those points
let y_order = poly(x_order)

// the trained model's output for the same points (fed in as a column via view([-1; 1]))
let res = x_order.view([-1; 1]) --> model

In [None]:
// Plots the true function, the model's approximation and their difference
// into one figure and saves it as "true_fun_vs_approx_fun_ipy".
let plt = Pyplot("/home/martin/anaconda3/bin/python")     // insert path to python binary here

// (y values, legend label) for each curve, drawn in this order
let curves =
    [ y_order, "true fun"                        // actual function that was approximated
      res.squeeze(), "approx fun"                // approximation of the function
      y_order - res.squeeze(), "err" ]           // error line

for ys, name in curves do
    plt.plot(x_order, ys, label=name)

plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.tightLayout()
plt.savefig("true_fun_vs_approx_fun_ipy")

In [None]:
// Plots only the error (true values minus the model output) and saves the
// figure as "Error_ipy".
let plt = Pyplot("/home/martin/anaconda3/bin/python")     // insert path to python binary here
plt.plot(x_order, y_order - res.squeeze(), label="err")   // squeeze() removes the size-1 dimension added by view([-1; 1])
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.tightLayout()
plt.savefig("Error_ipy")

Line plot with only y data:

In [None]:
// Line plot from y values only; the figure is saved as "line_example_plot_ipy".
let plt = Pyplot("/home/martin/anaconda3/bin/python")     // insert path to python binary here
plt.plot(y_order, label="Some metric/s") // if x isn't specified x starts at 0 and increases by 1
plt.xlabel("s")                          // up to the total number of points - 1 
plt.ylabel("Some metric")
plt.legend()
plt.savefig("line_example_plot_ipy")

Plotting a histogram:

In [None]:
// Histogram with 10 bins and density switched off; the figure is saved as
// "hist_example_plot_ipy".
let plt = Pyplot("/home/martin/anaconda3/bin/python")     // insert path to python binary here
// NOTE(review): going by the option list below, the second positional argument
// is ?weights, so y_order would act as per-sample weights for x_order — confirm
// that this is intended.
plt.hist(x_order, y_order, bins=10, density=false, label="Some metric/s") 
plt.xlabel("s")
plt.ylabel("Some metric")
plt.legend()
plt.savefig("hist_example_plot_ipy")

// options for hist
// x : Tensor
// ?weights : Tensor
// ?bins : int
// ?density : bool
// ?label : string 