# Exercise - Regression

## Imports

In [None]:
import (
    "io/ioutil"
    "fmt"
    "os"
    "image/color"
    "math"
    
    "github.com/kniren/gota/dataframe"
    "gonum.org/v1/plot"
    "gonum.org/v1/plot/plotter"
    "gonum.org/v1/plot/plotutil"
    "gonum.org/v1/plot/vg"
    "github.com/sajari/regression"
)

## Import the Data

In [None]:
// Load the CSV data file into the dataset dataframe.
// dataset is declared up front so that it still exists for the cells below
// when the file cannot be opened (it then keeps its zero value).
var dataset dataframe.DataFrame

// Open the data file.
f, err := os.Open("../data/baseball.heigh.and.weigh.csv")
if err != nil {
    // Report the failure and skip reading: f is not usable in this branch.
    fmt.Println(err)
} else {
    // Read in the contents to a dataframe.
    dataset = dataframe.ReadCSV(f)

    // Close the file once ReadCSV has returned, reporting any close failure.
    if cerr := f.Close(); cerr != nil {
        fmt.Println(cerr)
    }
}

In [None]:
// Write the dataframe's default printed form, followed by a newline, to stdout.
fmt.Fprintln(os.Stdout, dataset)

## Split our data into training and test data

In [None]:
// Calculate the number of elements in each set.
// We will utilize and 80/20 split in this case.
trainingNum := (4 * dataset.Nrow()) / 5
testNum := dataset.Nrow() / 5
if trainingNum+testNum < dataset.Nrow() {
    trainingNum++
}

// Create the subset indices.
trainingIdx := make([]int, trainingNum)
testIdx := make([]int, testNum)

// Enumerate the training indices.
for i := 0; i < trainingNum; i++ {
    trainingIdx[i] = i
}

// Enumerate the test indices.
for i := 0; i < testNum; i++ {
    testIdx[i] = trainingNum + i
}

// Create the subset dataframes.
trainingDF := dataset.Subset(trainingIdx)
testDF := dataset.Subset(testIdx)

In [None]:
// Print the row counts of the two partitions, one per line:
// the training set first, then the test set.
fmt.Printf("%d\n%d\n", trainingDF.Nrow(), testDF.Nrow())

## Training our model on the training data

In [None]:
// Model weight as a function of height with a single-variable regression
// from github.com/sajari/regression.
var r regression.Regression
r.SetObserved("weight")
r.SetVar(0, "height")

// Extract the observed values (weights) of the training rows.
y := trainingDF.Col("Weight(pounds)").Float()

// Walk the training heights and register each (weight, height) pair with the
// regression. Both columns come from trainingDF, so index i refers to the
// same row in each.
for i, height := range trainingDF.Col("Height(inches)").Float() {
    r.Train(regression.DataPoint(y[i], []float64{height}))
}

// Train/fit the regression model. Run reports failure through its returned
// error; only print the formula when the fit actually succeeded.
if err := r.Run(); err != nil {
    fmt.Println(err)
} else {
    // Output the trained model parameters.
    fmt.Printf("\nRegression Formula:\n%v\n\n", r.Formula)
}

## Evaluating our model on the test data

In [None]:
// Evaluate the fitted model on the test rows by computing the root mean
// squared error (RMSE) between observed and predicted weights.

// Extract the observed weights and test height values.
observed := testDF.Col("Weight(pounds)").Float()
testHeights := testDF.Col("Height(inches)").Float()

// Accumulate the mean squared error. Each squared residual is divided by the
// number of observations as it is added, so mSE ends up as the mean.
var mSE float64

// predictErr records the first prediction failure, if any. A failed
// prediction must not be folded into the error metric, so the loop stops at
// the first failure instead of continuing with an invalid value.
var predictErr error
for idx, oVal := range observed {

    // Make our prediction.
    prediction, err := r.Predict([]float64{testHeights[idx]})
    if err != nil {
        predictErr = err
        break
    }

    mSE += math.Pow(oVal-prediction, 2) / float64(len(observed))
}

// Calculate the RMSE. The value is only meaningful when every prediction
// succeeded (predictErr == nil).
rMSE := math.Sqrt(mSE)

// Output the result to stdout, or the prediction error when one occurred.
if predictErr != nil {
    fmt.Println(predictErr)
} else {
    fmt.Printf("RMSE: %f", rMSE)
}