# Problem introduction, exploratory development

## Imports

In [None]:
import (
    "fmt"
    "os"
    "io/ioutil"
    "image/color"
    "path"
    "bufio"
    "math"

    "github.com/kniren/gota/dataframe"
    "gonum.org/v1/gonum/floats"
    "gonum.org/v1/gonum/stat"
    "gonum.org/v1/plot"
    "gonum.org/v1/plot/plotter"
    "gonum.org/v1/plot/vg"
    "github.com/sajari/regression"
)

## Convenience Functions

In [None]:
// GetGraph returns the bytes corresponding to a
// saved plot.
//
// graphName is the path of the file to read. On success the full
// contents of the file are returned with a nil error. If the file
// cannot be opened or read, GetGraph returns nil and the underlying
// error.
func GetGraph(graphName string) ([]byte, error) {

    // Open the file.
    infile, err := os.Open(graphName)
    if err != nil {
        return nil, err
    }

    // Close the file on every return path, including a failed read,
    // so the file handle is never leaked.
    defer infile.Close()

    // Read in the contents of the file.
    data, err := ioutil.ReadAll(infile)
    if err != nil {
        return nil, err
    }

    return data, nil
}

## Import the Data

In [None]:
// Open the diabetes dataset file. The path is relative, so it is
// resolved against the working directory of the notebook kernel.
f, err := os.Open("../../data/diabetes.csv")
if err != nil {
    // NOTE: the error is only printed; the statements below still run
    // even when the open failed.
    fmt.Println(err)
}

// EXERCISE - Create a dataframe called diabetesDF from the CSV file.
// (Later cells refer to this value by the name diabetesDF.)

// Close the file. The value returned by Close is not checked here.
f.Close()

// EXERCISE - Take a look at the parsed data by printing the dataframe.

## Profile the data

### Distributions and summary statistics

In [None]:
// Create a histogram for each of the columns in the dataset and
// output summary statistics. One PNG named "<column>_hist.png" is
// written per column, and the mean, min, max and standard deviation
// of the column are printed to stdout.
for _, colName := range diabetesDF.Names() {

    // Create a plotter.Values value and fill it with the
    // values from the respective column of the dataframe.
    // summaryVals holds the same values as a plain []float64 for
    // the gonum stat/floats functions used below.
    plotVals := make(plotter.Values, diabetesDF.Nrow())
    summaryVals := make([]float64, diabetesDF.Nrow())
    for i, floatVal := range diabetesDF.Col(colName).Float() {
        plotVals[i] = floatVal
        summaryVals[i] = floatVal
    }

    // Make a plot and set its title.
    p, err := plot.New()
    if err != nil {
        // Without a plot value there is nothing to title or save;
        // skip this column instead of dereferencing a nil plot.
        fmt.Println(err)
        continue
    }
    p.Title.Text = fmt.Sprintf("Histogram of a %s", colName)

    // EXERCISE - Create a histogram of our values.
    // Normalize the histogram.
    // Add the histogram to the plot.

    // Save the plot to a PNG file.
    if err := p.Save(4*vg.Inch, 4*vg.Inch, colName+"_hist.png"); err != nil {
        fmt.Println(err)
    }

    // Calculate the summary statistics. The nil weights argument
    // makes stat.Mean and stat.StdDev treat every value equally.
    meanVal := stat.Mean(summaryVals, nil)
    maxVal := floats.Max(summaryVals)
    minVal := floats.Min(summaryVals)
    stdDevVal := stat.StdDev(summaryVals, nil)

    // Output the summary statistics.
    fmt.Printf("\n%s\n", colName)
    fmt.Printf("Mean: %0.2f\n", meanVal)
    fmt.Printf("Min: %0.2f\n", minVal)
    fmt.Printf("Max: %0.2f\n", maxVal)
    fmt.Printf("StdDev: %0.2f\n\n", stdDevVal)
}

In [None]:
// EXERCISE - Read the plot data from one of the histograms.
// Display the plot.

### Correlations

In [None]:
// Extract the target column.
yVals := diabetesDF.Col("y").Float()

// Create a scatter plot for each of the features in the dataset.
// One PNG named "<column>_scatter.png" is written per column, with
// the column on the x axis and the target "y" on the y axis.
for _, colName := range diabetesDF.Names() {

    // pts will hold the values for plotting
    pts := make(plotter.XYs, diabetesDF.Nrow())

    // EXERCISE - Fill pts with data.

    // Create the plot.
    p, err := plot.New()
    if err != nil {
        // No plot value to work with; skip this column rather than
        // dereferencing a nil plot below.
        fmt.Println(err)
        continue
    }
    p.X.Label.Text = colName
    p.Y.Label.Text = "y"
    p.Add(plotter.NewGrid())

    s, err := plotter.NewScatter(pts)
    if err != nil {
        // The scatter could not be built, so there is nothing to
        // style or add to the plot for this column.
        fmt.Println(err)
        continue
    }
    s.GlyphStyle.Color = color.RGBA{R: 255, B: 128, A: 255}
    s.GlyphStyle.Radius = vg.Points(3)

    // Add the scatter to the plot and save it to a PNG file.
    p.Add(s)
    if err := p.Save(4*vg.Inch, 4*vg.Inch, colName+"_scatter.png"); err != nil {
        fmt.Println(err)
    }
}

In [None]:
// Read the plot data from one of the scatter plots. The scatter-plot
// loop saves its files as colName+"_scatter.png", so this name
// corresponds to a column called "bmi".
plotBytes, err := GetGraph("bmi_scatter.png")
if err != nil {
    // The error is only printed; the display call below still runs
    // with whatever GetGraph returned (nil bytes on failure).
    fmt.Println(err)
}
    
// Display the plot.
// NOTE(review): "display" is not in this notebook's import list; it is
// presumably provided by the notebook kernel - confirm. The call is kept
// as the last top-level expression of the cell.
display.PNG(plotBytes)

## Split our data into training, test, and holdout sets 

In [None]:
// Calculate the number of elements in each set.
trainingNum := (3*diabetesDF.Nrow()) / 5
testNum := diabetesDF.Nrow() / 5
holdoutNum := diabetesDF.Nrow() / 5
if trainingNum+testNum+holdoutNum < diabetesDF.Nrow() {
    trainingNum++
}

// Create the subset indices.
trainingIdx := make([]int, trainingNum)
testIdx := make([]int, testNum)
holdoutIdx := make([]int, holdoutNum)

// Enumerate the training indices.
for i := 0; i < trainingNum; i++ {
    trainingIdx[i] = i
}

// Enumerate the test indices.
for i := 0; i < testNum; i++ {
    testIdx[i] = trainingNum + i
}

// Enumerate the holdout indices.
for i := 0; i < holdoutNum; i++ {
    holdoutIdx[i] = trainingNum + testNum + i
}

// Create the subset dataframes.
trainingDF := diabetesDF.Subset(trainingIdx)
testDF := diabetesDF.Subset(testIdx)
holdoutDF := diabetesDF.Subset(holdoutIdx)

// Create a map that will be used in writing the data
// to files.
setMap := map[int]dataframe.DataFrame{
    0: trainingDF,
    1: testDF,
    2: holdoutDF,
}

// Create the respective files.
for idx, setName := range []string{"training.csv", "test.csv", "holdout.csv"} {

    // Save the filtered dataset file.
    f, err := os.Create(path.Join("data/", setName))
    if err != nil {
        fmt.Println(err)
    }

    // Create a buffered writer.
    w := bufio.NewWriter(f)

    // Write the dataframe out as a CSV.
    if err := setMap[idx].WriteCSV(w); err != nil {
        fmt.Println(err)
    }
}

## Train our model

In [None]:
// Extract the response column from the training set (setMap key 0
// holds trainingDF).
yVals := setMap[0].Col("y").Float()

// EXERCISE - Extract the feature column(s) you want to use in your model
// into slice(s) of floats.

// EXERCISE - Train a linear regression model using your extracted 
// features and response. You can use gonum for this as was previously
// demonstrated, or you could use something like github.com/sajari/regression,
// which adds a bit of convenience.
//
// HINT - You can try using one or more than one feature in your model.
// (start with bmi and then maybe add another that you think is correlated
// with y, based on the above generated scatter plots)

// EXERCISE - Output the trained model parameters to stdout.

## Test the model

In [None]:
// Extract the response column for testing. setMap key 1 holds the
// test set (testDF); key 0 is the training set and must not be used
// here, otherwise the model is evaluated on the data it was fit to.
yValsTest := setMap[1].Col("y").Float()

// Extract the feature column(s) we want to use for testing.
bmiValsTest := setMap[1].Col("bmi").Float()
ltgValsTest := setMap[1].Col("ltg").Float()

// EXERCISE - Loop over the test data predicting y with your model
// and evaluating the prediction with the RMSE.

// EXERCISE - Output the RMSE to standard out.