# Linear Regression

### 1. import libraries

In [None]:

import "fmt"
import "gonum.org/v1/gonum/blas/blas64"
import "gonum.org/v1/gonum/mat"
import "gonum.org/v1/gonum/floats"
import "gonum.org/v1/gonum/stat"
import "gonum.org/v1/plot"
import "gonum.org/v1/plot/plotter"
import "gonum.org/v1/plot/plotutil"
import "gonum.org/v1/plot/vg"
//import "github.com/montanaflynn/stats" // be careful, it is different from gonum's stat
import "strings"
import "strconv"
import "github.com/kniren/gota/dataframe"
import "os"
import "encoding/csv"
import "github.com/kniren/gota/series"
import "math"
import "image/color"
import "io/ioutil"

In [None]:

// GetGraph reads the whole file named graphName and returns its contents.
// It is a utility for loading the saved graph images (used later).
//
// On failure it returns nil bytes together with the error from opening or
// reading the file.
func GetGraph(graphName string) ([]byte, error) {
	infile, err := os.Open(graphName)
	if err != nil {
		// Report the open failure itself instead of a later read error on an invalid file.
		return nil, err
	}
	defer infile.Close()

	data, err := ioutil.ReadAll(infile)
	if err != nil {
		return nil, err
	}
	return data, nil
}

#### 2. Load working csv file

In [None]:
// Open the CSV data file. An open error is only printed; nothing stops
// later cells from using file.
file, err := os.Open("../data/basketball.heigh.and.weigh.csv")
if err != nil {
    fmt.Println(err)
}

In [None]:
// Load the opened CSV file into a gota DataFrame.
dataset := dataframe.ReadCSV(file)

In [None]:
// Close the CSV file now that it has been handed to ReadCSV.
file.Close()

In [None]:
// Print the loaded DataFrame.
fmt.Println(dataset)

### We can select one or more columns of a DataFrame:

In [None]:
// Print the elements at indexes 1 through 5 of the "Team" column.
fmt.Println(dataset.Col("Team").Subset([]int{1,2,3,4,5}))

In [None]:
// Print only the "Name" and "Position" columns.
fmt.Println(dataset.Select([]string{"Name","Position"}))

##### Select name, position and weight for the Chicago White Sox (team code "CWS")

In [None]:
// Filter that matches the rows whose "Team" column equals "CWS".
filter := dataframe.F{
    Colname:    "Team",
    Comparator: series.Eq,
    Comparando: "CWS",
}

In [None]:
// Apply the filter, then print the "Name", "Position" and "Weight(pounds)" columns of the matching rows.
fmt.Println(dataset.Filter(filter).Select([]string{"Name","Position","Weight(pounds)"}))

### Find the average weight and height by team

Find all teams (without duplicates)

In [None]:
// Values of the "Team" column, one entry per row (duplicates included).
teams :=dataset.Col("Team").Records()

In [None]:
// Collapse the team list into a map keyed by team name, so every team
// appears exactly once; the value simply repeats the key.
teamsMap := make(map[string]string)
for idx := range teams {
    name := teams[idx]
    teamsMap[name] = name
}

In [None]:
// Print the map of distinct team names.
fmt.Println(teamsMap)

### Measures of central tendency

For each team, find the average (the mean) and the mode of weight and height

In [None]:
// Per-team mean and mode of weight and height.
// The header names the same five columns that every row prints.
fmt.Printf("%s \t %s \t \t %s \t \t %s \t \t %s \n", "key", "wMean", "hMean", "wMode", "hMode")
for key := range teamsMap {
    // Filter that keeps only the rows of the current team.
    filter := dataframe.F{
        Colname:    "Team",
        Comparator: series.Eq,
        Comparando: key,
    }
    // Filter once and reuse the result for both columns.
    teamRows := dataset.Filter(filter)
    keyWeights := teamRows.Col("Weight(pounds)").Float()
    keyHeights := teamRows.Col("Height(inches)").Float()

    // Unweighted statistics (nil weights).
    wMean := stat.Mean(keyWeights, nil)
    hMean := stat.Mean(keyHeights, nil)
    // stat.Mode returns the mode and its count; only the mode is reported.
    wMode, _ := stat.Mode(keyWeights, nil)
    hMode, _ := stat.Mode(keyHeights, nil)
    fmt.Printf("%s \t %f \t %f \t %f \t %f \n", key, wMean, hMean, wMode, hMode)
}

### Measures of spread or dispersion

Let's focus on the weight: calculate the minimum, maximum, range, variance and standard deviation for each team

In [None]:
for key :=range teamsMap{
    // create a filter over the datafrate using the key map to filter data for team
    filter := dataframe.F{
        Colname:    "Team",
        Comparator: series.Eq,
        Comparando: key,
    }
    // get array, this time only with weights information using the filter (for team)
    keyWeights:= dataset.Filter(filter).Col("Weight(pounds)").Float()

    //calculate metrics
    minVal := floats.Min(keyWeights)
    maxVal := floats.Max(keyWeights)
    varianceVal := stat.Variance(keyWeights, nil)
    stdDevVal := stat.StdDev(keyWeights, nil)

    fmt.Printf("%s \t %f \t %f \t %f \t %f \n", key, minVal, maxVal, varianceVal, stdDevVal)
}

#### Visualize the relationship between weight and height

In [None]:
// Create the plot used for the scatter chart; a creation error is only printed.
plotGrid, err := plot.New()
if err != nil {
    fmt.Println(err)
}

In [None]:
// Label the axes (height on X, weight on Y) and give the plot its title.
plotGrid.X.Label.Text = "Height"
plotGrid.Y.Label.Text = "Weight"
plotGrid.Title.Text = "relationship between width and heigh"

In [None]:
// Add a grid plotter to the plot.
plotGrid.Add(plotter.NewGrid())

In [None]:

// Keep only the two numeric columns: height is column 0 and weight is column 1.
widthsHeighs := dataset.Select([]string{"Height(inches)","Weight(pounds)"})


In [None]:
// Print the height/weight selection.
fmt.Println(widthsHeighs)

In [None]:
// Build one scatter point per row: X is the height (column 0) and Y is the
// weight (column 1) of the selection.
ptsPlot := make(plotter.XYs, widthsHeighs.Nrow())
for i := range ptsPlot {
    height := widthsHeighs.Elem(i, 0)
    weight := widthsHeighs.Elem(i, 1)
    ptsPlot[i].X = height.Float()
    ptsPlot[i].Y = weight.Float()
    // Missing (NA) values are plotted as 0.
    if height.IsNA() {
        ptsPlot[i].X = 0
    }
    if weight.IsNA() {
        ptsPlot[i].Y = 0
    }
}

In [None]:
// Create a scatter plotter from the points; a creation error is only printed.
s, err := plotter.NewScatter(ptsPlot)
if err != nil {
    fmt.Println(err)
}

In [None]:
// Draw the glyphs in an opaque color with full red, no green and half blue.
s.GlyphStyle.Color = color.RGBA{R: 255, B: 128, A: 255}

In [None]:
// Add the scatter plotter to the plot.
plotGrid.Add(s)

In [None]:
// Write the plot to "wvsh1.png" as a 4x4 inch image; a save error is only printed.
if saveErr := plotGrid.Save(4*vg.Inch, 4*vg.Inch, "wvsh1.png"); saveErr != nil {
	fmt.Println(saveErr)
}


In [None]:
// Read the saved scatter plot back as raw bytes; a read error is only printed.
graph1, err :=GetGraph("wvsh1.png")
if err != nil {
    fmt.Println(err)
}

In [None]:
// Presumably renders the PNG bytes inline in the notebook; display is provided by the notebook kernel (confirm).
display.PNG(graph1)

#### Visualize the distribution of the weights for all teams

In [None]:
// Save one weight histogram per team as "<team>_hist.png".
for key := range teamsMap {
    // Filter that keeps only the rows of the current team.
    filter := dataframe.F{
        Colname:    "Team",
        Comparator: series.Eq,
        Comparando: key,
    }
    // Weights of the current team's players, as the value type NewHist expects.
    v := plotter.Values(dataset.Filter(filter).Col("Weight(pounds)").Float())

    // Make a plot and set its title.
    p, err := plot.New()
    if err != nil {
        fmt.Println(err)
        continue // without a plot there is nothing to draw on for this team
    }
    p.Title.Text = fmt.Sprintf("Histogram of a %s", key)

    // Create a histogram of the team's weights with 16 bins.
    h, err := plotter.NewHist(v, 16)
    if err != nil {
        fmt.Println(err)
        continue // skip this team rather than use a histogram that failed to build
    }
    // Normalize the histogram so that its total area sums to 1.
    h.Normalize(1)
    // Add the histogram to the plot.
    p.Add(h)
    if err := p.Save(4*vg.Inch, 4*vg.Inch, key+"_hist.png"); err != nil {
        fmt.Println(err)
    }
}

In [None]:
// Read the histogram saved for team "BAL" and display it.
// If the file cannot be read, print the error instead of displaying nothing.
graph, err := GetGraph("BAL_hist.png")
if err != nil {
    fmt.Println(err)
} else {
    display.PNG(graph)
}

Find the linear regression of this data. The idea is to find the best-fit line y = alpha + beta*x, where x is the height and y is the weight.

In [None]:
// Fit weight (y) against height (x) with no sample weights (nil).
// Passing false for the origin argument does not force the line through the origin, so alpha is the fitted intercept.
alpha, beta := stat.LinearRegression(widthsHeighs.Col("Height(inches)").Float(),widthsHeighs.Col("Weight(pounds)").Float(), nil, false)

In [None]:
// Print the fitted line with its intercept (alpha) and slope (beta).
fmt.Printf("y = %f + %f * x",alpha,beta) 