In [1]:
// To train the model in this example we'll use a data dump from the guardian
// which is contained in ./guardian
// First of all we will read the text files and create a corpus

In [2]:
import "io/ioutil"
files, _ := ioutil.ReadDir("./guardian")

In [3]:
var trainingData = make([]string, len(files))

In [4]:
import "fmt"

// Copy all of the files into the corpus
for i, file := range files {
    b, _ := ioutil.ReadFile(fmt.Sprintf("./guardian/%s", file.Name()))
    trainingData[i] = string(b)
}

In [5]:
// Now we have the training data we need to get our product data which is stored in CSV format
// For simplicity we will create a struct to unmarshal some of the data into

In [6]:
type Bet struct {
    ID          string `csv:"ID"`
    EventName   string `csv:"EventName"`
    Sport       string `csv:"Sport"`
    RawType     string `csv:"RawType"`
    Competition string `csv:"Competition"`
    Home        string `csv:"Home"`
    Away        string `csv:"Home"`
    Options     string `csv:"Options"`
}

// Each bet needs to be represented as a single line of the the corpus so write a quite helper function
func (b Bet) ToString() string {
    return fmt.Sprintf(
        "%s %s %s %s %s %s %s",
        b.EventName,
        b.Sport,
        b.RawType,
        b.Competition,
        b.Home,
        b.Away,
        b.Options,
    )
}

In [7]:
import (
    "os"
    
    "github.com/gocarina/gocsv"
)

// marshal our csv
betsFile, _ := os.OpenFile("bets.csv", os.O_RDWR|os.O_CREATE, os.ModePerm)

var bets []*Bet

gocsv.UnmarshalFile(betsFile, &bets)

betsFile.Close()

<nil>


In [8]:
// create our product (bet) corpus
var productCorpus = make([]string, len(bets))

for i, bet := range bets{
    productCorpus[i] = bet.ToString()
}

In [9]:
// Now all the data is gathered we can start building our pipelines.
// The first one will be a TDIDF which takes place after a count vectoriser

In [10]:
import "github.com/james-bowman/nlp"

var tfIdfPipeline = nlp.NewPipeline(
    nlp.NewCountVectoriser(),
    nlp.NewTfidfTransformer(),
)

In [11]:
// Fit the pipeline with the training data
tfIdfVectorisor := tfIdfPipeline.Fit(trainingData...)

Sports fans back governing bodies' support for marriage equality, study shows

Survey of dedicated fans finds 59%!a(MISSING)gree major codes were right to back the yes campaign in the same-sex marriage survey debate

The AFL temporarily changed the logo on its headquarters to show support for marriage equality. A survey of fans has found most supported governing bodies making a public stance.

Photograph: Paul Rovere/Getty Images

Most dedicated sports fans agree that governing bodies were right to support the marriage equality campaign, a survey has found.

The study by Monash University’s behavioural science laboratory and YouGov found 59%!o(MISSING)f those heavily engaged in sport (dubbed “superfans”) approved of sporting organisations’ involvement in the same-sex marriage survey debate – in which the AFL, NRL, ARU, FFA and Cricket Australia all came out in support of marriage equality.

The position of the sporting bodies during the campaign faced a chorus of public criticism, incl

In [18]:
// Transform the Product Corpus
tfIdfMatrix, _ := tfIdfVectorisor.Transform(productCorpus...)

In [13]:
// Repeat the steps for LDA. Create pipeline, Fit then Transform.
// For the LDA you define the number of topics which can be tuned.

In [14]:
var ldaTopics = 100

In [15]:
var ldaPipeline = nlp.NewPipeline(
    nlp.NewCountVectoriser(),
    nlp.NewLatentDirichletAllocation(ldaTopics),
)

In [16]:
// Fit the pipeline with the training data
ldafVectorisor := ldaPipeline.Fit(trainingData...)

In [19]:
ldaMatrix, _ := ldafVectorisor.Transform(productCorpus...)

In [20]:
import (
    "github.com/james-bowman/sparse"
    "github.com/james-bowman/nlp/measures/pairwise"
)

In [21]:
sparseTfIdf := tfIdfMatrix.(*sparse.CSR).ToCSC()

In [44]:
_, numDocs := sparseTfIdf.Dims()

In [23]:
var doc = `Yaya Toure's agent has posted a cryptic message on Twitter claiming the Ivory Coast international has passed a medical in London.

Toure's deal at the Etihad expired earlier this summer as he ended his eight-year affair with the club in style - winning his third Premier League title in record-breaking fashion.

The former Barcelona midfielder is yet to announce where he will be moving next, but his agent dropped a big hint on Tuesday evening suggesting news of his future is close.

He confirmed the player is not moving to West Ham or Crystal Palace in follow-up posts.

West Ham boss Mauricio Pellegrini had already ruled out a move for the free agent, who he won the Premier League with while in charge of Manchester City.


The East End club, according to reports, had previously considered making a move for the 35-year-old.

In May, Toure's agent told Sky Sports News his client was keen to remain in the Premier League after leaving City, revealing he would be willing pay back half his wages if his new club were not satisfied with his performances.

"Yaya is fit and desperate to prove he's still the best central midfielder in the Premier League," Seluk said.

"Ivory Coast will not be in Russia for the World Cup and next season he will be fit, rested and stronger than ever.
`

In [31]:
query, _ := tfIdfVectorisor.Transform(doc)
sparseQuery := query.(*sparse.CSR).ToCSC()

In [91]:
scores := make([]float64, numDocs)

for i := 0; i < numDocs; i++ {
    scores[i] = pairwise.CosineSimilarity(sparseQuery.ColView(0), sparseTfIdf.ColView(i))
}

In [92]:
import "gonum.org/v1/gonum/floats"

inds := make([]int, numDocs)

floats.Argsort(scores, inds)

In [93]:
// get the top 10
for i := 0; i < 10; i++ {
    index := inds[i]
    fmt.Printf("%f - %s\n", scores[index], productCorpus[index])
}

0.000013 - FC Nitra B v FK Puchov Football Resultado exacto Slovakia - 3. Liga FC Nitra B  :1 - 1:1 - 2:2 - 1:2 - 2:1 - 3:3 - 1:3 - 2:1 - 7:2 - 7:3 - 6:4 - 5:5 - 3:5 - 4:6 - 1:6 - 2:2 - 3:1 - 4:1 - 5:2 - 4:4 - 1:1 - 6:2 - 5:3 - 3:3 - 4:4 - 3:5 - 1:1 - 8:2 - 6:3 - 5:4 - 2:4 - 4:5 - 2:6 - 3:7 - 1:7 - 2:8 - 1
0.233635 - Ireland - 1st Div 2018 - Winner Football Outright winner Ireland - 1st Div   :Galway FC:Drogheda:Finn Harps:Longford:Shelbourne:Cobh Ramblers:Uni College Dublin:Cabinteely FC:Wexford Youths:Athlone Town
0.000000 - Dolgoprudny 4 v 1 FC Luki-SKIF Velikiye Luki Football Full Time Result Football Live Dolgoprudny  :Dolgoprudny:Draw:FC Luki-SKIF Velikiye Luki
0.000000 - Rabotnicki 85 v 84 MZT Skopje Basketball Match Handicap (Inc. OT) Basketball Live Rabotnicki  :Rabotnicki:MZT Skopje
0.000000 - Anastasia Pavlyuchenkova* 1 v 0 Zarina Diyas Tennis Total Sets Live. - 2.5 Tennis Live Anastasia Pavlyuchenkova  :Under (2.5):Over (2.5)
0.000000 - 20.34 Crayford - Extra Place Race Gre

In [74]:
// Get the top scoring index
topIndex := inds[len(inds)-1]
fmt.Printf("%d", topIndex)

422165
<nil>


In [75]:
// get the top score
topScore := scores[topIndex]
fmt.Println("%f", topScore)

%f 0
5
<nil>


In [76]:
// get that item
fmt.Println("%s", productCorpus[topIndex])

%s Crystal Palace v Manchester United Football Request-a-Bet Specials Premier League Crystal Palace  :Player A to score 40 or more Premier League goals 16/17:Team B to go unbeaten in the 16/17 Premier League season
215
<nil>


In [34]:
ldaQuery, _ := ldafVectorisor.Transform(doc)

In [87]:
import "gonum.org/v1/gonum/mat"

highestSimilarity := -1.0
var matched int
for i := 0; i < docs; i++ {
    similarity := pairwise.CosineSimilarity(ldaQuery.(mat.ColViewer).ColView(0), ldaMatrix.(mat.ColViewer).ColView(i))
    if similarity < highestSimilarity {
        matched = i
        highestSimilarity = similarity
    }
}

In [88]:
fmt.Printf("%f, %s", highestSimilarity, productCorpus[matched])

-1.000000, Canberra Raiders 21 v 20 Manly Sea Eagles Rugby League Match Alternative Handicap 4 Rugby League Live Canberra Raiders  :Canberra Raiders:Manly-Warringah Sea Eagles175
<nil>
