In [1]:
// To train the model in this example we'll use a data dump from the guardian
// which is contained in ./guardian
// First of all we will read the text files and create a corpus

In [2]:
import "io/ioutil"
files, _ := ioutil.ReadDir("./guardian")

In [3]:
var trainingData = make([]string, len(files))

In [4]:
import "fmt"

// Copy all of the files into the corpus
for i, file := range files {
    b, _ := ioutil.ReadFile(fmt.Sprintf("./guardian/%s", file.Name()))
    trainingData[i] = string(b)
}

In [5]:
// Now we have the training data we need to get our product data which is stored in CSV format
// For simplicity we will create a struct to unmarshal some of the data into

In [6]:
type Bet struct {
    ID          string `csv:"ID"`
    EventName   string `csv:"EventName"`
    Sport       string `csv:"Sport"`
    RawType     string `csv:"RawType"`
    Competition string `csv:"Competition"`
    Home        string `csv:"Home"`
    Away        string `csv:"Home"`
    Options     string `csv:"Options"`
}

// Each bet needs to be represented as a single line of the the corpus so write a quite helper function
func (b Bet) ToString() string {
    return fmt.Sprintf(
        "%s %s %s %s %s %s %s",
        b.EventName,
        b.Sport,
        b.RawType,
        b.Competition,
        b.Home,
        b.Away,
        b.Options,
    )
}

In [7]:
import (
    "os"
    
    "github.com/gocarina/gocsv"
)

// marshal our csv
betsFile, _ := os.OpenFile("bets.csv", os.O_RDWR|os.O_CREATE, os.ModePerm)

var bets []*Bet

gocsv.UnmarshalFile(betsFile, &bets)

betsFile.Close()

<nil>


In [8]:
// create our product (bet) corpus
var productCorpus = make([]string, len(bets))

for i, bet := range bets{
    productCorpus[i] = bet.ToString()
}

In [9]:
// Now all the data is gathered we can start building our pipelines.
// The first one will be a TDIDF which takes place after a count vectoriser

In [10]:
import "github.com/james-bowman/nlp"

var tfIdfPipeline = nlp.NewPipeline(
    nlp.NewCountVectoriser(),
    nlp.NewTfidfTransformer(),
)

In [250]:
// Fit the pipeline with the training data
tfIdfVectorisor := tfIdfPipeline.Fit(trainingData...)

In [18]:
// Transform the Product Corpus
tfIdfMatrix, _ := tfIdfVectorisor.Transform(productCorpus...)

In [13]:
// Repeat the steps for LDA. Create pipeline, Fit then Transform.
// For the LDA you define the number of topics which can be tuned.

In [14]:
var ldaTopics = 100

In [15]:
var ldaPipeline = nlp.NewPipeline(
    nlp.NewCountVectoriser(),
    nlp.NewLatentDirichletAllocation(ldaTopics),
)

In [16]:
// Fit the pipeline with the training data
ldafVectorisor := ldaPipeline.Fit(trainingData...)

In [19]:
ldaMatrix, _ := ldafVectorisor.Transform(productCorpus...)

In [20]:
import (
    "github.com/james-bowman/sparse"
    "github.com/james-bowman/nlp/measures/pairwise"
)

In [21]:
sparseTfIdf := tfIdfMatrix.(*sparse.CSR).ToCSC()

In [44]:
_, numDocs := sparseTfIdf.Dims()

In [242]:
var doc = `Salford moved clear at the top of the Super 8s Qualifiers after securing a third straight win and clinically dispatching Halifax 62-4 at the MBi Shay Stadium.

The Red Devils beat Hull KR and Widnes in their opening two fixtures and this third victory sees them take a significant step towards Super League survival.

Ian Watson's side had too much quality for their hosts and tries from Jackson Hastings, Ed Chamberlain, Derrell Olpherts (two), Ben Nakubuwai and Kris Welham helped them into a 30-4 interval lead.

From there Salford did not look back and second-half scores from Olpherts, Lama Tasi, Junior Sa'u (two), Hastings and Josh Wood completed a handsome 12-try victory.

Halifax, who opened the scoring through Sam Wood, remain without a point to their name in the Qualifiers after previous defeats to Toronto and Toulouse.

They led in the ninth minute when centre Ben Heaton embarked on a fine run down the left-hand touchline before sending Wood over for a try which Shane Grady failed to convert.

Yet the advantage did not last long as Salford hit back when Hastings showed impressive strength to drive through the Halifax defence inside the left channel.

Chamberlain missed the conversion but Halifax were dealt a blow in the 18th minute when prop Jordan Baldwinson was forced off with a leg injury and replaced by Jacob Fairbank.

Moments later the Red Devils claimed a fine score when Hastings' high bomb was expertly fielded by Welham, whose offload found Robert Lui in the right channel.

Lui found Chamberlain advancing at pace and he touched down in the right corner for a try he then converted.`

In [243]:
query, _ := tfIdfVectorisor.Transform(doc)
sparseQuery := query.(*sparse.CSR).ToCSC()

In [244]:
// Use a cosine similarity to compare our query doc vector with each of our product vectors
tfIdfScores := make([]float64, numDocs)

for i := 0; i < numDocs; i++ {
    tfIdfScores[i] = pairwise.CosineSimilarity(sparseQuery.ColView(0), sparseTfIdf.ColView(i))
}

In [245]:
ldaQuery, _ := ldafVectorisor.Transform(doc)

In [246]:
import "gonum.org/v1/gonum/mat"

ldaScores := make([]float64, numDocs)
for i := 0; i < docs; i++ {
    ldaScores[i] = pairwise.CosineSimilarity(ldaQuery.(mat.ColViewer).ColView(0), ldaMatrix.(mat.ColViewer).ColView(i))
}

In [247]:
// combine the scores with a weighting multiplier for each measure

tfIdfWeight := 5.0
ldaWeight := 1.0

combinedScores := make([]float64, numDocs)
for i := 0; i < docs; i++ {
    combinedScores[i] = (tfIdfScores[i]*tfIdfWeight + ldaScores[i]*ldaWeight) / tfIdfWeight * ldaWeight
}

In [248]:
// Use argsort to get order these
import "gonum.org/v1/gonum/floats"

inds := make([]int, numDocs)

floats.Argsort(combinedScores, inds)

In [249]:
// print out the top 10 of the scores
for i := 0; i < 10; i++ {
    index := numDocs - 1 - i
    fmt.Printf("%f: %s\n", combinedScores[index], productCorpus[inds[index]])
}

0.507461: Salford v Huddersfield Rugby League RequestABet Specials Up to 5/1 Super League Salford Red Devils  :Huddersfield to Score First, Lead at Half Time & Win the Match:Salford to Score First, Lead at Half Time & Win the Match
0.464580: Salford v Huddersfield Rugby League RequestABet Specials 11/1 to 28/1 Super League Salford Red Devils  :Rankin, Mcgillvary and Sa’u tryscorers, Giants to win and 30+ points in match:Robert Lui, Niall Evalds & Weller Hauraki to score tries
0.452839: Salford v Huddersfield Rugby League RequestABet Specials 33/1 to 66/1 Super League Salford Red Devils  :Danny Brough To Score 1st try & Huddersfield To Win:Craig Kopczak To Score 1st try & Salford To Win
0.443542: Halifax v Featherstone Rugby League Match Result Kingstone Press Championship Halifax RLFC  :Halifax RLFC:Drawn Match:Featherstone
0.424202: Salford v Huddersfield Rugby League RequestABet Specials 80/1 and above Super League Salford Red Devils  :Jermaine McGilvary 1st tryscorer and Jake Mamo l