In [1]:
// To train the model in this example we'll use a data dump from the guardian
// which is contained in ./guardian
// First of all we will read the text files and create a corpus

In [2]:
import "io/ioutil"
files, _ := ioutil.ReadDir("./guardian")

In [3]:
var trainingData = make([]string, len(files))

In [4]:
import "fmt"

// Copy all of the files into the corpus
for i, file := range files {
    b, _ := ioutil.ReadFile(fmt.Sprintf("./guardian/%s", file.Name()))
    trainingData[i] = string(b)
}

In [5]:
// Now we have the training data we need to get our product data which is stored in CSV format
// For simplicity we will create a struct to unmarshal some of the data into

In [6]:
type Bet struct {
    ID          string `csv:"ID"`
    EventName   string `csv:"EventName"`
    Sport       string `csv:"Sport"`
    RawType     string `csv:"RawType"`
    Competition string `csv:"Competition"`
    Home        string `csv:"Home"`
    Away        string `csv:"Home"`
    Options     string `csv:"Options"`
}

// Each bet needs to be represented as a single line of the the corpus so write a quite helper function
func (b Bet) ToString() string {
    return fmt.Sprintf(
        "%s %s %s %s %s %s %s",
        b.EventName,
        b.Sport,
        b.RawType,
        b.Competition,
        b.Home,
        b.Away,
        b.Options,
    )
}

In [7]:
import (
    "os"
    
    "github.com/gocarina/gocsv"
)

// marshal our csv
betsFile, _ := os.OpenFile("bets.csv", os.O_RDWR|os.O_CREATE, os.ModePerm)

var bets []*Bet

gocsv.UnmarshalFile(betsFile, &bets)

betsFile.Close()

<nil>


In [8]:
// create our product (bet) corpus
var productCorpus = make([]string, len(bets))

for i, bet := range bets{
    productCorpus[i] = bet.ToString()
}

In [9]:
// Now all the data is gathered we can start building our pipelines.
// The first one will be a TDIDF which takes place after a count vectoriser

In [10]:
import "github.com/james-bowman/nlp"

var tfIdfPipeline = nlp.NewPipeline(
    nlp.NewCountVectoriser(),
    nlp.NewTfidfTransformer(),
)

In [11]:
// Fit the pipeline with the training data
tfIdfVectorisor := tfIdfPipeline.Fit(trainingData...)

Sports fans back governing bodies' support for marriage equality, study shows

Survey of dedicated fans finds 59%!a(MISSING)gree major codes were right to back the yes campaign in the same-sex marriage survey debate

The AFL temporarily changed the logo on its headquarters to show support for marriage equality. A survey of fans has found most supported governing bodies making a public stance.

Photograph: Paul Rovere/Getty Images

Most dedicated sports fans agree that governing bodies were right to support the marriage equality campaign, a survey has found.

The study by Monash University’s behavioural science laboratory and YouGov found 59%!o(MISSING)f those heavily engaged in sport (dubbed “superfans”) approved of sporting organisations’ involvement in the same-sex marriage survey debate – in which the AFL, NRL, ARU, FFA and Cricket Australia all came out in support of marriage equality.

The position of the sporting bodies during the campaign faced a chorus of public criticism, incl

In [18]:
// Transform the Product Corpus
tfIdfMatrix, _ := tfIdfVectorisor.Transform(productCorpus...)

In [13]:
// Repeat the steps for LDA. Create pipeline, Fit then Transform.
// For the LDA you define the number of topics which can be tuned.

In [14]:
var ldaTopics = 100

In [15]:
var ldaPipeline = nlp.NewPipeline(
    nlp.NewCountVectoriser(),
    nlp.NewLatentDirichletAllocation(ldaTopics),
)

In [16]:
// Fit the pipeline with the training data
ldafVectorisor := ldaPipeline.Fit(trainingData...)

In [19]:
ldaMatrix, _ := ldafVectorisor.Transform(productCorpus...)

In [20]:
import (
    "github.com/james-bowman/sparse"
    "github.com/james-bowman/nlp/measures/pairwise"
)

In [21]:
sparseTfIdf := tfIdfMatrix.(*sparse.CSR).ToCSC()

In [44]:
_, numDocs := sparseTfIdf.Dims()

In [156]:
var doc = `Large as the Ageas Bowl is, and loud as Sunday’s crowd were, Joe Root’s voice carried right across the outfield, “Jonny,” he said again. Down at third man, Jonny Bairstow turned and gave him a wave of his hand. “One second,” the gesture said. Bairstow had just stepped over the boundary rope between deliveries to talk to the children huddled behind the hoardings, four girls, four boys, all of them holding out match programmes, miniature bats and scorecards for him to sign. Away in the middle, Bairstow’s teammates waited while he signed one more and then promised the rest he would come back to do theirs later. The kids waited for him.Yaya Toure's agent has posted a cryptic message on Twitter claiming the Ivory Coast international has passed a medical in London.

Whoever first decided to lay this ground down in the Hampshire sticks, six miles outside Southampton, must have been thinking of that line from Field of Dreams: “If you build it, they will come.” At some point they decided that they needed more than a cricket pitch and a few grandstands to bring in the locals. There must be more sideshow attractions here than there are at any other ground in England.`

In [157]:
query, _ := tfIdfVectorisor.Transform(doc)
sparseQuery := query.(*sparse.CSR).ToCSC()

In [158]:
// Use a cosine similarity to compare our query doc vector with each of our product vectors
tfIdfScores := make([]float64, numDocs)

for i := 0; i < numDocs; i++ {
    tfIdfScores[i] = pairwise.CosineSimilarity(sparseQuery.ColView(0), sparseTfIdf.ColView(i))
}

In [159]:
ldaQuery, _ := ldafVectorisor.Transform(doc)

In [160]:
import "gonum.org/v1/gonum/mat"

ldaScores := make([]float64, numDocs)
for i := 0; i < docs; i++ {
    ldaScores[i] = pairwise.CosineSimilarity(ldaQuery.(mat.ColViewer).ColView(0), ldaMatrix.(mat.ColViewer).ColView(i))
}

In [167]:
// combine the scores with a weighting multiplier for each measure

tfIdfWeight := 10.0
ldaWeight := 1.0

combinedScores := make([]float64, numDocs)
for i := 0; i < docs; i++ {
    combinedScores[i] = tfIdfScores[i] * tfIdfWeight + ldaScores[i] * ldaWeight
}

In [168]:
// Use argsort to get order these
import "gonum.org/v1/gonum/floats"

inds := make([]int, numDocs)

floats.Argsort(combinedScores, inds)

In [169]:
// print out the top 10 of the scores
for i := 0; i < 10; i++{
    index := numDocs - 1 - i
    fmt.Printf("%f: %s\n", combinedScores[index], productCorpus[index])
}

2.684080: Egypt vs Uruguay Football 3W Handicap World Cup 2018 Egypt  :Home:Away:Home:Away:Home:Away
1.941195: Egypt vs Uruguay Football Exact score World Cup 2018 Egypt  :0:0:1:1:2:2:3:3:0:1:0:2:0:3:0:4:0:5:0:6:1:2:1:3:1:4:1:5:2:3:2:4:2:5:1:0:2:0:2:1:3:0:3:1:3:2
1.898969: UEFA Nations League 2018/19 - Winner Football Outright winner UEFA Nations League   :Spain:Germany:France:Belgium:Italy:England:Portugal:Croatia:Switzerland:Poland:Netherlands:Iceland
1.858263: Ireland - 1st Div 2018 - Winner Football Outright winner Ireland - 1st Div   :Galway FC:Drogheda:Finn Harps:Longford:Shelbourne:Cobh Ramblers:Uni College Dublin:Cabinteely FC:Wexford Youths:Athlone Town
1.816322: Jack Wilshere - Club after summer transfer window [Loan deals do count] Football Outright winner Transfers Specials   :Wolves:PSG:Everton:AC Milan:West Ham:Bournemouth:Crystal Palace:Juventus:Chelsea:Liverpool:Newcastle:Leicester:Manchester United :Southampton:To Stay at Arsenal
1.809074: James McClean - Club After Su