Skip to content

Commit

Permalink
Add helper struct for sorting ItemSignals by Wikidata ID
Browse files Browse the repository at this point in the history
  • Loading branch information
brawer committed May 13, 2024
1 parent 1f9b0fc commit 2ad50cb
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 0 deletions.
93 changes: 93 additions & 0 deletions cmd/qrank-builder/itemsignals.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,108 @@ package main
import (
"bytes"
"context"
"encoding/binary"
"fmt"
"os"
"regexp"
"time"

"github.com/klauspost/compress/zstd"
"github.com/lanrat/extsort"
"github.com/minio/minio-go/v7"
)

// ItemSignals bundles the ranking signals recorded for one Wikidata item.
// Every field is an int64; ToBytes serializes them in declaration order,
// so the order of the fields below must not be changed casually.
type ItemSignals struct {
	// item is the numeric part of the Wikidata ID, eg 72 for Q72.
	item int64

	// Signal values attached to the item.
	pageviews, wikitextBytes, claims, identifiers, sitelinks int64
}

// If we ever want to rank signals for Wikidata lexemes, it would
// probably make sense to use a separate struct (written to a different
// output file) because it's likely not the same set of signals.
// For example, lexemes don't have pageviews, pagerank or wikitextBytes.
// https://github.com/brawer/wikidata-qrank/issues/37
// type LexemeSignals struct {}

// ToBytes serializes s into a compact byte slice. The six fields are
// written back to back, in declaration order (item, pageviews,
// wikitextBytes, claims, identifiers, sitelinks), each using the signed
// variable-length integer encoding of package encoding/binary.
// ItemSignalsFromBytes reverses the encoding.
func (s ItemSignals) ToBytes() []byte {
	fields := [...]int64{
		s.item,
		s.pageviews,
		s.wikitextBytes,
		s.claims,
		s.identifiers,
		s.sitelinks,
	}

	// Reserve the worst-case size up front so appending never reallocates.
	out := make([]byte, 0, len(fields)*binary.MaxVarintLen64)
	for _, v := range fields {
		out = binary.AppendVarint(out, v)
	}
	return out
}

// ItemSignalsFromBytes decodes a byte slice produced by ItemSignals.ToBytes
// back into an ItemSignals value, returned as an extsort.SortType.
//
// The input is read as six consecutive signed varints in the order item,
// pageviews, wikitextBytes, claims, identifiers, sitelinks. No validation
// is performed: the byte counts reported by binary.Varint are used as-is,
// so b is expected to be well-formed output of ToBytes.
func ItemSignalsFromBytes(b []byte) extsort.SortType {
	var fields [6]int64
	offset := 0
	for i := range fields {
		value, size := binary.Varint(b[offset:])
		fields[i] = value
		offset += size
	}

	return ItemSignals{
		item:          fields[0],
		pageviews:     fields[1],
		wikitextBytes: fields[2],
		claims:        fields[3],
		identifiers:   fields[4],
		sitelinks:     fields[5],
	}
}

// ItemSignalsLess reports whether a sorts before b. Both arguments must
// hold ItemSignals values; any other dynamic type makes the type
// assertion panic.
//
// The ordering is lexicographic over the fields item, pageviews,
// wikitextBytes, claims, identifiers and sitelinks, in that order: the
// first field in which the two values differ decides the result. Equal
// values are not less than each other.
func ItemSignalsLess(a, b extsort.SortType) bool {
	x, y := a.(ItemSignals), b.(ItemSignals)

	switch {
	case x.item != y.item:
		return x.item < y.item
	case x.pageviews != y.pageviews:
		return x.pageviews < y.pageviews
	case x.wikitextBytes != y.wikitextBytes:
		return x.wikitextBytes < y.wikitextBytes
	case x.claims != y.claims:
		return x.claims < y.claims
	case x.identifiers != y.identifiers:
		return x.identifiers < y.identifiers
	default:
		// Last tie-breaker; yields false when all fields are equal.
		return x.sitelinks < y.sitelinks
	}
}

// buildItemSignals builds per-item signals and puts them in storage.
// If the signals file is already in storage, it does not get re-built.
func buildItemSignals(ctx context.Context, pageviews []string, sites *map[string]WikiSite, s3 S3) (time.Time, error) {
Expand Down
57 changes: 57 additions & 0 deletions cmd/qrank-builder/itemsignals_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,68 @@ import (
"bytes"
"context"
"log"
"reflect"
"slices"
"testing"
"time"
)

// TestItemSignalsToBytes checks that an ItemSignals value survives a
// round trip through ToBytes and ItemSignalsFromBytes unchanged.
func TestItemSignalsToBytes(t *testing.T) {
	want := ItemSignals{1, 2, 3, 4, 5, 6}

	encoded := want.ToBytes()
	got := ItemSignalsFromBytes(encoded).(ItemSignals)

	if reflect.DeepEqual(got, want) {
		return
	}
	t.Errorf("got %v, want %v", got, want)
}

// TestItemSignalsLess exercises ItemSignalsLess with a table of value pairs.
//
// Each six-character string spells out one ItemSignals value: the byte at
// index 0 becomes item, index 1 pageviews, index 2 wikitextBytes, index 3
// claims, index 4 identifiers and index 5 sitelinks. The byte '7' is
// greater than '-', so a '7' marks the single field that is larger.
func TestItemSignalsLess(t *testing.T) {
	decode := func(s string) ItemSignals {
		return ItemSignals{
			item:          int64(s[0]),
			pageviews:     int64(s[1]),
			wikitextBytes: int64(s[2]),
			claims:        int64(s[3]),
			identifiers:   int64(s[4]),
			sitelinks:     int64(s[5]),
		}
	}

	type testCase struct {
		a, b string
		want bool
	}
	cases := []testCase{
		{"123456", "123456", false},
		{"923456", "123456", false},
		{"123456", "923456", true},

		{"------", "------", false},
		{"7-----", "------", false},
		{"-7----", "------", false},
		{"--7---", "------", false},
		{"---7--", "------", false},
		{"----7-", "------", false},
		{"-----7", "------", false},
		{"------", "7-----", true},
		{"------", "-7----", true},
		{"------", "--7---", true},
		{"------", "---7--", true},
		{"------", "----7-", true},
		{"------", "-----7", true},
	}

	for _, tc := range cases {
		a, b := decode(tc.a), decode(tc.b)
		if got := ItemSignalsLess(a, b); got != tc.want {
			t.Errorf("got %v, want %v, for ItemSignalsLess(%#v, %#v)", got, tc.want, a, b)
		}
	}
}

func TestBuildItemSignals(t *testing.T) {
logger = log.New(&bytes.Buffer{}, "", log.Lshortfile)
ctx := context.Background()
Expand Down

0 comments on commit 2ad50cb

Please sign in to comment.