# Ingest dataset into a sparse matrix

Anton Antonov  
RakuForPrediction at WordPress   
July, October 2025

----

## Setup

In [None]:
use Data::Reshapers;
use Math::SparseMatrix :ALL;
use Math::SparseMatrix::DOK;
use Math::SparseMatrix::Utilities;

use ML::SparseMatrixRecommender;
use ML::TriesWithFrequencies;

In [None]:
#% javascript
require.config({
     paths: {
     d3: 'https://d3js.org/d3.v7.min'
}});

require(['d3'], function(d3) {
     console.log(d3);
});

In [None]:
#% js
# Line plot of 40 random values, each produced by a separate `10.rand` call.
# NOTE(review): presumably a quick check that D3 plots render in this notebook — confirm.
js-d3-list-line-plot(10.rand xx 40, background => 'none', stroke-width => 2)

In [None]:
# Shared styling values referenced by the plotting cells.

# -- Colors
my $background               = '#1F1F1F';
my $title-color              = 'Silver';
my $stroke-color             = 'SlateGray';
my $tooltip-color            = 'LightBlue';
my $tooltip-background-color = 'none';
my $color-scheme             = 'schemeTableau10';
my $color-palette            = 'Inferno';

# -- Tick labels
my $tick-labels-color       = 'Silver';
my $tick-labels-font-family = 'Helvetica';
my $tick-labels-font-size   = 10;

# -- Graph element sizes
my $vertex-size    = 6;
my $edge-thickness = 3;

# -- Theme directive text (the heredoc content is kept verbatim)
my $mmd-theme = q:to/END/;
%%{
  init: {
    'theme': 'forest',
    'themeVariables': {
      'lineColor': 'Ivory'
    }
  }
}%%
END

# -- Force parameter sets (collision / link / charge)
my %force =
    collision => %(iterations => 0, radius => 10),
    link      => %(distance => 180);
my %force2 =
    charge    => %(strength => -30, iterations => 4),
    collision => %(radius => 50, iterations => 4),
    link      => %(distance => 30);

# -- Common plot options
my %opts =
    background     => $background,
    title-color    => $title-color,
    edge-thickness => $edge-thickness,
    vertex-size    => $vertex-size;

----

## Ingestion

In [99]:
# Load the Titanic dataset through Data::Reshapers.
my @titanic = Data::Reshapers::get-titanic-dataset(headers => 'auto');
# Show a per-column summary of the records; `sink` discards the call's return value.
sink records-summary(@titanic);

+-----------------+----------------+-------------------+----------------+---------------+
| id              | passengerClass | passengerSurvival | passengerAge   | passengerSex  |
+-----------------+----------------+-------------------+----------------+---------------+
| 913     => 1    | 3rd => 709     | died     => 809   | 20      => 334 | male   => 843 |
| 754     => 1    | 1st => 323     | survived => 500   | -1      => 263 | female => 466 |
| 517     => 1    | 2nd => 277     |                   | 30      => 258 |               |
| 1128    => 1    |                |                   | 40      => 190 |               |
| 174     => 1    |                |                   | 50      => 88  |               |
| 1201    => 1    |                |                   | 60      => 57  |               |
| 634     => 1    |                |                   | 0       => 56  |               |
| (Other) => 1302 |                |                   | (Other) => 63  |               |
+-----------------+----------------+-------------------+----------------+---------------+

In [100]:
#% html
# Render 12 randomly picked records as HTML.
@titanic.pick(12) ==> to-html()

id,passengerAge,passengerSurvival,passengerSex,passengerClass
23,30,survived,male,1st
994,-1,survived,female,3rd
766,30,survived,female,3rd
178,40,survived,male,1st
184,40,survived,male,1st
91,30,survived,female,1st
661,20,survived,female,3rd
505,40,died,male,2nd
717,30,died,male,3rd
383,-1,died,female,2nd


---

## SMR matrix creation and plotting

In [None]:
# Create a recommender object from the wide-form records, using `id` as the
# item column name, and store the result of `take-M` in `$mat`.
my ML::SparseMatrixRecommender $smrObj = ML::SparseMatrixRecommender.new;

my $mat = $smrObj.create-from-wide-form(
    @titanic,
    :tag-types(Whatever),
    :item-column-name<id>,
).take-M;


In [None]:
#% js
# Plot the matrix tuples as a list plot, with width 700 and axes switched off.
js-d3-list-plot($mat.tuples, :$background, :700width, :!axes)

In [None]:
#% js
# Turn every matrix tuple into a record with the keys y, x, z and tooltip;
# the tooltip shows the row name, the column name and the tuple's last element.
my @ds3D = $mat.tuples.map(-> $tuple {
    my $tooltip = "⎡{$mat.row-names[$tuple[0]]}⎦ : ⎡{$mat.column-names[$tuple[1]]}⎦ : {$tuple.tail}";
    (<y x z tooltip>.Array Z=> [|$tuple.Array, $tooltip]).Hash
});

# Draw the records as a matrix plot.
js-d3-matrix-plot(@ds3D,
    height => 800,
    width  => 500,
    :$background,
    :$tooltip-color,
    :$tooltip-background-color,
)

In [None]:
#%js
# Options used by this matrix plot only. They are kept under their own name so
# that the `%opts` hash declared in the setup section is not redeclared.
my %matrix-plot-opts = margins => {top => 30, left => 16, right => 16, bottom => 16}, :$tick-labels-font-size, :$tick-labels-color, :$title-color, :tooltip, :$color-palette;
# Plot the `0..50` slice of `$mat` (converted with `.Array`) at width 300.
# NOTE(review): the slice presumably selects the first 51 rows — confirm.
$mat[0..50].Array ==> js-d3-matrix-plot(width=>300, |%matrix-plot-opts)

----

## Two ways of cross-tabulation

In [101]:
#% html
# Show a random sample of 12 records as HTML.
@titanic.pick(12)
==> to-html()

id,passengerSex,passengerAge,passengerClass,passengerSurvival
1207,male,0,3rd,died
150,female,40,1st,survived
1171,male,-1,3rd,died
1110,male,-1,3rd,died
241,male,40,1st,survived
1185,male,-1,3rd,died
1078,female,-1,3rd,survived
737,male,60,3rd,died
359,female,40,2nd,survived
547,female,30,2nd,survived


In [103]:
# Cross-tabulate passenger class against passenger sex and show the counts as a text table.
cross-tabulate(@titanic, 'passengerClass', 'passengerSex')
==> to-pretty-table

+-----+------+--------+
|     | male | female |
+-----+------+--------+
| 1st | 179  |  144   |
| 2nd | 171  |  106   |
| 3rd | 493  |  216   |
+-----+------+--------+

Rules of the form `(row, column) => value` cannot be used to create a sparse matrix:

In [104]:
# Flatten the id-by-sex cross-tabulation into rules of the form (id, sex) => count.
my @rules = cross-tabulate(@titanic, 'id', 'passengerSex').kv.map(-> $id, %by-sex {
    %by-sex.map(-> $entry { ($id, $entry.key) => $entry.value })
}).flat;
# Report the inferred type of the rule list.
deduce-type(@rules)

Vector(Pair(Vector(Atom((Str)), 2), Atom((Int))), 1309)

Making an "edge dataset" works (and it is implemented in both "Math::SparseMatrix" and "ML::SparseMatrixRecommender"):

In [105]:
# Turn the id-by-sex cross-tabulation into a list of edge records
# with the keys from / to / weight.
my @dataset = cross-tabulate(@titanic, 'id', 'passengerSex').kv.map(-> $id, %by-sex {
    %by-sex.map(-> $entry { %(from => $id, to => $entry.key, weight => $entry.value) })
}).map(*.Slip);
# Report the inferred type of the edge dataset.
deduce-type(@dataset)

Vector(Struct([from, to, weight], [Str, Str, Int]), 1309)

In [106]:
# Create a sparse matrix from the edge dataset, with the `directed` adverb set.
Math::SparseMatrix.new(edge-dataset => @dataset):directed

Math::SparseMatrix(:specified-elements(1309), :dimensions((1309, 2)), :density(0.5))

---

## Tries

"Deeper" cross-tabulation with tries-with-frequencies:

In [None]:
# Build a trie from the (survival, sex, class) value sequences of the records;
# `sink` discards the value of the assignment.
sink my $tr = trie-create(@titanic.map(*<passengerSurvival passengerSex passengerClass>));

# Display the trie as a tree.
trie-form($tr)

TRIEROOT => 1309
├─died => 809
│ ├─female => 127
│ │ ├─1st => 5
│ │ ├─2nd => 12
│ │ └─3rd => 110
│ └─male => 682
│   ├─1st => 118
│   ├─2nd => 146
│   └─3rd => 418
└─survived => 500
  ├─female => 339
  │ ├─1st => 139
  │ ├─2nd => 94
  │ └─3rd => 106
  └─male => 161
    ├─1st => 61
    ├─2nd => 25
    └─3rd => 75