# Ingest dataset into a sparse matrix

Anton Antonov  
RakuForPrediction at WordPress   
July, October 2025

----

## Setup

In [None]:
use Data::Reshapers;
use Math::SparseMatrix :ALL;
use Math::SparseMatrix::DOK;
use Math::SparseMatrix::Utilities;

use ML::SparseMatrixRecommender;
use ML::TriesWithFrequencies;

In [None]:
#% javascript
require.config({
     paths: {
     d3: 'https://d3js.org/d3.v7.min'
}});

require(['d3'], function(d3) {
     console.log(d3);
});

In [None]:
#% js
# Smoke test of the D3 plotting setup: a line plot of 40 random numbers in [0, 10).
js-d3-list-line-plot(10.rand xx 40, :background<none>, :stroke-width(2))

In [None]:
# Appearance settings for the plots in this notebook.
# NOTE(review): several of these are not referenced by the visible cells;
# presumably they are kept for cells or notebooks sharing this setup -- confirm.

# Colors.
my $title-color              = 'Silver';
my $stroke-color             = 'SlateGray';
my $tooltip-color            = 'LightBlue';
my $tooltip-background-color = 'none';
my $background               = '#1F1F1F';
my $color-scheme             = 'schemeTableau10';
my $color-palette            = 'Inferno';

# Tick labels.
my $tick-labels-font-size    = 10;
my $tick-labels-color        = 'Silver';
my $tick-labels-font-family  = 'Helvetica';

# Graph element sizes.
my $edge-thickness           = 3;
my $vertex-size              = 6;

# Mermaid-style init directive (theme "forest" with ivory lines).
my $mmd-theme = q:to/THEME/;
%%{
  init: {
    'theme': 'forest',
    'themeVariables': {
      'lineColor': 'Ivory'
    }
  }
}%%
THEME

# Two force-layout parameter sets.
my %force =
    collision => %(iterations => 0, radius => 10),
    link      => %(distance => 180);
my %force2 =
    charge    => %(strength => -30, iterations => 4),
    collision => %(radius => 50, iterations => 4),
    link      => %(distance => 30);

# Common named arguments, bundled so they can be slipped into plot calls.
my %opts =
    background     => $background,
    title-color    => $title-color,
    edge-thickness => $edge-thickness,
    vertex-size    => $vertex-size;

----

## Ingestion

In [21]:
# Load the Titanic dataset; the summary printed below shows its columns
# (id, passengerAge, passengerClass, passengerSex, passengerSurvival).
my @titanic = Data::Reshapers::get-titanic-dataset(:headers<auto>);

# Print the per-column summary; `sink` discards the return value so that
# only the printed table appears as the cell output.
sink records-summary(@titanic);

+----------------+----------------+-----------------+---------------+-------------------+
| passengerAge   | passengerClass | id              | passengerSex  | passengerSurvival |
+----------------+----------------+-----------------+---------------+-------------------+
| 20      => 334 | 3rd => 709     | 494     => 1    | male   => 843 | died     => 809   |
| -1      => 263 | 1st => 323     | 433     => 1    | female => 466 | survived => 500   |
| 30      => 258 | 2nd => 277     | 1240    => 1    |               |                   |
| 40      => 190 |                | 445     => 1    |               |                   |
| 50      => 88  |                | 843     => 1    |               |                   |
| 60      => 57  |                | 1051    => 1    |               |                   |
| 0       => 56  |                | 986     => 1    |               |                   |
| (Other) => 63  |                | (Other) => 1302 |               |                   |
+----------------+----------------+-----------------+---------------+-------------------+

In [22]:
#% html
# Render a random sample of 12 records as an HTML table.
@titanic.pick(12)
==> to-html()

passengerSurvival,passengerSex,passengerClass,id,passengerAge
survived,female,1st,70,-1
died,male,3rd,1069,60
survived,male,2nd,377,20
died,male,3rd,621,30
survived,male,1st,255,-1
died,male,3rd,1112,0
died,male,3rd,1151,-1
died,male,3rd,1208,10
survived,female,2nd,461,20
died,male,3rd,844,-1


---

## SMR matrix creation and plotting

In [None]:
# Create an empty recommender object (the variable stays type-constrained).
my ML::SparseMatrixRecommender $smrObj = ML::SparseMatrixRecommender.new;

# Build the recommender from the wide-form dataset, using the "id" column
# as the item column, and take its matrix with take-M.
my $mat = $smrObj
    .create-from-wide-form(@titanic, :tag-types(Whatever), :item-column-name<id>)
    .take-M;


In [None]:
#% js
# Plot the matrix tuples as points, 700 wide and without axes.
js-d3-list-plot($mat.tuples, background => $background, width => 700, axes => False)

In [None]:
#% js
# Turn every matrix tuple into a record with the keys y, x, z, and tooltip.
# The tooltip shows the row name, the column name, and the tuple's last element.
# NOTE(review): assumes each tuple is (row-index, column-index, value) -- confirm.
my @ds3D = $mat.tuples.map(-> $tuple {
    my $tooltip = "⎡{$mat.row-names[$tuple[0]]}⎦ : ⎡{$mat.column-names[$tuple[1]]}⎦ : {$tuple.tail}";
    %( <y x z tooltip> Z=> (|$tuple.Array, $tooltip) )
});

# Matrix plot of the records, with the tooltip styling from the setup section.
js-d3-matrix-plot(@ds3D,
    height                   => 800,
    width                    => 500,
    tooltip-background-color => $tooltip-background-color,
    tooltip-color            => $tooltip-color,
    background               => $background,
)

In [None]:
#%js
# Options for the dense matrix plot; this re-declares %opts for the matrix-plot call.
my %opts =
    margins               => %(top => 30, left => 16, right => 16, bottom => 16),
    tick-labels-font-size => $tick-labels-font-size,
    tick-labels-color     => $tick-labels-color,
    title-color           => $title-color,
    tooltip               => True,
    color-palette         => $color-palette;

# Plot only the slice selected by the range 0..50, converted to an array.
$mat[0..50].Array ==> js-d3-matrix-plot(:300width, |%opts)

---- 

## Long form

In [None]:
# Convert the dataset to long format, passing "id" as the second argument.
my @dsTitanicLongForm = to-long-format(@titanic, "id");

# Show the deduced type of the long-form dataset.
@dsTitanicLongForm.&deduce-type

In [None]:
#% html
# Render a random sample of 12 long-form records, with an explicit column order.
@dsTitanicLongForm.pick(12) ==> to-html(field-names => ('id', 'Variable', 'Value'))

In [None]:
# Build a recommender from the long-form dataset, naming the item,
# tag-type, and tag columns explicitly.
my $smrObjLong = ML::SparseMatrixRecommender.new
    .create-from-long-form(
        @dsTitanicLongForm,
        :item-column-name<id>,
        :tag-type-column-name<Variable>,
        :tag-column-name<Value>,
    )

----

## Two ways of cross-tabulation

In [None]:
#% html
# Another random sample of 12 records, as a reminder of the wide-form columns.
@titanic.pick(12) ==> to-html()

In [None]:
# Cross-tabulate passenger class against passenger sex and show the result as a text table.
cross-tabulate(@titanic, 'passengerClass', 'passengerSex') ==> to-pretty-table()

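The following "rules" form of the cross-tabulation (pairs keyed by `(id, tag)` tuples) cannot be used to create a sparse matrix: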

In [None]:
# Flatten the id-by-sex cross-tabulation into pairs of the form (id, sex) => count.
my @rules = cross-tabulate(@titanic, 'id', 'passengerSex')
    .kv
    .map(-> $k, %v { %v.map(-> $p { ($k, $p.key) => $p.value }) })
    .flat;

# Show the deduced type of the rules.
@rules.&deduce-type

Making an "edge dataset" works (creation from edge datasets is implemented in both "Math::SparseMatrix" and "ML::SparseMatrixRecommender"):

In [None]:
# Turn the id-by-sex cross-tabulation into a flat list of edge records
# with the keys from, to, and weight.
my @dataset = cross-tabulate(@titanic, 'id', 'passengerSex')
    .kv
    .map(-> $k, %v { %v.map(-> $p { %(from => $k, to => $p.key, weight => $p.value) }) })
    .map({ .Slip });

# Show the deduced type of the edge dataset.
@dataset.&deduce-type

In [None]:
# Create a sparse matrix from the edge dataset, with the "directed" flag set.
Math::SparseMatrix.new(edge-dataset => @dataset, :directed)

---

## Tries

"Deeper" cross-tabulation with tries-with-frequencies:

In [23]:
# Build a trie from the (survival, sex, class) values of every record;
# `sink` keeps the assignment from producing cell output.
sink my $tr = trie-create(@titanic.map({ .<passengerSurvival passengerSex passengerClass> }));

# Display the trie as a tree with the frequency at each node.
trie-form($tr)

TRIEROOT => 1309
├─died => 809
│ ├─female => 127
│ │ ├─1st => 5
│ │ ├─2nd => 12
│ │ └─3rd => 110
│ └─male => 682
│   ├─1st => 118
│   ├─2nd => 146
│   └─3rd => 418
└─survived => 500
  ├─female => 339
  │ ├─1st => 139
  │ ├─2nd => 94
  │ └─3rd => 106
  └─male => 161
    ├─1st => 61
    ├─2nd => 25
    └─3rd => 75