protoClassification

Install the development version of protoClassification from GitHub with:

# install.packages("pak")
pak::pak("acastroaraujo/protoClassification")

Get Started

To simulate a dataset you need to create to decide a couple of things first.

The number of $K$ dimensions.
The marginal probabilities for each dimension.
A correlation matrix for the dimensions.

library(protoClassification)
set.seed(1)
K <- 6 # 1st step
marginals <- rbeta(K, 2, 2) # 2nd step
rho <- rlkjcorr(1, K, eta = 1) # 3rd step

Generate data.

set.seed(1)
sim_data <- make_binary_data(marginals, rho, obs = 1e3)
sim_data
#> 
#> ── Data ──
#> 
#> 1000 obs. of  6 variables:
#>  $ x1: int  0 1 0 1 0 1 0 1 0 1 ...
#>  $ x2: int  0 1 0 1 1 1 0 1 1 1 ...
#>  $ x3: int  0 1 1 0 1 0 0 0 1 0 ...
#>  $ x4: int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ x5: int  1 1 1 1 0 0 1 1 1 1 ...
#>  $ x6: int  0 1 0 0 1 0 0 1 0 1 ...
#> 
#> ── Parameters ──
#> 
#> ── Marginal Probabilities:
#>   x1   x2   x3   x4   x5   x6 
#> 0.33 0.55 0.27 0.88 0.59 0.28
#> 
#> 
#> ── Correlation Matrix:
#>       x1    x2    x3    x4    x5    x6
#> x1  1.00  0.22  0.29  0.15 -0.02  0.36
#> x2  0.22  1.00 -0.08 -0.21 -0.01  0.37
#> x3  0.29 -0.08  1.00 -0.75  0.34  0.08
#> x4  0.15 -0.21 -0.75  1.00 -0.20 -0.34
#> x5 -0.02 -0.01  0.34 -0.20  1.00 -0.03
#> x6  0.36  0.37  0.08 -0.34 -0.03  1.00

Note. The parameters are stored as the params attribute in the output.

We can verify that the column means roughly correspond to the marginal probabilities.

colMeans(sim_data)
#>    x1    x2    x3    x4    x5    x6 
#> 0.326 0.554 0.274 0.899 0.588 0.275

In order to verify that the data follows the correlation structure in rho you would have to calculate a “tetrachoric correlation.”

psych::tetrachoric(sim_data)$rho
#>             x1          x2          x3         x4          x5          x6
#> x1  1.00000000  0.23375492  0.28118697  0.1442114 -0.01192531  0.40365755
#> x2  0.23375492  1.00000000 -0.13315127 -0.1448709  0.03404136  0.38467926
#> x3  0.28118697 -0.13315127  1.00000000 -0.7612102  0.40485490  0.07792155
#> x4  0.14421140 -0.14487091 -0.76121019  1.0000000 -0.31759978 -0.34664697
#> x5 -0.01192531  0.03404136  0.40485490 -0.3175998  1.00000000 -0.08953172
#> x6  0.40365755  0.38467926  0.07792155 -0.3466470 -0.08953172  1.00000000

Additional stuff for Prototype Classification Model:

w a vector of attention weights for each k
P a list of prototypes, one per category.
g (gamma) sensitivity parameter.

set.seed(1)
w <- runif(K)
w <- w / sum(w)
g <- 10

Calculate distance and similarity for one prototype at a time:

d <- calculateDistSim(
  data = sim_data,
  P = rep(1, K),
  w = w,
  g = g
)

str(d)
#> 'data.frame':    1000 obs. of  2 variables:
#>  $ distance  : num  0.655 0 0.477 0.457 0.145 ...
#>  $ similarity: num  0.00143 1 0.00846 0.01035 0.23423 ...

Calculate distance, similarity, and probabilities for multiple prototypes at the same time:

prototypes <- list(
  P1 = rep(1, K),
  P2 = rep(0, K),
  P3 = rep(1:0, K / 2)
)

g <- rep(10, 3)

out <- compute(sim_data, prototypes, w, g)
out
#> 
#> ── Output ──
#> 
#>  $ distance      1000 obs. of  3 variables
#>  $ similarity    1000 obs. of  3 variables
#>  $ probabilities 1000 obs. of  3 variables
#>  $ data          1000 obs. of  6 variables
#> 
#> ── Prototypes ──
#> 
#>  $ P1: num [1:6] 1 1 1 1 1 1
#>  $ P2: num [1:6] 0 0 0 0 0 0
#>  $ P3: int [1:6] 1 0 1 0 1 0
#> 
#> ── Distance ──
#> 
#> Manhattan (r = 1)
#> 
#> ── Sensitivity ──
#> 
#> g1 g2 g3 
#> 10 10 10
#> 
#> ── Attention Weights ──
#> 
#>    w1    w2    w3    w4    w5    w6 
#> 0.082 0.116 0.178 0.282 0.063 0.279
#> 
#> ── Marginal Probabilities ──
#> 
#> ── `colMeans(.$data)`
#>    x1    x2    x3    x4    x5    x6 
#> 0.326 0.554 0.274 0.899 0.588 0.275
#> 
#> ── `colMeans(.$probabilities)`
#>        C1        C2        C3 
#> 0.3883449 0.4360664 0.1755887

consolidate() the previous output into a single data frame for easier visualization.

d <- consolidate(out)
str(d)
#> 'data.frame':    1000 obs. of  15 variables:
#>  $ prob1: num  0.0379 0.9988 0.212 0.5783 0.998 ...
#>  $ prob2: num  8.45e-01 4.53e-05 1.34e-01 2.45e-01 8.26e-04 ...
#>  $ prob3: num  0.11692 0.00115 0.65354 0.17654 0.00115 ...
#>  $ sim1 : num  0.00143 1 0.00846 0.01035 0.23423 ...
#>  $ sim2 : num  3.18e-02 4.54e-05 5.36e-03 4.39e-03 1.94e-04 ...
#>  $ sim3 : num  0.0044 0.001149 0.026083 0.003159 0.000269 ...
#>  $ dist1: num  0.655 0 0.477 0.457 0.145 ...
#>  $ dist2: num  0.345 1 0.523 0.543 0.855 ...
#>  $ dist3: num  0.543 0.677 0.365 0.576 0.822 ...
#>  $ x1   : int  0 1 0 1 0 1 0 1 0 1 ...
#>  $ x2   : int  0 1 0 1 1 1 0 1 1 1 ...
#>  $ x3   : int  0 1 1 0 1 0 0 0 1 0 ...
#>  $ x4   : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ x5   : int  1 1 1 1 0 0 1 1 1 1 ...
#>  $ x6   : int  0 1 0 0 1 0 0 1 0 1 ...

Note. Since only binary data is implemented, there is no difference between Manhattan and Euclidean distance!

Marginal and Conditional Probabilities

So far, a single simulation requires the marginal probabilities for each element of $\mathbf{x}$ to be specified at the outset.

colMeans(out$data) # cf. `marginals` argument in `make_binary_data()`
#>    x1    x2    x3    x4    x5    x6 
#> 0.326 0.554 0.274 0.899 0.588 0.275

The more relevant piece of information we get from the compute() function is the .$probabilities object, which calculates the probability that any given individual in our simulated dataset will belong to each of the prototype categories.

This allows us to calculate the marginal probabilities for each category.

colMeans(out$probabilities)
#>        C1        C2        C3 
#> 0.3883449 0.4360664 0.1755887

With some ingenuity, we can use this information to get conditional probabilities too.

$$ \Pr(X_k = 1 \mid C = c) $$

conditionalProbs(out, "features")
#>           x1        x2         x3        x4        x5         x6
#> C1 0.5226571 0.7802353 0.32826487 0.9332301 0.6171392 0.62588499
#> C2 0.1302299 0.4297247 0.05791891 0.9465871 0.4820181 0.03806980
#> C3 0.3770770 0.3621551 0.69020964 0.7052639 0.7862111 0.08786438

$$ \Pr(X_k = 0 \mid C = c) $$

1 - conditionalProbs(out, "features")
#>           x1        x2        x3         x4        x5        x6
#> C1 0.4769570 0.2200126 0.6714007 0.06717440 0.3829206 0.3734904
#> C2 0.8703417 0.5706125 0.9425701 0.05316343 0.5189671 0.9620408
#> C3 0.6227137 0.6362298 0.3103042 0.29413893 0.2117976 0.9130266

$$ \Pr(C = c \mid X_k) $$

conditionalProbs(out, type = "categories")
#> $`Xk=0`
#>           C1        C2         C3
#> x1 0.2743383 0.5632908 0.16237092
#> x2 0.1926099 0.5577848 0.24960538
#> x3 0.3591157 0.5664215 0.07446281
#> x4 0.2566337 0.2286733 0.51469307
#> x5 0.3608689 0.5489563 0.09017476
#> x6 0.2001655 0.5788414 0.22099310
#> 
#> $`Xk=1`
#>           C1         C2         C3
#> x1 0.6236135 0.17344785 0.20293865
#> x2 0.5456643 0.33832130 0.11601444
#> x3 0.4652701 0.09116788 0.44356204
#> x4 0.4029833 0.45951724 0.13749944
#> x5 0.4073537 0.35719728 0.23544898
#> x6 0.8839345 0.06015273 0.05591273

Alternatively, it’s easier to use the summary() function to extract all conditional and marginal probabilities.

probs <- summary(out)
probs
#> 
#> ── Categories ──
#> 
#> ── Marginals:
#>    C1    C2    C3 
#> 0.388 0.436 0.176
#> 
#> ── Conditionals:
#> $`Xk=0`
#>       C1    C2    C3
#> x1 0.276 0.562 0.162
#> x2 0.193 0.558 0.249
#> x3 0.360 0.565 0.075
#> x4 0.258 0.231 0.511
#> x5 0.362 0.548 0.091
#> x6 0.202 0.578 0.220
#> 
#> $`Xk=1`
#>       C1    C2    C3
#> x1 0.624 0.173 0.202
#> x2 0.548 0.337 0.116
#> x3 0.467 0.092 0.441
#> x4 0.404 0.458 0.137
#> x5 0.409 0.357 0.234
#> x6 0.884 0.060 0.056
#> 
#> ── Features ──
#> 
#> ── Marginals:
#>    x1    x2    x3    x4    x5    x6 
#> 0.326 0.554 0.274 0.899 0.588 0.275
#> 
#> ── Conditionals:
#>       x1    x2    x3    x4    x5    x6
#> C1 0.523 0.779 0.329 0.933 0.618 0.625
#> C2 0.129 0.429 0.057 0.946 0.480 0.038
#> C3 0.378 0.364 0.691 0.705 0.788 0.088

Name		Name	Last commit message	Last commit date
Latest commit History 26 Commits
.github		.github
R		R
man		man
tests		tests
.Rbuildignore		.Rbuildignore
.Rprofile		.Rprofile
.gitignore		.gitignore
DESCRIPTION		DESCRIPTION
LICENSE		LICENSE
LICENSE.md		LICENSE.md
NAMESPACE		NAMESPACE
README.Rmd		README.Rmd
README.md		README.md
_pkgdown.yml		_pkgdown.yml

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

protoClassification

Get Started

Marginal and Conditional Probabilities

About

Licenses found

Uh oh!

Releases

Packages

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

protoClassification

Get Started

Marginal and Conditional Probabilities

About

Resources

License

Licenses found

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Uh oh!

Contributors

Uh oh!

Languages

Packages