# Cohort features

In [1]:
# Packages (startup messages are hidden to keep the notebook output clean)
suppressPackageStartupMessages(library(tidyverse))

# Session options
# NOTE(review): warn = -1 silences every warning for the rest of the session
# (unless reset), so warnings raised by later cells will not be displayed.
options(warn = -1)

# Project script; presumably defines the summarize_* helpers used below
source("summarize.R")

# Restore the saved objects from DF.Rdata (expected to provide DF)
load("DF.Rdata")

DF %>% glimpse()

Observations: 411
Variables: 23
$ registry_id            <dbl> 20060035601, 19970063502, 20040008703, 2007008…
$ pt_stage               <fct> T1, T1, T2, T1, T1, T1, Tis, T1, T2, Ta, Ta, T…
$ sp_id                  <chr> "06-S-4802", "07-S-3788", "07-S-8931", "07-S-6…
$ ck56                   <dbl> 60, 10, 90, NA, NA, 5, NA, 0, 0, 40, NA, 40, 4…
$ ck20                   <dbl> 0, 0, 0, NA, 0, 70, NA, 80, 0, 0, NA, 0, 10, N…
$ cd44                   <dbl> 70, 60, NA, NA, NA, 40, NA, 5, NA, 60, NA, NA,…
$ gata3                  <dbl> 100, 100, 100, NA, 100, 100, NA, 100, 100, 100…
$ er                     <dbl> 40, 20, NA, NA, 0, 0, 0, 0, 0, 5, NA, NA, 0, N…
$ her2                   <dbl> 0, 5, NA, NA, 60, 30, NA, 40, 60, 60, NA, 30, …
$ uroplakin              <dbl> 5, 20, NA, NA, NA, 0, NA, 10, NA, 0, NA, NA, 0…
$ sex                    <fct> Male, Male, Male, Male, Male, Male, Female, Ma…
$ death                  <fct> Dead, Alive, Dead, Alive, Alive, Alive, Alive,…
$ progression_stage_

## Clinical and outcome features
Clinical and outcome features are analyzed at the patient level.

In [2]:
# Patient-level table: keep the clinical/outcome columns of DF and collapse
# repeated rows into one row per distinct combination of values.
# registry_id is part of the selection, so de-duplicating the plain selection
# yields the same rows as grouping by registry_id first and ungrouping after.
CLINICAL <- DF %>%
    select(
        registry_id, age_dx, sex, fu_mo,
        recurrence_any, progression_stage_any, progression_grade_any,
        death
    ) %>%
    distinct()

glimpse(CLINICAL)

Observations: 60
Variables: 8
$ registry_id           <dbl> 20060035601, 19970063502, 20040008703, 20070089…
$ age_dx                <dbl> 77, 71, 89, 59, 76, 68, 59, 71, 60, 56, 78, 89,…
$ sex                   <fct> Male, Male, Male, Male, Male, Male, Female, Mal…
$ fu_mo                 <dbl> 42.6, 36.0, 3.9, 43.5, 39.1, 16.3, 66.6, 51.5, …
$ recurrence_any        <fct> No tumor recurrence, Tumor recurrence, Tumor re…
$ progression_stage_any <fct> No stage progression, Stage progression, Stage …
$ progression_grade_any <fct> No grade progression, No grade progression, No …
$ death                 <fct> Dead, Alive, Dead, Alive, Alive, Alive, Alive, …


### Age, in years

In [3]:
# Numeric summary of age_dx over the patient-level table
summarize_num(CLINICAL, age_dx)

# A tibble: 1 x 8
      N  Mean    SD Median   IQR   Min   Max Missing
  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1    60  68.0  9.86     68  13.5    47    89       0


### Sex

In [4]:
# Frequency table of sex over the patient-level table
summarize_fct(CLINICAL, sex)

# A tibble: 2 x 3
  Levels     N  Freq
  <fct>  <int> <dbl>
1 Female    19  31.7
2 Male      41  68.3


### Follow-up, in months

In [5]:
# Numeric summary of follow-up time (fu_mo) over the patient-level table
summarize_num(CLINICAL, fu_mo)

# A tibble: 1 x 8
      N  Mean    SD Median   IQR   Min   Max Missing
  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1    60  42.7  46.9   39.4  35.7   2.1  275.       0


### Tumor recurrence at any biopsy

In [6]:
# Frequency table of recurrence_any over the patient-level table
summarize_fct(CLINICAL, recurrence_any)

# A tibble: 2 x 3
  Levels                  N  Freq
  <fct>               <int> <dbl>
1 Tumor recurrence       52  86.7
2 No tumor recurrence     8  13.3


### Tumor grade progression at any biopsy

In [7]:
# Frequency table of progression_grade_any over the patient-level table
summarize_fct(CLINICAL, progression_grade_any)

# A tibble: 2 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Grade progression        5   8.3
2 No grade progression    55  91.7


### Tumor stage progression at any biopsy

In [8]:
# Frequency table of progression_stage_any over the patient-level table
summarize_fct(CLINICAL, progression_stage_any)

# A tibble: 2 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Stage progression        6    10
2 No stage progression    54    90


### Overall mortality

In [9]:
# Frequency table of death (vital status) over the patient-level table
summarize_fct(CLINICAL, death)

# A tibble: 3 x 3
  Levels     N  Freq
  <fct>  <int> <dbl>
1 Alive     48    80
2 Dead       6    10
3 <NA>       6    10


## Pathologic features
This section describes the pathologic features of the cases included in the dataset. For the histologic diagnosis, "CIS" includes carcinoma in situ and dysplasia, while "LG" and "HG" denote low-grade and high-grade noninvasive papillary urothelial carcinoma, respectively.

In [10]:
# Specimen-level table: keep the pathologic columns of DF and collapse
# repeated rows into one row per distinct combination of values.
# sp_id is part of the selection, so de-duplicating the plain selection
# yields the same rows as grouping by sp_id first and ungrouping after.
PATHOLOGIC <- DF %>%
    select(
        sp_id, histo_dx, pt_stage,
        recurrence_next, progression_grade_next, progression_stage_next
    ) %>%
    distinct()

glimpse(PATHOLOGIC)

Observations: 193
Variables: 6
$ sp_id                  <chr> "06-S-4802", "07-S-3788", "07-S-8931", "07-S-6…
$ histo_dx               <fct> HG, HG, HG, HG, HG, HG, CIS, HG, Invasive, HG,…
$ pt_stage               <fct> T1, T1, T2, T1, T1, T1, Tis, T1, T2, Ta, Ta, T…
$ recurrence_next        <fct> No tumor recurrence, Tumor recurrence, No tumo…
$ progression_grade_next <fct> No grade progression, No grade progression, No…
$ progression_stage_next <fct> No stage progression, No stage progression, No…


### Histologic diagnosis

In [11]:
# Frequency table of histo_dx over the specimen-level table
summarize_fct(PATHOLOGIC, histo_dx)

# A tibble: 4 x 3
  Levels       N  Freq
  <fct>    <int> <dbl>
1 CIS         13   6.7
2 LG          60  31.1
3 HG          79  40.9
4 Invasive    41  21.2


### pT stage

In [12]:
# Frequency table of pt_stage over the specimen-level table
summarize_fct(PATHOLOGIC, pt_stage)

# A tibble: 5 x 3
  Levels     N  Freq
  <fct>  <int> <dbl>
1 Tis        9   4.7
2 Ta       102  52.8
3 T1        66  34.2
4 T2        10   5.2
5 <NA>       6   3.1


### Tumor recurrence at next biopsy

In [13]:
# Frequency table of recurrence_next over the specimen-level table
summarize_fct(PATHOLOGIC, recurrence_next)

# A tibble: 3 x 3
  Levels                  N  Freq
  <fct>               <int> <dbl>
1 Tumor recurrence      102  52.8
2 No tumor recurrence    68  35.2
3 <NA>                   23  11.9


### Tumor grade progression at next biopsy

In [14]:
# Frequency table of progression_grade_next over the specimen-level table
summarize_fct(PATHOLOGIC, progression_grade_next)

# A tibble: 3 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Grade progression        6   3.1
2 No grade progression   170  88.1
3 <NA>                    17   8.8


### Tumor stage progression at next biopsy

In [15]:
# Frequency table of progression_stage_next over the specimen-level table
summarize_fct(PATHOLOGIC, progression_stage_next)

# A tibble: 3 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Stage progression        9   4.7
2 No stage progression   160  82.9
3 <NA>                    24  12.4


## Biomarker features
Biomarker features were established at the TMA level.

### Biomarker distribution

In [16]:
# Reshape the seven biomarker columns (ck56 through uroplakin) to long format,
# one row per measurement, then summarize and compare expression by biomarker.
# pivot_longer() replaces the superseded gather(); it needs tidyr >= 1.0.0.
# Missing values are kept (values_drop_na defaults to FALSE), matching the
# gather() default, so the N and Missing counts are computed on the same rows.
# NOTE(review): rows come out interleaved by observation rather than stacked
# by column; the grouped statistics and rank-based tests printed below do not
# depend on row order.
DF %>%
    select(ck56:uroplakin) %>%
    pivot_longer(
        cols = everything(),
        names_to = "Biomarker",
        values_to = "Expression"
    ) %>%
    summarize_nums(Expression, Biomarker)

# A tibble: 7 x 9
  Levels        N  Mean    SD Median   IQR   Min   Max Missing
  <fct>     <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 cd44        411 51.5  33.6      60  60       0   100     106
2 ck20        411 28.8  35.7       5  62.5     0   100      75
3 ck56        411 28.1  29.2      20  35       0   100      76
4 er          411  1.66  6.79      0   0       0    60      74
5 gata3       411 99.1   5.05    100   0      50   100      73
6 her2        411 40    36.0      30  65       0   100      69
7 uroplakin   411 15.3  24.6       5  20       0   100      96

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 1260, df = 6, p-value < 2.2e-16


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

          cd44    ck20    ck56    er      gata3   her2   
ck20      < 2e-16 -       -       -       -       -      
ck56      2.9e-16 0.01987 -       -       -       -      
er        < 2e-16 < 2e-16 < 2e-16 -       -       -      
gata

### Biomarker and histology
#### CK5/6

In [17]:
# CK5/6 expression (ck56) summarized and compared across histologic
# diagnoses (histo_dx). The columns are passed directly instead of being
# aliased first; the printed output does not show the column names.
DF %>%
    summarize_nums(ck56, histo_dx)

# A tibble: 4 x 9
  Levels       N  Mean    SD Median   IQR   Min   Max Missing
  <fct>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 CIS         21  30.4  33.7     15  35       0    90       9
2 LG         114  36.6  31.1     30  57.5     0   100      20
3 HG         168  24.7  27.0     10  35       0   100      33
4 Invasive   108  24.1  28.6     10  25       0   100      14

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 11.225, df = 3, p-value = 0.01057


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

         CIS   LG    HG   
LG       1.000 -     -    
HG       1.000 0.030 -    
Invasive 1.000 0.016 1.000

P value adjustment method: bonferroni 


#### CD44

In [18]:
# CD44 expression (cd44) summarized and compared across histologic
# diagnoses (histo_dx). The columns are passed directly instead of being
# aliased first; the printed output does not show the column names.
DF %>%
    summarize_nums(cd44, histo_dx)

# A tibble: 4 x 9
  Levels       N  Mean    SD Median   IQR   Min   Max Missing
  <fct>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 CIS         21  71.4  25.1     80    40    20   100       7
2 LG         114  61.3  31.2     70    60     0   100      30
3 HG         168  45.0  33.9     40    70     0   100      45
4 Invasive   108  47.7  33.7     45    60     0   100      24

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 16.759, df = 3, p-value = 0.000792


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

         CIS    LG     HG    
LG       1.0000 -      -     
HG       0.0370 0.0064 -     
Invasive 0.0886 0.0699 1.0000

P value adjustment method: bonferroni 


#### CK20

In [19]:
# CK20 expression (ck20) summarized and compared across histologic
# diagnoses (histo_dx). The columns are passed directly instead of being
# aliased first; the printed output does not show the column names.
DF %>%
    summarize_nums(ck20, histo_dx)

# A tibble: 4 x 9
  Levels       N  Mean    SD Median   IQR   Min   Max Missing
  <fct>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 CIS         21  36.5  42.9     10    85     0   100       8
2 LG         114  22.5  32.4      0    30     0   100      21
3 HG         168  31.1  36.1     10    70     0   100      32
4 Invasive   108  30.5  36.9      5    70     0   100      14

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 5.406, df = 3, p-value = 0.1444


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

         CIS  LG   HG  
LG       0.62 -    -   
HG       1.00 0.25 -   
Invasive 1.00 1.00 1.00

P value adjustment method: bonferroni 


#### GATA3

In [20]:
# GATA3 expression (gata3) summarized and compared across histologic
# diagnoses (histo_dx). The columns are passed directly instead of being
# aliased first; the printed output does not show the column names.
DF %>%
    summarize_nums(gata3, histo_dx)

# A tibble: 4 x 9
  Levels       N  Mean    SD Median   IQR   Min   Max Missing
  <fct>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 CIS         21 100    0       100     0   100   100       8
2 LG         114  98.9  6.10    100     0    50   100      19
3 HG         168  99.1  4.03    100     0    70   100      29
4 Invasive   108  99.0  5.60    100     0    60   100      17

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 1.002, df = 3, p-value = 0.8008


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

         CIS LG HG
LG       1   -  - 
HG       1   1  - 
Invasive 1   1  1 

P value adjustment method: bonferroni 


#### ER

In [21]:
# ER expression (er) summarized and compared across histologic
# diagnoses (histo_dx). The columns are passed directly instead of being
# aliased first; the printed output does not show the column names.
DF %>%
    summarize_nums(er, histo_dx)

# A tibble: 4 x 9
  Levels       N  Mean    SD Median   IQR   Min   Max Missing
  <fct>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 CIS         21 1.43   5.35      0     0     0    20       7
2 LG         114 0.532  2.48      0     0     0    20      20
3 HG         168 1.23   5.33      0     0     0    40      30
4 Invasive   108 3.52  10.6       0     0     0    60      17

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 4.7291, df = 3, p-value = 0.1927


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

         CIS  LG   HG  
LG       1.00 -    -   
HG       1.00 1.00 -   
Invasive 1.00 0.23 1.00

P value adjustment method: bonferroni 


#### HER2

In [22]:
# HER2 expression (her2) summarized and compared across histologic
# diagnoses (histo_dx). The columns are passed directly instead of being
# aliased first; the printed output does not show the column names.
DF %>%
    summarize_nums(her2, histo_dx)

# A tibble: 4 x 9
  Levels       N  Mean    SD Median   IQR   Min   Max Missing
  <fct>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 CIS         21  51.2  48.4     55   100     0   100       9
2 LG         114  35.8  36.4     30    70     0   100      18
3 HG         168  38.2  34.2     30    55     0   100      28
4 Invasive   108  45.4  36.2     40    70     0   100      14

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 5.1666, df = 3, p-value = 0.16


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

         CIS  LG   HG  
LG       1.00 -    -   
HG       1.00 1.00 -   
Invasive 1.00 0.26 0.76

P value adjustment method: bonferroni 


#### Uroplakin

In [23]:
# Uroplakin expression (uroplakin) summarized and compared across histologic
# diagnoses (histo_dx). The columns are passed directly instead of being
# aliased first; the printed output does not show the column names.
DF %>%
    summarize_nums(uroplakin, histo_dx)

# A tibble: 4 x 9
  Levels       N  Mean    SD Median   IQR   Min   Max Missing
  <fct>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 CIS         21  33.3  43.9    7.5    90     0   100       9
2 LG         114  12.6  17.8    5      20     0    70      27
3 HG         168  14.3  25.1    5      10     0   100      34
4 Invasive   108  17.3  25.6   10      20     0   100      26

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 4.2708, df = 3, p-value = 0.2337


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

         CIS  LG   HG  
LG       1.00 -    -   
HG       1.00 1.00 -   
Invasive 1.00 1.00 0.48

P value adjustment method: bonferroni 


### Biomarker and stage
#### CK5/6

In [24]:
# CK5/6 expression (ck56) summarized and compared across pT stages
# (pt_stage), passing the columns directly instead of aliasing them first.
# NOTE(review): rows with a missing pt_stage are tabulated as <NA>, but per
# the output below (df = 3) they are not part of the tests.
DF %>%
    summarize_nums(ck56, pt_stage)

# A tibble: 5 x 9
  Levels     N  Mean    SD Median   IQR   Min   Max Missing
  <fct>  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 Tis       15  25    28.7     20  25       0    90       7
2 Ta       203  30.6  29.2     20  42.5     0   100      41
3 T1       161  23.1  27.7     10  25       0   100      22
4 T2        21  31.5  33.9     20  43.8     0    90       1
5 <NA>      11  68.3  11.7     65  10      60    90       5

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 8.1625, df = 3, p-value = 0.04277


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

   Tis   Ta    T1   
Ta 1.000 -     -    
T1 1.000 0.025 -    
T2 1.000 1.000 1.000

P value adjustment method: bonferroni 


#### CD44

In [25]:
# CD44 expression (cd44) summarized and compared across pT stages
# (pt_stage), passing the columns directly instead of aliasing them first.
# NOTE(review): rows with a missing pt_stage are tabulated as <NA>, but per
# the output below (df = 3) they are not part of the tests.
DF %>%
    summarize_nums(cd44, pt_stage)

# A tibble: 5 x 9
  Levels     N  Mean    SD Median   IQR   Min   Max Missing
  <fct>  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 Tis       15  60    35.7     80  50       0   100       6
2 Ta       203  54.8  33.3     60  70       0   100      53
3 T1       161  46.8  33.7     40  70       0   100      36
4 T2        21  40.8  30.1     30  40       5    90       8
5 <NA>      11  71.2  30.9     85  27.5    10   100       3

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 6.0429, df = 3, p-value = 0.1095


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

   Tis  Ta   T1  
Ta 1.00 -    -   
T1 1.00 0.26 -   
T2 1.00 0.83 1.00

P value adjustment method: bonferroni 


#### CK20

In [26]:
# CK20 expression (ck20) summarized and compared across pT stages
# (pt_stage), passing the columns directly instead of aliasing them first.
# NOTE(review): rows with a missing pt_stage are tabulated as <NA>, but per
# the output below (df = 3) they are not part of the tests.
DF %>%
    summarize_nums(ck20, pt_stage)

# A tibble: 5 x 9
  Levels     N  Mean    SD Median   IQR   Min   Max Missing
  <fct>  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 Tis       15 26.2   45.6    2.5  28.8     0   100       7
2 Ta       203 26.1   32.9    5    50       0    90      42
3 T1       161 33.0   37.5   10    70       0   100      20
4 T2        21 28.5   43.0    0    75       0   100       1
5 <NA>      11  6.67  12.1    0     7.5     0    30       5

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 2.6592, df = 3, p-value = 0.4472


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

   Tis  Ta   T1  
Ta 1.00 -    -   
T1 1.00 0.87 -   
T2 1.00 1.00 1.00

P value adjustment method: bonferroni 


#### GATA3

In [27]:
# GATA3 expression (gata3) summarized and compared across pT stages
# (pt_stage), passing the columns directly instead of aliasing them first.
# NOTE(review): rows with a missing pt_stage are tabulated as <NA>, but per
# the output below (df = 3) they are not part of the tests.
DF %>%
    summarize_nums(gata3, pt_stage)

# A tibble: 5 x 9
  Levels     N  Mean    SD Median   IQR   Min   Max Missing
  <fct>  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 Tis       15 100    0       100     0   100   100       7
2 Ta       203  99.2  5.13    100     0    50   100      37
3 T1       161  98.7  5.50    100     0    60   100      22
4 T2        21 100    0       100     0   100   100       3
5 <NA>      11 100    0       100     0   100   100       4

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 3.5226, df = 3, p-value = 0.3178


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

   Tis  Ta   T1  
Ta 1.00 -    -   
T1 1.00 0.82 -   
T2 -    1.00 1.00

P value adjustment method: bonferroni 


#### ER

In [28]:
# ER expression (er) summarized and compared across pT stages
# (pt_stage), passing the columns directly instead of aliasing them first.
# NOTE(review): rows with a missing pt_stage are tabulated as <NA>, but per
# the output below (df = 3) they are not part of the tests.
DF %>%
    summarize_nums(er, pt_stage)

# A tibble: 5 x 9
  Levels     N  Mean    SD Median   IQR   Min   Max Missing
  <fct>  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 Tis       15 2.22   6.67      0     0     0    20       6
2 Ta       203 0.697  3.57      0     0     0    40      38
3 T1       161 2.61   8.95      0     0     0    60      21
4 T2        21 2.5   10         0     0     0    40       5
5 <NA>      11 2.86   3.93      0     5     0    10       4

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 3.1836, df = 3, p-value = 0.3642


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

   Tis  Ta   T1  
Ta 1.00 -    -   
T1 1.00 0.51 -   
T2 1.00 1.00 1.00

P value adjustment method: bonferroni 


#### HER2

In [29]:
# HER2 expression (her2) summarized and compared across pT stages
# (pt_stage), passing the columns directly instead of aliasing them first.
# NOTE(review): rows with a missing pt_stage are tabulated as <NA>, but per
# the output below (df = 3) they are not part of the tests.
DF %>%
    summarize_nums(her2, pt_stage)

# A tibble: 5 x 9
  Levels     N  Mean    SD Median   IQR   Min   Max Missing
  <fct>  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 Tis       15  32.9  47.2      0  65       0   100       8
2 Ta       203  35.9  34.0     30  65       0   100      36
3 T1       161  44.3  37.1     40  75       0   100      16
4 T2        21  59.1  36.9     65  60       0   100       5
5 <NA>      11  11.4  16.5      5  17.5     0    40       4

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 8.8682, df = 3, p-value = 0.0311


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

   Tis  Ta   T1  
Ta 1.00 -    -   
T1 1.00 0.24 -   
T2 1.00 0.10 0.91

P value adjustment method: bonferroni 


#### Uroplakin

In [30]:
# Uroplakin expression (uroplakin) summarized and compared across pT stages
# (pt_stage), passing the columns directly instead of aliasing them first.
# NOTE(review): rows with a missing pt_stage are tabulated as <NA>, but per
# the output below (df = 3) they are not part of the tests.
DF %>%
    summarize_nums(uroplakin, pt_stage)

# A tibble: 5 x 9
  Levels     N  Mean    SD Median   IQR   Min   Max Missing
  <fct>  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 Tis       15 14.4   30.9    2.5 10        0    90       7
2 Ta       203 12.3   19.2    5   10        0    90      46
3 T1       161 20     30.4    5   20        0   100      34
4 T2        21 12.3   18.0    5   15        0    60       6
5 <NA>      11  6.25  10.3    2.5  6.25     0    30       3

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 2.0016, df = 3, p-value = 0.5721


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

   Tis Ta T1
Ta 1   -  - 
T1 1   1  - 
T2 1   1  1 

P value adjustment method: bonferroni 
