# Cohort features

In [1]:
# Libraries
# Attach the tidyverse without printing its startup messages.
suppressPackageStartupMessages(library(tidyverse))

# Global
# NOTE(review): a negative `warn` makes R ignore every warning for the rest of
# the session, including warnings raised by later cells -- confirm this is
# intended rather than handling specific warnings where they occur.
options(warn = -1)

# Scripts
# Presumably defines the summarize_* helpers called in the cells below --
# verify against summarize.R.
source("summarize.R")

# Data
# load() restores the objects stored in DF.Rdata into the workspace; the rest
# of the notebook expects a data frame named DF to be among them.
load("DF.Rdata")

glimpse(DF)

Observations: 411
Variables: 23
$ registry_id            <dbl> 20060035601, 19970063502, 20040008703, 2007008…
$ pt_stage               <fct> T1, T1, T2, T1, T1, T1, Tis, T1, T2, Ta, Ta, T…
$ sp_id                  <chr> "06-S-4802", "07-S-3788", "07-S-8931", "07-S-6…
$ ck56                   <dbl> 60, 10, 90, NA, NA, 5, NA, 0, 0, 40, NA, 40, 4…
$ ck20                   <dbl> 0, 0, 0, NA, 0, 70, NA, 80, 0, 0, NA, 0, 10, N…
$ cd44                   <dbl> 70, 60, NA, NA, NA, 40, NA, 5, NA, 60, NA, NA,…
$ gata3                  <dbl> 100, 100, 100, NA, 100, 100, NA, 100, 100, 100…
$ er                     <dbl> 40, 20, NA, NA, 0, 0, 0, 0, 0, 5, NA, NA, 0, N…
$ her2                   <dbl> 0, 5, NA, NA, 60, 30, NA, 40, 60, 60, NA, 30, …
$ uroplakin              <dbl> 5, 20, NA, NA, NA, 0, NA, 10, NA, 0, NA, NA, 0…
$ sex                    <fct> Male, Male, Male, Male, Male, Male, Female, Ma…
$ death                  <fct> Dead, Alive, Dead, Alive, Alive, Alive, Alive,…
$ progression_stage_

## Clinical and outcome features
Clinical and outcome features are analyzed at the patient level.

In [2]:
# Tidying up the clinical data
# Keep only the patient-level columns and collapse repeated rows so that each
# distinct combination of values appears once.
# No group_by()/ungroup() is needed: registry_id is selected explicitly and
# distinct() without arguments compares whole rows, so grouping was a no-op.
# NOTE(review): this assumes the clinical values are constant within a
# registry_id; a patient with conflicting values would yield several rows.
CLINICAL <- DF %>%
    select(
        registry_id,
        age_dx,
        sex,
        fu_mo,
        recurrence_any,
        progression_stage_any,
        progression_grade_any,
        death
    ) %>%
    distinct()

glimpse(CLINICAL)

Observations: 60
Variables: 8
$ registry_id           <dbl> 20060035601, 19970063502, 20040008703, 20070089…
$ age_dx                <dbl> 77, 71, 89, 59, 76, 68, 59, 71, 60, 56, 78, 89,…
$ sex                   <fct> Male, Male, Male, Male, Male, Male, Female, Mal…
$ fu_mo                 <dbl> 42.6, 36.0, 3.9, 43.5, 39.1, 16.3, 66.6, 51.5, …
$ recurrence_any        <fct> No tumor recurrence, Tumor recurrence, Tumor re…
$ progression_stage_any <fct> No stage progression, Stage progression, Stage …
$ progression_grade_any <fct> No grade progression, No grade progression, No …
$ death                 <fct> Dead, Alive, Dead, Alive, Alive, Alive, Alive, …


### Age, in years

In [3]:
# Numeric summary of age_dx (age in years, per the section heading) over CLINICAL.
# summarize_num() is not defined in this file; presumably it comes from summarize.R.
CLINICAL %>% summarize_num(age_dx)

# A tibble: 1 x 8
      N  Mean    SD Median   IQR   Min   Max Missing
  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1    60  68.0  9.86     68  13.5    47    89       0


### Sex

In [4]:
# Tabulate the levels of the factor `sex` over CLINICAL.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
CLINICAL %>% summarize_fct(sex)

# A tibble: 2 x 3
  Levels     N  Freq
  <fct>  <int> <dbl>
1 Female    19  31.7
2 Male      41  68.3


### Follow-up, in months

In [5]:
# Numeric summary of fu_mo (follow-up in months, per the section heading) over CLINICAL.
# summarize_num() is not defined in this file; presumably it comes from summarize.R.
CLINICAL %>% summarize_num(fu_mo)

# A tibble: 1 x 8
      N  Mean    SD Median   IQR   Min   Max Missing
  <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1    60  42.7  46.9   39.4  35.7   2.1  275.       0


### Tumor recurrence at any biopsy

In [6]:
# Tabulate the levels of `recurrence_any` over CLINICAL.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
CLINICAL %>% summarize_fct(recurrence_any)

# A tibble: 2 x 3
  Levels                  N  Freq
  <fct>               <int> <dbl>
1 Tumor recurrence       52  86.7
2 No tumor recurrence     8  13.3


### Tumor grade progression at any biopsy

In [7]:
# Tabulate the levels of `progression_grade_any` over CLINICAL.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
CLINICAL %>% summarize_fct(progression_grade_any)

# A tibble: 2 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Grade progression        5   8.3
2 No grade progression    55  91.7


### Tumor stage progression at any biopsy

In [8]:
# Tabulate the levels of `progression_stage_any` over CLINICAL.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
CLINICAL %>% summarize_fct(progression_stage_any)

# A tibble: 2 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Stage progression        6    10
2 No stage progression    54    90


### Overall mortality

In [9]:
# Tabulate the levels of `death` over CLINICAL.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
CLINICAL %>% summarize_fct(death)

# A tibble: 3 x 3
  Levels     N  Freq
  <fct>  <int> <dbl>
1 Alive     48    80
2 Dead       6    10
3 <NA>       6    10


## Pathologic features
This section summarizes the pathologic features of the cases included in the dataset. For the histologic diagnosis, "CIS" includes carcinoma in situ and dysplasia, while "LG" and "HG" denote low-grade and high-grade noninvasive papillary urothelial carcinoma, respectively.

In [10]:
# Tidying up the pathologic data
# Keep only the specimen-level columns and collapse repeated rows so that each
# distinct combination of values appears once.
# No group_by()/ungroup() is needed: sp_id is selected explicitly and
# distinct() without arguments compares whole rows, so grouping was a no-op.
# NOTE(review): this assumes the pathologic values are constant within an
# sp_id; a specimen with conflicting values would yield several rows.
PATHOLOGIC <- DF %>%
    select(
        sp_id,
        histo_dx,
        pt_stage,
        recurrence_next,
        progression_grade_next,
        progression_stage_next
    ) %>%
    distinct()

glimpse(PATHOLOGIC)

Observations: 193
Variables: 6
$ sp_id                  <chr> "06-S-4802", "07-S-3788", "07-S-8931", "07-S-6…
$ histo_dx               <fct> HG, HG, HG, HG, HG, HG, CIS, HG, Invasive, HG,…
$ pt_stage               <fct> T1, T1, T2, T1, T1, T1, Tis, T1, T2, Ta, Ta, T…
$ recurrence_next        <fct> No tumor recurrence, Tumor recurrence, No tumo…
$ progression_grade_next <fct> No grade progression, No grade progression, No…
$ progression_stage_next <fct> No stage progression, No stage progression, No…


### Histologic diagnosis

In [11]:
# Tabulate the levels of `histo_dx` over PATHOLOGIC.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
PATHOLOGIC %>% summarize_fct(histo_dx)

# A tibble: 4 x 3
  Levels       N  Freq
  <fct>    <int> <dbl>
1 CIS         13   6.7
2 LG          60  31.1
3 HG          79  40.9
4 Invasive    41  21.2


### pT stage

In [12]:
# Tabulate the levels of `pt_stage` over PATHOLOGIC.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
PATHOLOGIC %>% summarize_fct(pt_stage)

# A tibble: 5 x 3
  Levels     N  Freq
  <fct>  <int> <dbl>
1 Tis        9   4.7
2 Ta       102  52.8
3 T1        66  34.2
4 T2        10   5.2
5 <NA>       6   3.1


### Tumor recurrence at next biopsy

In [13]:
# Tabulate the levels of `recurrence_next` over PATHOLOGIC.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
PATHOLOGIC %>% summarize_fct(recurrence_next)

# A tibble: 3 x 3
  Levels                  N  Freq
  <fct>               <int> <dbl>
1 Tumor recurrence      102  52.8
2 No tumor recurrence    68  35.2
3 <NA>                   23  11.9


### Tumor grade progression at next biopsy

In [14]:
# Tabulate the levels of `progression_grade_next` over PATHOLOGIC.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
PATHOLOGIC %>% summarize_fct(progression_grade_next)

# A tibble: 3 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Grade progression        6   3.1
2 No grade progression   170  88.1
3 <NA>                    17   8.8


### Tumor stage progression at next biopsy

In [15]:
# Tabulate the levels of `progression_stage_next` over PATHOLOGIC.
# summarize_fct() is not defined in this file; presumably it comes from summarize.R.
PATHOLOGIC %>% summarize_fct(progression_stage_next)

# A tibble: 3 x 3
  Levels                   N  Freq
  <fct>                <int> <dbl>
1 Stage progression        9   4.7
2 No stage progression   160  82.9
3 <NA>                    24  12.4


## Biomarker features
Biomarker features were established at the tissue microarray (TMA) level.

In [16]:
# Reshape the biomarker columns to long form (one row per marker value) and
# summarize Expression separately for each Biomarker.
# NOTE(review): `ck56:uroplakin` is a positional range, so it selects whatever
# columns sit between ck56 and uroplakin in DF; reordering DF's columns would
# change which markers are included.
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here
# because the installed tidyr version is not visible from this file.
# summarize_nums() is not defined in this file; presumably it comes from summarize.R.
DF %>% 
    select(ck56:uroplakin) %>% 
    gather(key = "Biomarker", value = "Expression") %>% 
    summarize_nums(Expression, Biomarker)

# A tibble: 7 x 9
  Levels        N  Mean    SD Median   IQR   Min   Max Missing
  <fct>     <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <int>
1 cd44        411 51.5  33.6      60  60       0   100     106
2 ck20        411 28.8  35.7       5  62.5     0   100      75
3 ck56        411 28.1  29.2      20  35       0   100      76
4 er          411  1.66  6.79      0   0       0    60      74
5 gata3       411 99.1   5.05    100   0      50   100      73
6 her2        411 40    36.0      30  65       0   100      69
7 uroplakin   411 15.3  24.6       5  20       0   100      96

	Kruskal-Wallis rank sum test

data:  x by y
Kruskal-Wallis chi-squared = 1260, df = 6, p-value < 2.2e-16


	Pairwise comparisons using Wilcoxon rank sum test 

data:  x and y 

          cd44    ck20    ck56    er      gata3   her2   
ck20      < 2e-16 -       -       -       -       -      
ck56      2.9e-16 0.01987 -       -       -       -      
er        < 2e-16 < 2e-16 < 2e-16 -       -       -      
gata