# Preprocess metadata.tsv

In [150]:
library(data.table)
library(lubridate)
library(zoo)
library(reshape)
library(tidyverse) 

In [151]:
# Switch the session's working directory to the folder holding the raw
# metadata export, so relative file paths used afterwards resolve against it.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# only run unmodified where this directory exists.
setwd('/Users/nashwaahmed/OneDrive - UW/git/cholera/bactopia-runs/preprocess_metadata/')

In [152]:
# Load the tab-separated metadata export; the first row supplies column names.
# read.delim() is read.csv() with sep = "\t" and header = TRUE as defaults.
metadata <- read.delim("metadata_all.txt")

In [153]:
# Shorten the `source..detailed.` column to `source`.
# Namespace-qualified because reshape, which is also attached in this notebook,
# exports its own rename().
metadata <- dplyr::rename(metadata, source = source..detailed.)

In [154]:
# Merge the two serotype columns (`serotype_phenotype`, `serotype.phenotype`)
# into a single `serotype` column:
#   * both blank                 -> NA
#   * exactly one filled         -> that value
#   * both filled and identical  -> that value
#   * both filled but different  -> NA (conflict; made explicit below)
# NA_character_ keeps every branch of case_when() the same type; a bare logical
# NA is rejected by dplyr versions before 1.1.
metadata <- metadata %>%
  mutate(serotype = case_when(
    serotype_phenotype == "" & serotype.phenotype == "" ~ NA_character_,
    serotype_phenotype == "" & serotype.phenotype != "" ~ serotype.phenotype,
    serotype_phenotype != "" & serotype.phenotype == "" ~ serotype_phenotype,
    serotype_phenotype == serotype.phenotype ~ serotype_phenotype,
    # Conflicting (or missing) annotations: leave the serotype unknown.
    TRUE ~ NA_character_
  ))

In [155]:
# Keep only the columns needed for the rest of the preprocessing.
metadata <- select(
  metadata,
  id, displayname, strain_id, latitude, longitude,
  year, serotype, source, isolation
)

In [156]:
# Drop records that lack a place of isolation or a year.
# read.csv() returns blank fields of a character column as "" rather than NA,
# so an NA check alone keeps rows whose `isolation` is empty; blank and
# whitespace-only values are therefore treated as missing as well.
metadata <- metadata %>%
  filter(
    !is.na(isolation),
    trimws(as.character(isolation)) != "",
    !is.na(year)
  )

### date

In [157]:
# Build a `date` string from the year alone, with literal "XX" placeholders
# in the month and day positions (e.g. 2003 -> "2003-XX-XX").
metadata <- mutate(metadata, date = paste0(year, "-XX-XX"))

### isolation source

In [158]:
# Raw `source` strings grouped by the isolation-source category they map to.
# Matching is verbatim and case-sensitive, so spelling variants found in the
# raw data (e.g. "Humana", "Human (carriere)", "enviromental (river water)")
# are listed exactly as they occur.
# NOTE(review): "household water source for clinical isolate" is kept under
# clinical as in the original grouping -- confirm it should not be
# environmental.
clinical <- c(
  "Clinical", "Human", "Carrier", "stool",
  "stool (clinical index case or clinical household contact)",
  "rectal swab culture", "Clinical (diarrhoea)", "Clinical (stool)",
  "Human (household contact)", "Human (traveller)", "Human (carriere)",
  "Humana", "Human (refugee)", "clinical", "blood", "Human (feces)",
  "household water source for clinical isolate"
)
# "Sea Water" (capital W) belongs here next to "Sea water"; it was previously
# listed under food, which labelled those samples as Food.
env <- c(
  "Environmental", "river", "enviromental (river water)", "River water",
  "Environment", "Sea water", "Sea Water", "Danube water"
)
food <- c(
  "Aquatic animal (soft-shelled turtle, agricultural market)", "Food", "fish"
)
animal <- c("Soft-shelled turtle")
#other = c()

# Collapse the free-text `source` column into a coarse `isolation_source`
# category. The first matching list wins; values in none of the lists get NA.
metadata <- mutate(
  metadata,
  isolation_source = case_when(
    source %in% clinical ~ "Clinical",
    source %in% env ~ "Environmental",
    source %in% food ~ "Food",
    source %in% animal ~ "Animal"
  )
)


### geolocation (UN geoscheme)

In [159]:
# The `isolation` column is renamed to `country` for the geolocation steps.
# Namespace-qualified because reshape, which is also attached in this notebook,
# exports its own rename().
metadata <- dplyr::rename(metadata, country = isolation)

In [160]:
# Country names grouped by subregion, used as lookup sets for the subregion
# assignment. Names are matched verbatim, so alternative spellings that occur
# in the data (e.g. "Viet Nam" / "Vietnam") are both listed.
# NOTE(review): the variable name `carribbean` is misspelled but is referenced
# under this name elsewhere in the notebook, so it is kept.

# Americas
northern_america <- c("Canada", "USA")
southern_america <- c(
  "Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador",
  "French Guiana", "Guyana", "Paraguay", "Peru", "Suriname", "Uruguay",
  "Venezuela"
)
central_america <- c(
  "Belize", "Costa Rica", "El Salvador", "Guatemala", "Honduras", "Mexico",
  "Nicaragua", "Panama"
)
carribbean <- c(
  "Antigua and Barbuda", "Bahamas", "Barbados", "Cuba", "Dominica",
  "Dominican Republic", "Grenada", "Haiti", "Jamaica",
  "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines",
  "Trinidad and Tobago"
)

# Africa
northern_africa <- c(
  "Algeria", "Egypt", "Libya", "Morocco", "Sudan", "Tunisia", "Western Sahara"
)
eastern_africa <- c(
  "Burundi", "Comoros", "Djibouti", "Eritrea", "Ethiopia", "Kenya",
  "Madagascar", "Malawi", "Mauritius", "Mayotte", "Mozambique", "Reunion",
  "Rwanda", "Seychelles", "Somalia", "South Sudan", "Tanzania", "Uganda",
  "Zambia", "Zimbabwe"
)
central_africa <- c(
  "Angola", "Cameroon", "Central African Republic", "Chad", "Congo",
  "Democratic Republic of the Congo", "Equatorial Guinea", "Gabon",
  "Sao Tome and Principe"
)
southern_africa <- c(
  "Botswana", "Eswatini", "Lesotho", "Namibia", "South Africa"
)
western_africa <- c(
  "Benin", "Burkina Faso", "Cape Verde", "Gambia", "Ghana", "Guinea",
  "Guinea Bissau", "Ivory Coast", "Liberia", "Mali", "Mauritania", "Niger",
  "Nigeria", "Saint Helena", "Senegal", "Sierra Leone", "Togo"
)

# Asia
eastern_asia <- c(
  "China", "China Macao SAR", "China Hong Kong SAR", "Japan", "Mongolia",
  "North Korea", "South Korea"
)
western_central_asia <- c(
  "Kazakhstan", "Kyrgyzstan", "Tajikistan", "Turkmenistan", "Uzbekistan",
  "Armenia", "Azerbaijan", "Bahrain", "Cyprus", "Georgia", "Iraq", "Israel",
  "Jordan", "Kuwait", "Lebanon", "Oman", "Palestine", "Qatar", "Saudi Arabia",
  "Syria", "Turkey", "United Arab Emirates", "Yemen"
)
southeastern_asia <- c(
  "Brunei Darussalam", "Cambodia", "Indonesia", "Laos", "Malaysia", "Myanmar",
  "Philippines", "Singapore", "Thailand", "Timor Leste", "Viet Nam", "Vietnam"
)
southern_asia <- c(
  "Afghanistan", "Bangladesh", "Bhutan", "India", "Iran", "Maldives", "Nepal",
  "Pakistan", "Sri Lanka"
)

# Europe
eastern_europe <- c(
  "Belarus", "Bulgaria", "Czechia", "Hungary", "Poland",
  "Republic of Moldova", "Romania", "Russia", "Slovakia", "Ukraine"
)
northern_europe <- c(
  "Denmark", "Estonia", "Faroe Islands", "Finland", "Iceland", "Ireland",
  "Latvia", "Lithuania", "Norway", "Svalbard and Jan Mayen Islands", "Sweden",
  "United Kingdom"
)
southern_europe <- c(
  "Albania", "Andorra", "Bosnia and Herzegovina", "Croatia", "Gibraltar",
  "Greece", "Italy", "Malta", "North Macedonia", "Portugal", "San Marino",
  "The former state union Serbia and Montenegro", "Slovenia", "Spain",
  "Vatican"
)
western_europe <- c(
  "Austria", "Belgium", "France", "Germany", "Liechtenstein", "Luxembourg",
  "Monaco", "Netherlands", "Switzerland"
)

# Oceania
oceania <- c(
  "American Samoa", "Australia", "Cook Islands", "Fiji", "French Polynesia",
  "Guam", "Kiribati", "Marshall Islands", "Micronesia", "Nauru",
  "New Caledonia", "New Zealand", "Niue", "Norfolk Island",
  "Northern Mariana Islands", "Palau", "Papua New Guinea", "Pitcairn", "Samoa",
  "Solomon Islands", "Tokelau", "Tonga", "Tuvalu"
)

In [161]:
# Assign each record a subregion label based on its country. `location` is a
# working copy of `country` used for the lookups; countries that appear in
# none of the lists get NA.
# NOTE(review): the label 'Carribbean' is misspelled but is matched by that
# exact spelling in the region mapping, so it is reproduced unchanged.
metadata <- mutate(
  metadata,
  location = country,
  subregion = case_when(
    location %in% northern_america ~ "North America",
    location %in% southern_america ~ "South America",
    location %in% central_america ~ "Central America",
    location %in% carribbean ~ "Carribbean",
    location %in% northern_europe ~ "Northern Europe",
    location %in% eastern_europe ~ "Eastern Europe",
    location %in% southern_europe ~ "Southern Europe",
    location %in% western_europe ~ "Western Europe",
    location %in% oceania ~ "Oceania",
    location %in% eastern_asia ~ "East Asia",
    location %in% southeastern_asia ~ "Southeast Asia",
    location %in% southern_asia ~ "South Asia",
    location %in% western_central_asia ~ "Western/Central Asia",
    location %in% northern_africa ~ "North Africa",
    location %in% eastern_africa ~ "East Africa",
    location %in% central_africa ~ "Central Africa",
    location %in% southern_africa ~ "Southern Africa",
    location %in% western_africa ~ "West Africa"
  )
)

In [162]:
# Roll each subregion up into a continental `region`. The Caribbean and Central
# America are grouped under 'North America'. Records whose subregion is NA (or
# any label not listed here) get an NA region.
# Each subregion is listed exactly once: the earlier version repeated the
# 'South Asia' and 'Western/Central Asia' branches, which could never match.
# The 'Carribbean' spelling must stay identical to the label written into
# `subregion`.
metadata <- metadata %>% mutate(region = case_when(
  subregion %in% c('North America', 'Carribbean', 'Central America') ~ 'North America',
  subregion == 'South America' ~ 'South America',
  subregion %in% c('Western/Central Asia', 'South Asia',
                   'Southeast Asia', 'East Asia') ~ 'Asia',
  subregion %in% c('Northern Europe', 'Eastern Europe',
                   'Western Europe', 'Southern Europe') ~ 'Europe',
  subregion %in% c('North Africa', 'West Africa', 'Central Africa',
                   'East Africa', 'Southern Africa') ~ 'Africa',
  subregion == 'Oceania' ~ 'Oceania'
))

In [163]:
# Remove the working `location` column; `country` already holds the same values.
metadata <- select(metadata, -location)

### id

In [164]:
# The `id` column becomes `name` in the exported table.
# Namespace-qualified because reshape, which is also attached in this notebook,
# exports its own rename().
metadata <- dplyr::rename(metadata, name = id)

### write

In [166]:
# Export the cleaned table as a tab-separated file with a header row, no
# quoting and no row names.
# NOTE(review): machine-specific absolute output path.
write.table(
  metadata,
  file = "/Users/nashwaahmed/OneDrive - UW/git/cholera/bactopia-runs/filter/metadata_clean.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

In [167]:
# Spot-check: print the cleaned record(s) whose displayname is "91".
filter(metadata, displayname == "91")

name,displayname,strain_id,latitude,longitude,year,serotype,source,country,date,isolation_source,subregion,region
<chr>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
62ff5e6ca12ebaac0f9f907d,91,,-18.6657,35.52956,2003,,stool,Mozambique,2003-XX-XX,Clinical,East Africa,Africa
