## Load data

In [1]:
# Read the training labels and the training values, then join them on the
# shared `id` column so each row carries its status_group.
trainlab <- read.csv("train-set-labels.csv")
trainval <- read.csv("train-set-values.csv")
train <- merge(x = trainlab, y = trainval, by = "id")
head(train)

id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,non functional,0,2012-11-13,Tasaf,0,TASAF,33.12583,-5.118154,Mratibu,...,unknown,milky,milky,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
1,functional,0,2011-03-05,Shipo,1978,SHIPO,34.77072,-9.395642,none,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
2,functional,0,2011-03-27,Lvia,0,LVIA,36.11506,-6.279268,Bombani,...,per bucket,soft,good,insufficient,insufficient,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
3,functional,10,2013-06-03,Germany Republi,1639,CES,37.14743,-3.187555,Area 7 Namba 5,...,per bucket,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
4,non functional,0,2011-03-22,Cmsr,0,CMSR,36.16489,-6.099289,Ezeleda,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,hand pump,hand pump
5,functional,50,2011-02-26,Private,28,Private,39.28612,-6.972403,Kwa Namaj,...,per bucket,soft,good,enough,enough,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe


In [2]:
# Print a per-column summary of the merged training data.
summary(train)

       id                         status_group     amount_tsh      
 Min.   :    0   functional             :32259   Min.   :     0.0  
 1st Qu.:18520   functional needs repair: 4317   1st Qu.:     0.0  
 Median :37062   non functional         :22824   Median :     0.0  
 Mean   :37115                                   Mean   :   317.7  
 3rd Qu.:55657                                   3rd Qu.:    20.0  
 Max.   :74247                                   Max.   :350000.0  
                                                                   
    date_recorded                      funder        gps_height    
 2011-03-15:  572   Government Of Tanzania: 9084   Min.   : -90.0  
 2011-03-17:  558                         : 3635   1st Qu.:   0.0  
 2013-02-03:  546   Danida                : 3114   Median : 369.0  
 2011-03-14:  520   Hesawa                : 2202   Mean   : 668.3  
 2011-03-16:  513   Rwssp                 : 1374   3rd Qu.:1319.2  
 2011-03-18:  497   World Bank            : 1349

In [3]:
# Load the test-set values file; no labels are merged into this data frame.
test <- read.csv('test-set-values.csv')

## Select predictor variables

In [4]:
library(dplyr)

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



### Choose variables that have an impact on waterpoint functionality

In [5]:
# Predictor columns kept in both data sets.
predictor_cols <- c(
  "amount_tsh", "gps_height", "installer", "basin", "population",
  "public_meeting", "scheme_management", "permit", "construction_year",
  "extraction_type", "management", "payment_type", "water_quality",
  "quantity_group", "source_class", "waterpoint_type"
)

# train additionally keeps the label column. Selecting with a mask over
# names() leaves the columns in their original order.
train <- train[, names(train) %in% c("status_group", predictor_cols)]
test <- test[, names(test) %in% predictor_cols]

### Reduce the number of levels for factor variables with many levels

In [6]:
# Number of rows per distinct installer value.
train %>%
  group_by(installer) %>%
  summarise(totals = n())

# Base-R view of the same column for comparison.
summary(train$installer)

installer,totals
,3655
-,3
0,777
A.D.B,1
AAR,4
Aartisa,1
ABASIA,29
ABD,1
ABDALA,1
Abdallah Ally Wazir,1


In [9]:
# Normalise the free-text `installer` column of a data frame.
#
# Every installer value is lower-cased, then known abbreviations, truncations
# and misspellings are mapped onto a single canonical spelling.
#
# Args:
#   table: a data frame with an `installer` column.
# Returns:
#   `table` with `installer` replaced by a lower-case character vector in which
#   the variants listed below have been merged. Values that are not listed are
#   kept as they are (lower-cased).
change_var <- function(table) {
  # Keys are compared AFTER tolower(), so every key must be lower case.
  # (The previous 'adra/ Community' rule could never match for that reason.)
  canonical <- c(
    "central government"        = "government",
    "central govt"              = "government",
    "centr"                     = "government",
    "gove"                      = "government",
    "gover"                     = "government",
    "distri"                    = "district council",
    "counc"                     = "district council",
    "council"                   = "district council",
    "district water department" = "district council",
    "commu"                     = "community",
    "adra /community"           = "adra",
    "adra/ community"           = "adra",
    "adra/government"           = "adra",
    "adra /government"          = "adra",
    "adra/community"            = "adra",
    "world vission"             = "world vision",
    "danid"                     = "danida"
  )

  installer <- tolower(table$installer)

  # %in% never yields NA, so missing values are simply left untouched.
  known_variant <- installer %in% names(canonical)
  installer[known_variant] <- unname(canonical[installer[known_variant]])

  table$installer <- installer
  table
}
# Run the same installer clean-up on both data sets.
# Note: the variant list inside change_var() is not exhaustive; it covers as
# many alternative spellings as could be identified.
test <- change_var(test)
train <- change_var(train)

In [10]:
# Keep the 15 most frequent installer values of the TRAINING data and collapse
# every other value into "other". The "other" bucket competes for one of the 15
# slots like any other value.

# Placeholder entries that carry no information are folded into "other" first.
installer_blank <- c(" ", "", "0", "_", "-")
train$installer[train$installer %in% installer_blank] <- "other"
test$installer[test$installer %in% installer_blank] <- "other"

# Rank by frequency explicitly: summary() on a factor only orders by count when
# there are more than `maxsum` levels, otherwise it lists levels alphabetically.
# head() also avoids NA names when fewer than 15 distinct values exist.
installer_counts <- sort(table(train$installer), decreasing = TRUE)
installer_top <- head(names(installer_counts), 15)
train$installer[!(train$installer %in% installer_top)] <- "other"
table(train$installer, train$status_group)

# Reuse the values chosen on train so the test set never contains an installer
# category that is absent from the training data.
test$installer[!(test$installer %in% installer_top)] <- "other"

                  
                   functional functional needs repair non functional
  amref                   189                       6            248
  ces                     538                       1             71
  community              1113                      73            434
  danida                 1037                      95            542
  district council        502                      99            721
  dwe                    9434                    1622           6349
  government             1195                     322           2170
  hesawa                  786                      54            555
  kkkt                    425                      62            423
  lga                     105                      81            227
  other                 15904                    1634           9771
  rwe                     304                     137            765
  tcrs                    290                      42            375
  world vision 

In [11]:
# The value 'other - mkulima/shinyanga' of extraction_type does not occur in
# the test set, whereas 'other' does, so it is merged into 'other' here.
is_mkulima <- train$extraction_type == "other - mkulima/shinyanga"
train$extraction_type[is_mkulima] <- "other"

# Rebuild the factor from its character values.
train$extraction_type <- factor(as.character(train$extraction_type))

## Save train and test to CSV files

In [13]:
# Write both prepared data sets to disk without a row-name column.
write.csv(x = train, file = "myTrainT.csv", row.names = FALSE)
write.csv(x = test, file = "myTestT.csv", row.names = FALSE)