In [1]:
# import statements and functions to clean columns

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt
import tqdm
import folium
from collections import Counter
%matplotlib inline
pd.set_option('display.max_columns', None)

In [1]:
# Columns named for dropping; not referenced by the visible cells.
drop_cols = ['wpt_name', 'recorded_by']

# Predictor columns that load_data() retains, in output order.
columns_to_keep = [
    'id', 'date_recorded', 'amount_tsh', 'gps_height',
    'latitude', 'longitude', 'basin', 'lga',
    'population', 'permit', 'extraction_type', 'management',
    'payment', 'water_quality', 'quantity', 'source',
    'waterpoint_type',
]

# Columns listed as numeric (most hold integer codes once load_data() has run).
numeric_columns = [
    'date_recorded', 'amount_tsh', 'gps_height', 'population', 'permit',
    'payment', 'water_quality', 'quantity', 'source', 'status_group',
]

# Integer code for each status label ...
statuses = {
    'functional': 1,
    'functional needs repair': 2,
    'non functional': 3,
}

# ... and the reverse lookup, derived so the two mappings cannot drift apart.
status_decoder = {code: label for label, code in statuses.items()}

def encode_quality(x):
    """
    Binary flag for water_quality: 1 when the value is exactly 'soft'
    ("good"), 0 for every other value ("not known to be good").
    """
    return 1 if x == 'soft' else 0

def encode_tsh(x):
    """
    Buckets a tsh amount into three ordinal classes:
     2 - strictly above 250
     1 - positive, up to and including 250
     0 - everything else (zero, negative, or NaN)
    """
    if x > 250:
        return 2
    # NaN fails both comparisons and therefore lands in class 0
    return 1 if x > 0 else 0
    
def encode_date_recorded(date):
    """
    Classifies a recording date into one of the five phases.

    Parameters
    ----------
    date : datetime.date, datetime.datetime, pandas.Timestamp, or missing
        The date the record was taken. Datetime-like values are reduced to
        their calendar date before comparison; every phase boundary falls on
        midnight, so the time of day never changes the result.

    Returns
    -------
    int
        1 - before 2011-06-01
        2 - 2011-06-01 up to (not including) 2012-02-01
        3 - 2012-02-01 up to (not including) 2013-01-01
        4 - 2013-01-01 up to (not including) 2013-04-10
        5 - 2013-04-10 or later
        0 - missing value (None / NaN / NaT)
    """
    # Missing values must be caught first: NaT cannot be ordered against a date.
    if pd.isna(date):
        return 0

    # pandas >= 2.0 refuses to order a Timestamp against datetime.date, and a
    # plain datetime never could, so compare calendar dates only.
    if isinstance(date, dt.datetime):
        date = date.date()

    if date < dt.date(year=2011, month=6, day=1):
        return 1
    if date < dt.date(year=2012, month=2, day=1):
        return 2
    if date < dt.date(year=2013, month=1, day=1):
        return 3
    if date < dt.date(year=2013, month=4, day=10):
        return 4
    return 5
    
def encode_quantity(quantity):
    """
    Maps the quantity label onto three ordinal classes:
     2 - "enough" - this pump provides enough water for the people who use it
     1 - "insufficient" - this pump provides some water, but not enough
     0 - "other" - any remaining label (e.g. dry, or only functional for
         part of the year)
    """
    if quantity == 'enough':
        code = 2
    elif quantity == 'insufficient':
        code = 1
    else:
        code = 0
    return code
    
def encode_payment(payment):
    """
    Sorts the payment label into five groups according to how often the
    community pays for the pump: four recognised schedules coded 0-3, and
    -1 for any other label. Payment is an indicator of the resources
    available for maintenance (and also of whether the pump works at all).
    """
    schedule_codes = (
        ('never pay', 0),
        ('pay annually', 1),
        ('pay monthly', 2),
        ('pay per bucket', 3),
    )
    for label, code in schedule_codes:
        if payment == label:
            return code
    # anything unrecognised
    return -1
    
def encode_source(source):
    """
    Ranks the water source from "lowest altitude" to "highest altitude".
    Four labels get a non-zero rank; every other label is coded 0.
    """
    altitude_rank = (
        ('hand dtw', -3),
        ('shallow well', -2),
        ('spring', -1),
        ('rainwater harvesting', 1),
    )
    for label, rank in altitude_rank:
        if source == label:
            return rank
    return 0
    
def encode_extraction_type(x):
    """
    Classifies the extraction type by what powers the pump.

    Returns one of:
     'human'   - hand/foot operated pumps
     'power'   - motorised pumps
     'nature'  - windmill or gravity fed
     'unknown' - any label not listed below (including missing values)
    """
    # NOTE(review): the published dataset appears to label this pump
    # 'other - swn 81', like the other 'other - ...' entries here; the bare
    # 'swn 81' spelling is kept so existing callers see no change.
    human_powered = ('nira/tanira', 'swn 80', 'india mark ii', 'india mark iii',
                     'afridev', 'other - rope pump', 'swn 81', 'other - swn 81',
                     'other - play pump', 'walimi')
    motor_powered = ('submersible', 'mono', 'ksb', 'cemo', 'climax',
                     'other - mkulima/shinyanga')
    nature_powered = ('windmill', 'gravity')

    if x in human_powered:
        return 'human'
    if x in motor_powered:
        return 'power'
    if x in nature_powered:
        return 'nature'
    return 'unknown'

In [2]:
# function to load and clean data

def load_data(separate=False,
              values_path='training_values.csv',
              labels_path='training_labels.csv'):
    """
    Loads the raw CSVs and recodes the predictor columns.

    Parameters
    ----------
    separate : bool, default False
        If True, return the predictors and the labels as two frames.
        Otherwise return one frame with the labels merged onto the
        predictors by ``id``.
    values_path : str or path-like, default 'training_values.csv'
        CSV holding the predictor columns.
    labels_path : str or path-like, default 'training_labels.csv'
        CSV holding ``id`` and the target column.

    Returns
    -------
    (DataFrame, DataFrame) or DataFrame
        ``(X, y)`` when ``separate`` is True, else the merged frame.
        Predictors are restricted to ``columns_to_keep``; the labels are
        returned exactly as read from ``labels_path``.
    """
    raw = pd.read_csv(values_path)
    y = pd.read_csv(labels_path)

    # Every new column is computed from the raw frame, so the recodes are
    # independent of one another.
    X = raw.assign(
        # three ordinal tsh classes
        amount_tsh=raw.amount_tsh.apply(encode_tsh),
        # binary indicator: 1 when a non-zero height was recorded, else 0
        gps_height=raw.gps_height.ne(0).astype('int64'),
        # parse the date (unparseable -> NaT), then map it to a 'phase'
        date_recorded=pd.to_datetime(raw.date_recorded, errors='coerce')
                        .apply(encode_date_recorded),
        # 1 only for a known permit; False *and missing* become 0
        # (NaN is truthy, so a plain truthiness test would encode it as 1)
        permit=raw.permit.eq(True).astype('int64'),
        # 1 for 'known to be good' water, 0 otherwise
        water_quality=raw.water_quality.apply(encode_quality),
        # 2 for 'enough', 1 for 'insufficient', 0 for everything else
        quantity=raw.quantity.apply(encode_quantity),
        # payment schedule code (or -1 if unrecognised)
        payment=raw.payment.apply(encode_payment),
        # ordinal source rank (see encode_source)
        source=raw.source.apply(encode_source),
        # 'human', 'power', 'nature' or 'unknown'
        extraction_type=raw.extraction_type.apply(encode_extraction_type),
    )[columns_to_keep]

    if separate:
        return X, y

    return X.merge(y, on='id')

# preview the first two rows of the merged (predictors + labels) frame
load_data().head(2)

Unnamed: 0,id,date_recorded,amount_tsh,gps_height,latitude,longitude,basin,lga,population,permit,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,status_group
0,69572,1,2,1,-9.856322,34.938093,Lake Nyasa,Ludewa,109,0,nature,vwc,1,1,2,-1,communal standpipe,functional
1,8776,4,0,1,-2.147466,34.698766,Lake Victoria,Serengeti,280,1,nature,wug,0,1,1,1,communal standpipe,functional
