In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
# from xgboost import XGBClassifier
# Seed passed to every train_test_split call below so the splits are reproducible.
RANDOM_STATE = 42

First I imported the data and investigated it. I found that I could merge my target and variables using the id column.

In [2]:
# Features and labels are stored in two separate CSV files; load both.
df, target = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/Training_set_values.csv',
                     '../Data/Training_set_target.csv')
)

In [3]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
# Preview the first rows of the label table (id + status_group).
target.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [5]:
# Two-stage split: first carve off a 10% hold-out set, then divide the
# remaining 90% into train (80%) and test (20%).
# The intermediate 90% slice gets its own names (X_model / y_model) instead of
# being written back into `df` / `target`, so this cell no longer overwrites
# its own inputs and running it twice in a row gives the same split.
X_model, X_hold, y_model, y_hold = train_test_split(
    df, target, test_size=.1, random_state=RANDOM_STATE
)
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y_model, test_size=.2, random_state=RANDOM_STATE
)

In [6]:
# Attach each row's label to its features. The join key is named explicitly
# rather than left to pandas' "all shared columns" default, and `validate`
# makes the merge fail loudly if an id is duplicated on either side instead
# of silently multiplying rows.
cleaning_df = X_train.merge(y_train, on='id', validate='one_to_one')
test_df = X_test.merge(y_test, on='id', validate='one_to_one')

In [7]:
# Class balance of the training labels.
cleaning_df['status_group'].value_counts()

functional                 23312
non functional             16382
functional needs repair     3074
Name: status_group, dtype: int64

In [8]:
# Preview the merged training frame (features plus status_group).
cleaning_df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,45513,0.0,2012-11-03,Hesawa,0,DWE,0.0,-2e-08,none,0,...,soft,good,dry,dry,shallow well,shallow well,groundwater,hand pump,hand pump,non functional
1,11765,0.0,2012-10-22,World Bank,0,JUIN CO,33.52349,-3.438365,Msikitini,0,...,unknown,unknown,unknown,unknown,machine dbh,borehole,groundwater,communal standpipe,communal standpipe,non functional
2,50654,50.0,2011-03-17,Idc,673,DWE,35.771341,-7.159651,none,0,...,salty,salty,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
3,24660,0.0,2011-07-13,He,0,HE,31.613434,-1.816216,Kwa Yazid S,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,non functional
4,6705,500.0,2013-02-11,Government Of Tanzania,1272,Government,30.132212,-4.242582,Kwa Kenyene,0,...,soft,good,enough,enough,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional


I looked through the value counts of every column to decide which columns are worth using. I was looking for columns with a moderate number of distinct values and few nulls, and I also checked for placeholder values (such as 0 or 'none') that stand in for missing data.

In [9]:
# One {column name: value counts} dict per column, in column order, so every
# field's distribution can be inspected in a single output.
results = [
    {name: cleaning_df[name].value_counts()}
    for name in cleaning_df.columns.to_list()
]
results

[{'id': 0        1
  64202    1
  56014    1
  53967    1
  8052     1
          ..
  65288    1
  71080    1
  7593     1
  1450     1
  67583    1
  Name: id, Length: 42768, dtype: int64},
 {'amount_tsh': 0.0         29925
  500.0        2259
  50.0         1799
  20.0         1086
  1000.0       1049
              ...  
  53.0            1
  26.0            1
  70000.0         1
  250000.0        1
  220.0           1
  Name: amount_tsh, Length: 90, dtype: int64},
 {'date_recorded': 2011-03-17    417
  2013-02-03    399
  2011-03-15    392
  2011-03-14    381
  2011-03-16    359
               ... 
  2011-09-15      1
  2004-03-01      1
  2011-09-11      1
  2013-01-01      1
  2011-09-09      1
  Name: date_recorded, Length: 341, dtype: int64},
 {'funder': Government Of Tanzania          6508
  Danida                          2266
  Hesawa                          1610
  Rwssp                            989
  World Bank                       970
                                  .

Digging through these results showed me which columns should be dropped altogether and which needed cleaning. I dropped those columns, converted the placeholder values that stand in for missing data (0 in longitude, 'none' in wpt_name, 'None' in scheme_management) into actual nulls, and then dropped every row containing a null. Finally, I replaced the remaining zero placeholders in construction_year and population with the (rounded) mean of the non-zero training values.

In [10]:
to_drop = ['scheme_name', 'amount_tsh', 'num_private', 'recorded_by', 'extraction_type', 'payment',
           'quality_group','quantity_group', 'region', 'waterpoint_type_group']

# Column -> placeholder value that should be treated as missing.
null_placeholders = {'longitude': 0, 'wpt_name': 'none', 'scheme_management': 'None'}


def drop_columns_and_missing_rows(frame):
    """Return a cleaned copy of `frame`.

    Drops the columns listed in `to_drop`, turns the placeholder values in
    `null_placeholders` into real NaNs, then drops every row that contains a
    null. The input frame is not modified.
    """
    cleaned = frame.drop(columns=to_drop)
    for column, placeholder in null_placeholders.items():
        # Assign the result back instead of calling
        # `frame[column].replace(..., inplace=True)`: that chained in-place form
        # does not update the frame under pandas Copy-on-Write. `np.nan` is used
        # because the `np.NaN` alias no longer exists in NumPy 2.x.
        cleaned[column] = cleaned[column].replace(placeholder, np.nan)
    return cleaned.dropna()


# Same cleaning for both splits. No loop variable named `df`, so the
# notebook-level `df` is not clobbered.
cleaning_df = drop_columns_and_missing_rows(cleaning_df)
test_df = drop_columns_and_missing_rows(test_df)

In [11]:
# In these columns a 0 stands in for "unknown"; replace it with the rounded mean
# of the non-zero *training* values. The same training mean is applied to the
# test split so no information from the test rows is used.
replace_columns = ['construction_year', 'population']
for column in replace_columns:
    nonzero_train_values = cleaning_df.loc[cleaning_df[column] != 0, column]
    this_mean = round(nonzero_train_values.mean())
    # Assign back rather than `frame[column].replace(..., inplace=True)`, which
    # is chained assignment and does not update the frame under Copy-on-Write.
    cleaning_df[column] = cleaning_df[column].replace(0, this_mean)
    test_df[column] = test_df[column].replace(0, this_mean)

Next I converted my target to integers so that I could run a classification model against the data. I also combined 'functional needs repair' with 'functional' so that the target is binary.

In [12]:
# Remember which ids were 'functional needs repair' before that class is folded
# into 'functional', so the distinction can be recovered later.
needs_repair_label = 'functional needs repair'
needs_repair_ids_train = cleaning_df.loc[cleaning_df['status_group'] == needs_repair_label, 'id']
needs_repair_ids_test = test_df.loc[test_df['status_group'] == needs_repair_label, 'id']

# Collapse to a binary target. The result is assigned back to the column:
# `frame['status_group'].replace(..., inplace=True)` is chained assignment and
# does not update the frame under pandas Copy-on-Write.
cleaning_df['status_group'] = cleaning_df['status_group'].replace(needs_repair_label, 'functional')
test_df['status_group'] = test_df['status_group'].replace(needs_repair_label, 'functional')

# Separate target from features. `.copy()` keeps the targets independent of the
# source frames, so later edits to one cannot leak into the other.
target_train = cleaning_df['status_group'].copy()
target_test = test_df['status_group'].copy()
preprocessed_df_train = cleaning_df.drop('status_group', axis=1)
preprocessed_df_test = test_df.drop('status_group', axis=1)

In [13]:
status_codes = {'functional' : 1 , 'non functional' : 0}


def encode_status(labels):
    """Map status_group strings to 1 (functional) / 0 (non functional).

    Raises ValueError if any label is not a key of `status_codes`, instead of
    letting an unencoded value slip into the saved target.
    """
    encoded = labels.map(status_codes)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Unmapped status_group labels: {labels[unmapped].unique().tolist()}"
        )
    return encoded.astype(int)


# Encode straight from the label columns so the cell gives the same result
# every time it is run, and without a loop variable named `target` that would
# overwrite the notebook-level `target` table.
target_train = encode_status(cleaning_df['status_group'])
target_test = encode_status(test_df['status_group'])

I split my variables into categorical and continuous so that I can one-hot encode the categorical data and then concatenate the continuous columns back on to form my processed_df. I then save each individual dataset so that it can be loaded in other notebooks.

In [14]:
# Columns to one-hot encode. Order matters: it fixes the dummy column order.
categorical = [
    'basin',
    'public_meeting',
    'scheme_management',
    'permit',
    'extraction_type_group',
    'extraction_type_class',
    'management',
    'management_group',
    'payment_type',
    'water_quality',
    'quantity',
    'source_type',
    'source_class',
    'waterpoint_type',
]

# Columns carried through unchanged.
continuous = [
    'id',
    'gps_height',
    'longitude',
    'latitude',
    'region_code',
    'district_code',
    'population',
    'construction_year',
]

In [15]:
# --- Training split: one-hot encode the categoricals, keep continuous as-is ---
categorical_df_train = preprocessed_df_train[categorical]
continuous_df_train = preprocessed_df_train[continuous]
dummy_df_train = pd.get_dummies(categorical_df_train)
processed_df_train = pd.concat([continuous_df_train, dummy_df_train], axis=1)

# --- Test split ---
categorical_df_test = preprocessed_df_test[categorical]
continuous_df_test = preprocessed_df_test[continuous]
# Encoding the test split on its own only creates columns for categories that
# happen to occur in it, so its columns can differ from the training columns.
# Reindex to the training dummy columns: categories missing from the test split
# become all-zero columns, categories unseen in training are dropped, and the
# column order matches. The dtypes are then aligned with the training dummies.
dummy_df_test = (
    pd.get_dummies(categorical_df_test)
    .reindex(columns=dummy_df_train.columns, fill_value=0)
    .astype(dummy_df_train.dtypes.to_dict())
)
processed_df_test = pd.concat([continuous_df_test, dummy_df_test], axis=1)

In [16]:
# Destination path -> table to write. Written in this order, without the index.
csv_exports = {
    '../Data/processed_varibles_train.csv': processed_df_train,
    '../Data/processed_target_train.csv': target_train,
    '../Data/needs_repair_ids_train.csv': needs_repair_ids_train,
    '../Data/processed_varibles_test.csv': processed_df_test,
    '../Data/processed_target_test.csv': target_test,
    '../Data/needs_repair_ids_test.csv': needs_repair_ids_test,
    '../Data/varibles_hold.csv': X_hold,
    '../Data/target_hold.csv': y_hold,
}
for csv_path, table in csv_exports.items():
    table.to_csv(csv_path, index=False)